Index: Makefile.inc1
===================================================================
--- Makefile.inc1
+++ Makefile.inc1
@@ -2415,7 +2415,7 @@
 # Rebuild ctfconvert and ctfmerge to avoid difficult-to-diagnose failures
 # resulting from missing bug fixes or ELF Toolchain updates.
 .if ${MK_CDDL} != "no"
-_dtrace_tools= cddl/lib/libctf cddl/usr.bin/ctfconvert \
+_dtrace_tools= cddl/lib/libctf cddl/lib/libspl cddl/usr.bin/ctfconvert \
 	cddl/usr.bin/ctfmerge
 .endif
@@ -2729,7 +2729,12 @@
 	${_cddl_lib_libumem} ${_cddl_lib_libnvpair} \
 	${_cddl_lib_libuutil} \
 	${_cddl_lib_libavl} \
+	${_cddl_lib_libicp} \
+	${_cddl_lib_libicp_rescue} \
+	${_cddl_lib_libspl} \
+	${_cddl_lib_libtpool} \
 	${_cddl_lib_libzfs_core} ${_cddl_lib_libzfs} \
+	${_cddl_lib_libzutil} \
 	${_cddl_lib_libctf} \
 	lib/libufs \
 	lib/libutil lib/libpjdlog ${_lib_libypclnt} lib/libz lib/msun \
@@ -2795,21 +2800,34 @@
 _cddl_lib_libnvpair= cddl/lib/libnvpair
 _cddl_lib_libavl= cddl/lib/libavl
 _cddl_lib_libuutil= cddl/lib/libuutil
+_cddl_lib_libspl= cddl/lib/libspl
+
+cddl/lib/libuutil__L: cddl/lib/libavl__L cddl/lib/libspl__L
+
 .if ${MK_ZFS} != "no"
+_cddl_lib_libicp= cddl/lib/libicp
+_cddl_lib_libicp_rescue= cddl/lib/libicp_rescue
+_cddl_lib_libtpool= cddl/lib/libtpool
+_cddl_lib_libzutil= cddl/lib/libzutil
 _cddl_lib_libzfs_core= cddl/lib/libzfs_core
 _cddl_lib_libzfs= cddl/lib/libzfs
+cddl/lib/libtpool__L: cddl/lib/libspl__L
+
+cddl/lib/libzutil__L: cddl/lib/libavl__L cddl/lib/libtpool__L
+
 cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L
 cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L
 cddl/lib/libzfs__L: lib/libthr__L lib/libmd__L lib/libz__L cddl/lib/libumem__L
 cddl/lib/libzfs__L: cddl/lib/libuutil__L cddl/lib/libavl__L lib/libgeom__L
+cddl/lib/libzfs__L: cddl/lib/libnvpair__L cddl/lib/libzutil__L
 lib/libbe__L: cddl/lib/libzfs__L
 .endif
 _cddl_lib_libctf= cddl/lib/libctf
 _cddl_lib= cddl/lib
-cddl/lib/libctf__L: lib/libz__L
+cddl/lib/libctf__L: lib/libz__L cddl/lib/libspl__L
 .endif
 # cddl/lib/libdtrace requires lib/libproc and lib/librtld_db
 _prebuild_libs+= lib/libprocstat lib/libproc lib/librtld_db
Index: cddl/compat/opensolaris/include/fcntl.h
===================================================================
--- cddl/compat/opensolaris/include/fcntl.h
+++ cddl/compat/opensolaris/include/fcntl.h
@@ -32,7 +32,9 @@
 #include_next
+#ifndef open64
 #define open64(...) open(__VA_ARGS__)
+#endif
 #define openat64(...) openat(__VA_ARGS__)
 #endif
Index: cddl/contrib/opensolaris/cmd/lockstat/sym.c
===================================================================
--- cddl/contrib/opensolaris/cmd/lockstat/sym.c
+++ cddl/contrib/opensolaris/cmd/lockstat/sym.c
@@ -54,6 +54,7 @@
 #endif
 #include
+
 typedef struct syment {
 	uintptr_t addr;
 	char *name;
@@ -72,6 +73,11 @@
 #endif
 #endif
+#define __sElfN(x) typedef __CONCAT(__CONCAT(__CONCAT(Elf,__ELF_WORD_SIZE),_),x) x
+__sElfN(Sym);
+__sElfN(Shdr);
+#define elf_getshdr __elfN(getshdr)
+
 static void
 add_symbol(char *name, uintptr_t addr, size_t size)
 {
Index: cddl/contrib/opensolaris/lib/libdtrace/common/drti.c
===================================================================
--- cddl/contrib/opensolaris/lib/libdtrace/common/drti.c
+++ cddl/contrib/opensolaris/lib/libdtrace/common/drti.c
@@ -24,6 +24,7 @@
  * Use is subject to license terms.
  */
+#include
 #include
 #include
 #include
Index: cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c
===================================================================
--- cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c
+++ cddl/contrib/opensolaris/lib/libdtrace/common/dt_link.c
@@ -33,8 +33,6 @@
 #include
 #ifdef illumos
 #include
-#else
-#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
 #endif
 #include
Index: cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c
===================================================================
--- cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c
+++ cddl/contrib/opensolaris/lib/libdtrace/common/dt_print.c
@@ -77,7 +77,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
Index: cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c
===================================================================
--- cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c
+++ cddl/contrib/opensolaris/lib/libdtrace/common/dt_printf.c
@@ -44,12 +44,19 @@
 #include
 #include
 #include
-#include
-
+#include
 #include
 #include
 #include
+#ifndef NS_IN6ADDRSZ
+#define NS_IN6ADDRSZ 16
+#endif
+
+#ifndef NS_INADDRSZ
+#define NS_INADDRSZ 4
+#endif
+
 /*ARGSUSED*/
 static int
 pfcheck_addr(dt_pfargv_t *pfv, dt_pfargd_t *pfd, dt_node_t *dnp)
Index: cddl/contrib/opensolaris/tools/ctf/cvt/util.c
===================================================================
--- cddl/contrib/opensolaris/tools/ctf/cvt/util.c
+++ cddl/contrib/opensolaris/tools/ctf/cvt/util.c
@@ -29,6 +29,7 @@
  * Utility functions
  */
+#include
 #include
 #include
 #include
Index: cddl/lib/Makefile
===================================================================
--- cddl/lib/Makefile
+++ cddl/lib/Makefile
@@ -6,27 +6,40 @@
 	libavl \
 	libctf \
 	libdtrace \
+	${_libicp} \
+	${_libicp_rescue} \
 	libnvpair \
+	libspl \
+	${_libtpool} \
 	libumem \
 	libuutil \
 	${_libzfs_core} \
 	${_libzfs} \
 	${_libzpool} \
+	${_libzutil}
 SUBDIR.${MK_TESTS}+= tests
 .if ${MK_ZFS} != "no"
 _libzfs_core= libzfs_core
+_libicp= libicp
+_libicp_rescue= libicp_rescue
 _libzfs= libzfs
+_libzutil= libzutil
 .if ${MK_LIBTHR} != "no"
 _libzpool= libzpool
+_libtpool= libtpool
 .endif
 .endif
+SUBDIR_DEPEND_libctf= libspl
 SUBDIR_DEPEND_libdtrace= libctf
+SUBDIR_DEPEND_libtpool= libspl
+SUBDIR_DEPEND_libuutil= libavl libspl
 SUBDIR_DEPEND_libzfs_core= libnvpair
-SUBDIR_DEPEND_libzfs= libavl libnvpair libumem libuutil libzfs_core
-SUBDIR_DEPEND_libzpool= libavl libnvpair libumem
+SUBDIR_DEPEND_libzfs= libavl libnvpair libumem libuutil libzfs_core libzutil
+SUBDIR_DEPEND_libzpool= libavl libnvpair libumem libicp
+SUBDIR_DEPEND_libzutil= libavl libtpool
 SUBDIR_PARALLEL=
Index: cddl/lib/drti/Makefile
===================================================================
--- cddl/lib/drti/Makefile
+++ cddl/lib/drti/Makefile
@@ -11,7 +11,14 @@
 CLEANFILES= ${FILES}
 # These FILES qualify as libraries for the purpose of LIBRARIES_ONLY.
 .undef LIBRARIES_ONLY
-
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
 CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \
 	-I${SRCTOP}/cddl/compat/opensolaris/include \
 	-I${OPENSOLARIS_USR_DISTDIR}/head \
Index: cddl/lib/libavl/Makefile
===================================================================
--- cddl/lib/libavl/Makefile
+++ cddl/lib/libavl/Makefile
@@ -1,12 +1,15 @@
 # $FreeBSD$
-.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/avl
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/avl
 PACKAGE= runtime
 LIB= avl
 SRCS= avl.c
 WARNS?= 3
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
 .include
Index: cddl/lib/libctf/Makefile
===================================================================
--- cddl/lib/libctf/Makefile
+++ cddl/lib/libctf/Makefile
@@ -21,6 +21,14 @@
 WARNS?= 2
 CFLAGS+= -DCTF_OLD_VERSIONS
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+
 CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \
 	-I${SRCTOP}/cddl/compat/opensolaris/include \
 	-I${OPENSOLARIS_USR_DISTDIR}/head \
@@ -28,6 +36,6 @@
 	-I${OPENSOLARIS_USR_DISTDIR}/lib/libctf/common \
 	-I${OPENSOLARIS_SYS_DISTDIR}/uts/common
-LIBADD+= z
+LIBADD+= spl z
 .include
Index: cddl/lib/libdtrace/Makefile
===================================================================
--- cddl/lib/libdtrace/Makefile
+++ cddl/lib/libdtrace/Makefile
@@ -66,6 +66,16 @@
 WARNS?= 1
+CFLAGS+= -DIN_BASE
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS+= -I${SRCTOP}/sys
+CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS+= -DHAVE_ISSETUGID
+
+
 CFLAGS+= -I${.OBJDIR} -I${.CURDIR} \
 	-I${SRCTOP}/sys/cddl/dev/dtrace/${MACHINE_ARCH} \
 	-I${SRCTOP}/sys/cddl/compat/opensolaris \
Index: cddl/lib/libicp/Makefile
===================================================================
--- /dev/null
+++ cddl/lib/libicp/Makefile
@@ -0,0 +1,101 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp
+
+PACKAGE= runtime
+LIB= icp
+LIBADD=
+
+
+.if ${MACHINE_ARCH} == "amd64"
+ASM_SOURCES_C = asm-x86_64/aes/aeskey.c
+ASM_SOURCES_AS = \
+	asm-x86_64/aes/aes_amd64.S \
+	asm-x86_64/aes/aes_aesni.S \
+	asm-x86_64/modes/gcm_pclmulqdq.S \
+	asm-x86_64/modes/aesni-gcm-x86_64.S \
+	asm-x86_64/modes/ghash-x86_64.S \
+
asm-x86_64/sha1/sha1-x86_64.S \ + asm-x86_64/sha2/sha256_impl.S \ + asm-x86_64/sha2/sha512_impl.S + +CFLAGS+= -D__amd64 -D_SYS_STACK_H -UHAVE_AES +.else +ASM_SOURCES_C = +ASM_SOURCES_AS = +.endif + + +KERNEL_C = \ + spi/kcf_spi.c \ + api/kcf_ctxops.c \ + api/kcf_digest.c \ + api/kcf_cipher.c \ + api/kcf_miscapi.c \ + api/kcf_mac.c \ + algs/aes/aes_impl_aesni.c \ + algs/aes/aes_impl_generic.c \ + algs/aes/aes_impl_x86-64.c \ + algs/aes/aes_impl.c \ + algs/aes/aes_modes.c \ + algs/edonr/edonr.c \ + algs/modes/modes.c \ + algs/modes/cbc.c \ + algs/modes/gcm_generic.c \ + algs/modes/gcm_pclmulqdq.c \ + algs/modes/gcm.c \ + algs/modes/ctr.c \ + algs/modes/ccm.c \ + algs/modes/ecb.c \ + algs/sha1/sha1.c \ + algs/sha2/sha2.c \ + algs/skein/skein.c \ + algs/skein/skein_block.c \ + algs/skein/skein_iv.c \ + illumos-crypto.c \ + io/aes.c \ + io/edonr_mod.c \ + io/sha1_mod.c \ + io/sha2_mod.c \ + io/skein_mod.c \ + os/modhash.c \ + os/modconf.c \ + core/kcf_sched.c \ + core/kcf_prov_lib.c \ + core/kcf_callprov.c \ + core/kcf_mech_tabs.c \ + core/kcf_prov_tabs.c \ + $(ASM_SOURCES_C) + + + + + + +SRCS= $(ASM_SOURCES_AS) $(KERNEL_C) + +WARNS?= 2 +SHLIB_MAJOR= 3 +CSTD= c99 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + + +CFLAGS.aes_amd64.S+= -DLOCORE +CFLAGS.aes_aesni.S+= -DLOCORE +CFLAGS.gcm_pclmulqdq.S+= -DLOCORE +CFLAGS.aesni-gcm-x86_64.S+= -DLOCORE +CFLAGS.ghash-x86_64.S+= -DLOCORE +CFLAGS.sha1-x86_64.S+= -DLOCORE +CFLAGS.sha256_impl.S+= -DLOCORE +CFLAGS.sha512_impl.S+= -DLOCORE + +.include Index: cddl/lib/libicp_rescue/Makefile =================================================================== --- /dev/null +++ cddl/lib/libicp_rescue/Makefile @@ -0,0 +1,99 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp + +PACKAGE= runtime +LIB= icp_rescue +LIBADD= + + +.if ${MACHINE_ARCH} == "amd64" +ASM_SOURCES_C = asm-x86_64/aes/aeskey.c +ASM_SOURCES_AS = \ + asm-x86_64/aes/aes_amd64.S \ + asm-x86_64/aes/aes_aesni.S \ + asm-x86_64/modes/gcm_pclmulqdq.S \ + asm-x86_64/modes/aesni-gcm-x86_64.S \ + asm-x86_64/sha1/sha1-x86_64.S \ + asm-x86_64/sha2/sha256_impl.S \ + asm-x86_64/sha2/sha512_impl.S + +CFLAGS+= -D__amd64 -D_SYS_STACK_H +.else +ASM_SOURCES_C = +ASM_SOURCES_AS = +.endif + + +KERNEL_C = \ + spi/kcf_spi.c \ + api/kcf_ctxops.c \ + api/kcf_digest.c \ + api/kcf_cipher.c \ + api/kcf_miscapi.c \ + api/kcf_mac.c \ + algs/aes/aes_impl_aesni.c \ + algs/aes/aes_impl_generic.c \ + algs/aes/aes_impl_x86-64.c \ + algs/aes/aes_impl.c \ + algs/aes/aes_modes.c \ + algs/edonr/edonr.c \ + algs/modes/modes.c \ + algs/modes/cbc.c \ + algs/modes/gcm_generic.c \ + algs/modes/gcm_pclmulqdq.c \ + algs/modes/gcm.c \ + algs/modes/ctr.c \ + algs/modes/ccm.c \ + algs/modes/ecb.c \ + algs/sha1/sha1.c \ + algs/sha2/sha2.c \ + algs/skein/skein_block.c \ + illumos-crypto.c \ + io/aes.c \ + io/edonr_mod.c \ + io/sha1_mod.c \ + io/sha2_mod.c \ + io/skein_mod.c \ + os/modhash.c \ + os/modconf.c \ + core/kcf_sched.c \ + core/kcf_prov_lib.c \ + core/kcf_callprov.c \ + core/kcf_mech_tabs.c \ + core/kcf_prov_tabs.c \ + $(ASM_SOURCES_C) + + + + + + +SRCS= 
$(ASM_SOURCES_AS) $(KERNEL_C) + +WARNS?= 2 +SHLIB_MAJOR= 3 +CSTD= c99 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID -UHAVE_AVX -DRESCUE +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + + +CFLAGS.aes_amd64.S+= -DLOCORE +CFLAGS.aes_aesni.S+= -DLOCORE +CFLAGS.gcm_pclmulqdq.S+= -DLOCORE +CFLAGS.aesni-gcm-x86_64.S+= -DLOCORE +CFLAGS.ghash-x86_64.S+= -DLOCORE +CFLAGS.sha1-x86_64.S+= -DLOCORE +CFLAGS.sha256_impl.S+= -DLOCORE +CFLAGS.sha512_impl.S+= -DLOCORE +CFLAGS.gcm.c+= -UCAN_USE_GCM_ASM + +.include Index: cddl/lib/libnvpair/Makefile =================================================================== --- cddl/lib/libnvpair/Makefile +++ cddl/lib/libnvpair/Makefile @@ -1,36 +1,30 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/nvpair +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/nvpair +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libnvpair LIB= nvpair PACKAGE= runtime -INCS= libnvpair.h +# user SRCS= libnvpair.c \ - nvpair_alloc_system.c \ - nvpair_json.c \ - opensolaris_fnvpair.c \ - opensolaris_nvpair.c \ - opensolaris_nvpair_alloc_fixed.c + libnvpair_json.c \ + nvpair_alloc_system.c +# kernel +SRCS+= nvpair_alloc_fixed.c \ + nvpair.c \ + fnvpair.c -WARNS?= 1 -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs +WARNS?= 2 +CFLAGS+= -DIN_BASE -DHAVE_RPC_TYPES +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID -DHAVE_CONFIG_H -DHAVE_XDR_BYTESREC + -# This library uses macros to define fprintf behavior for several object types -# The compiler will see the non-string literal arguments to the fprintf calls and -# omit warnings for them. 
Quiesce these warnings in contrib code: -# -# cddl/contrib/opensolaris/lib/libnvpair/libnvpair.c:743:12: warning: format -# string is not a string literal (potentially insecure) [-Wformat-security] -# ARENDER(pctl, nvlist_array, nvl, name, val, nelem); -# -CFLAGS+= -Wno-format-security +CFLAGS.nvpair.c+= -UHAVE_RPC_TYPES .include Index: cddl/lib/libspl/Makefile =================================================================== --- /dev/null +++ cddl/lib/libspl/Makefile @@ -0,0 +1,56 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/os/freebsd +.PATH: ${SRCTOP}/sys/contrib/openzfs/include + + +LIB= spl +LIBADD= +PACKAGE= runtime + +SRCS = \ + assert.c \ + list.c \ + mkdirp.c \ + page.c \ + strlcat.c \ + strlcpy.c \ + timestamp.c \ + zone.c \ + include/sys/list.h \ + include/sys/list_impl.h + +SRCS += \ + getexecname.c \ + gethostid.c \ + getmntany.c \ + mnttab.c + + +.if ${MACHINE_ARCH} == "amd64" +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/asm-x86_64 +SRCS += atomic.S +.elif ${MACHINE_ARCH} == "i386" +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/asm-i386 +SRCS += atomic.S +.else +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libspl/asm-generic +SRCS += atomic.c +.endif + + +WARNS?= 2 +CSTD= c99 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h +CFLAGS.atomic.S+= -DLOCORE + +.include Index: cddl/lib/libtpool/Makefile =================================================================== --- /dev/null +++ cddl/lib/libtpool/Makefile @@ -0,0 +1,27 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libtpool +.PATH: ${SRCTOP}/sys/contrib/openzfs/include + + +LIB= tpool +LIBADD= spl +PACKAGE= runtime + +INCS= thread_pool_impl.h +SRCS= thread_pool.c + +WARNS?= 2 +CSTD= c99 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + +.include Index: cddl/lib/libuutil/Makefile =================================================================== --- cddl/lib/libuutil/Makefile +++ cddl/lib/libuutil/Makefile @@ -1,11 +1,10 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/avl +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libuutil PACKAGE= runtime LIB= uutil -SRCS= avl.c \ +SRCS=\ uu_alloc.c \ uu_avl.c \ uu_dprintf.c \ @@ -14,14 +13,17 @@ uu_misc.c \ uu_open.c \ uu_pname.c \ - uu_strtoint.c + uu_string.c -WARNS?= 1 -CFLAGS+= -DNATIVE_BUILD -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= 
-I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h + +LIBADD= avl spl .include Index: cddl/lib/libzfs/Makefile =================================================================== --- cddl/lib/libzfs/Makefile +++ cddl/lib/libzfs/Makefile @@ -1,62 +1,92 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/compat/opensolaris/misc -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils/common +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zcommon +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs/os/freebsd +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libshare +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libshare/os/freebsd PACKAGE= runtime LIB= zfs -LIBADD= md pthread umem util uutil m avl bsdxml geom nvpair z zfs_core -SRCS= deviceid.c \ - fsshare.c \ - mkdirp.c \ - mnttab.c \ - thread_pool.c \ - zmount.c \ - zone.c - -SRCS+= nicenum.c - -SRCS+= libzfs_changelist.c \ - libzfs_compat.c \ - libzfs_config.c \ - libzfs_dataset.c \ - libzfs_diff.c \ - libzfs_import.c \ - libzfs_iter.c \ - libzfs_mount.c \ - libzfs_pool.c \ - libzfs_sendrecv.c \ - libzfs_status.c \ - libzfs_util.c \ - zfeature_common.c \ - zfs_comutil.c \ - zfs_deleg.c \ - zfs_fletcher.c \ - zfs_namecheck.c \ - zfs_prop.c \ - zpool_prop.c \ - zprop_common.c \ - -WARNS?= 0 +LIBADD= md pthread umem util uutil m avl bsdxml geom nvpair z zfs_core zutil + +USER_C = \ + libzfs_changelist.c \ + libzfs_config.c \ + libzfs_crypto.c \ + libzfs_dataset.c \ + libzfs_diff.c \ + libzfs_import.c \ + libzfs_iter.c \ + libzfs_mount.c \ + libzfs_pool.c \ + libzfs_sendrecv.c \ + libzfs_status.c \ + libzfs_util.c + +# FreeBSD +USER_C += \ + libzfs_compat.c \ + libzfs_ioctl_compat.c \ + libzfs_zmount.c + +# libshare +USER_C += \ + libshare.c \ + nfs.c \ + smb.c + + +KERNEL_C = \ + algs/sha2/sha2.c \ + cityhash.c \ + zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zfs_uio.c \ + zpool_prop.c \ + zprop_common.c + + + +ARCH_C = +.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" +ARCH_C += zfs_fletcher_intel.c \ + zfs_fletcher_sse.c +CFLAGS += -DHAVE_SSE2 +.endif +.if ${MACHINE_ARCH} == "amd64" +ARCH_C += zfs_fletcher_avx512.c +CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_AVX512F +.endif +.if ${MACHINE_ARCH} == "aarch64" +ARCH_C += zfs_fletcher_aarch64_neon.c +.endif + +SRCS= $(USER_C) $(KERNEL_C) $(ARCH_C) + +WARNS?= 2 SHLIB_MAJOR= 3 CSTD= c99 -CFLAGS+= -DZFS_NO_ACL -CFLAGS+= -I${SRCTOP}/sbin/mount -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libshare +CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include +CFLAGS+= 
-I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + .include Index: cddl/lib/libzfs_core/Makefile =================================================================== --- cddl/lib/libzfs_core/Makefile +++ cddl/lib/libzfs_core/Makefile @@ -1,37 +1,27 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/compat/opensolaris/misc -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs_core +.PATH: ${SRCTOP}/sys/contrib/openzfs/include + LIB= zfs_core LIBADD= nvpair PACKAGE= runtime -INCS= libzfs_core.h -SRCS= libzfs_core.c \ - libzfs_core_compat.c \ - zfs_ioctl_compat.c - -SRCS+= libzfs_compat.c +SRCS= libzfs_core.c -WARNS?= 0 +WARNS?= 2 CSTD= c99 -CFLAGS+= -DZFS_NO_ACL -CFLAGS+= -I${SRCTOP}/sbin/mount -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzfs_core/common +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h .include Index: cddl/lib/libzpool/Makefile =================================================================== --- cddl/lib/libzpool/Makefile +++ cddl/lib/libzpool/Makefile @@ -1,20 +1,17 @@ # $FreeBSD$ -.include "${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/Makefile.files" # ZFS_COMMON_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs +.PATH: 
${SRCTOP}/sys/contrib/openzfs/module/zfs +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/zcommon +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/unicode # LUA_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua -# ZFS_SHARED_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -# LZ4_COMMON_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/lz4 -# KERNEL_SRCS -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -# LIST_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/os -# ATOMIC_SRCS +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/lua + +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/os/linux/zfs + +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzpool + .if exists(${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}/opensolaris_atomic.S) .PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH} ATOMIC_SRCS= opensolaris_atomic.S @@ -23,40 +20,218 @@ .PATH: ${SRCTOP}/sys/cddl/compat/opensolaris/kern ATOMIC_SRCS= opensolaris_atomic.c .endif -# UNICODE_SRCS -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/unicode -# LIBCMDUTILS_SRCS -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils/common + +.if ${MACHINE_ARCH} == "powerpc" +# Don't waste GOT entries on small data. +PICFLAG= -fPIC +.endif LIB= zpool -ZFS_COMMON_SRCS= ${ZFS_COMMON_OBJS:C/.o$/.c/} trim_map.c -ZFS_SHARED_SRCS= ${ZFS_SHARED_OBJS:C/.o$/.c/} -LZ4_COMMON_SRCS= lz4.c -LUA_SRCS= ${LUA_OBJS:C/.o$/.c/} -KERNEL_SRCS= kernel.c taskq.c util.c -LIST_SRCS= list.c -UNICODE_SRCS= u8_textprep.c -LIBCMDUTILS_SRCS=nicenum.c - -SRCS= ${ZFS_COMMON_SRCS} ${ZFS_SHARED_SRCS} ${LUA_SRCS} \ - ${LZ4_COMMON_SRCS} ${KERNEL_SRCS} ${LIST_SRCS} ${ATOMIC_SRCS} \ - ${UNICODE_SRCS} ${LIBCMDUTILS_SRCS} - -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/lz4 -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils + + +USER_C = \ + kernel.c \ + taskq.c \ + util.c + +KERNEL_C = \ + zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zfs_uio.c \ + zpool_prop.c \ + zprop_common.c \ + abd.c \ + abd_os.c \ + aggsum.c \ + arc.c \ + arc_os.c \ + blkptr.c \ + bplist.c \ + bpobj.c \ + bptree.c \ + btree.c \ + bqueue.c \ + cityhash.c \ + dbuf.c \ + dbuf_stats.c \ + ddt.c \ + ddt_zap.c \ + dmu.c \ + dmu_diff.c \ + dmu_object.c \ + dmu_objset.c \ + dmu_recv.c \ + dmu_redact.c \ + dmu_send.c \ + dmu_traverse.c \ + dmu_tx.c \ + dmu_zfetch.c \ + dnode.c \ + dnode_sync.c \ + dsl_bookmark.c \ + dsl_dataset.c \ + dsl_deadlist.c \ + dsl_deleg.c \ + dsl_dir.c \ + dsl_crypt.c \ + dsl_pool.c \ + dsl_prop.c \ + dsl_scan.c \ + dsl_synctask.c \ + dsl_destroy.c \ + dsl_userhold.c \ + edonr_zfs.c \ + hkdf.c \ + fm.c \ + gzip.c \ + lzjb.c \ + lz4.c \ + metaslab.c \ + mmp.c \ + multilist.c \ + 
objlist.c \ + pathname.c \ + range_tree.c \ + refcount.c \ + rrwlock.c \ + sa.c \ + sha256.c \ + skein_zfs.c \ + spa.c \ + spa_boot.c \ + spa_checkpoint.c \ + spa_config.c \ + spa_errlog.c \ + spa_history.c \ + spa_log_spacemap.c \ + spa_misc.c \ + spa_stats.c \ + space_map.c \ + space_reftree.c \ + txg.c \ + trace.c \ + uberblock.c \ + unique.c \ + vdev.c \ + vdev_cache.c \ + vdev_file.c \ + vdev_indirect_births.c \ + vdev_indirect.c \ + vdev_indirect_mapping.c \ + vdev_initialize.c \ + vdev_label.c \ + vdev_mirror.c \ + vdev_missing.c \ + vdev_queue.c \ + vdev_raidz.c \ + vdev_raidz_math_aarch64_neon.c \ + vdev_raidz_math_aarch64_neonx2.c \ + vdev_raidz_math_avx2.c \ + vdev_raidz_math_avx512bw.c \ + vdev_raidz_math_avx512f.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ + vdev_rebuild.c \ + vdev_removal.c \ + vdev_root.c \ + vdev_trim.c \ + zap.c \ + zap_leaf.c \ + zap_micro.c \ + zcp.c \ + zcp_get.c \ + zcp_global.c \ + zcp_iter.c \ + zcp_set.c \ + zcp_synctask.c \ + zfeature.c \ + zfs_byteswap.c \ + zfs_debug.c \ + zfs_fm.c \ + zfs_fuid.c \ + zfs_sa.c \ + zfs_znode.c \ + zfs_ratelimit.c \ + zfs_rlock.c \ + zil.c \ + zio.c \ + zio_checksum.c \ + zio_compress.c \ + zio_crypt.c \ + zio_inject.c \ + zle.c \ + zrlock.c \ + zthr.c + +ARCH_C = +.if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" +ARCH_C += vdev_raidz_math_sse2.c \ + vdev_raidz_math_ssse3.c \ + zfs_fletcher_intel.c \ + zfs_fletcher_sse.c +CFLAGS += -DHAVE_SSE2 -DHAVE_SSE3 +.endif +.if ${MACHINE_ARCH} == "amd64" +ARCH_C += zfs_fletcher_avx512.c +CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_AVX512F \ + -DHAVE_AVX512BW +.endif +.if ${MACHINE_ARCH} == "aarch64" +ARCH_C += zfs_fletcher_aarch64_neon.c +.endif + +LUA_C = \ + lapi.c \ + lauxlib.c \ + lbaselib.c \ + lcode.c \ + lcompat.c \ + lcorolib.c \ + lctype.c \ + ldebug.c \ + ldo.c \ + lfunc.c \ + lgc.c \ + llex.c \ + lmem.c \ + lobject.c \ + lopcodes.c \ + lparser.c \ + lstate.c \ + lstring.c \ + lstrlib.c \ + ltable.c \ + ltablib.c \ + ltm.c \ + lvm.c \ + lzio.c + +UNICODE_C = u8_textprep.c uconv.c + +SRCS= ${USER_C} ${KERNEL_C} ${LUA_C} ${UNICODE_C} ${ARCH_C} + +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h +CFLAGS+= -I${SRCTOP}/sys/modules/zfs +CFLAGS+= -DLIB_ZPOOL_BUILD -DZFS_DEBUG + + # XXX: pthread doesn't have mutex_owned() equivalent, so we need to look # into libthr private structures. That's sooo evil, but it's only for # ZFS debugging tools needs. @@ -64,10 +239,9 @@ CFLAGS+= -I${SRCTOP}/lib/libpthread/thread CFLAGS+= -I${SRCTOP}/lib/libpthread/sys CFLAGS+= -I${SRCTOP}/lib/libthr/arch/${MACHINE_CPUARCH}/include -CFLAGS.lz4.c+= -D_FAKE_KERNEL CFLAGS.gcc+= -fms-extensions -LIBADD= md pthread z nvpair avl umem +LIBADD= md pthread z spl icp nvpair avl umem # atomic.S doesn't like profiling. 
MK_PROFILE= no Index: cddl/lib/libzpool/Makefile.depend =================================================================== --- cddl/lib/libzpool/Makefile.depend +++ cddl/lib/libzpool/Makefile.depend @@ -3,6 +3,7 @@ DIRDEPS = \ cddl/lib/libavl \ + cddl/lib/libicp \ cddl/lib/libnvpair \ cddl/lib/libumem \ gnu/lib/csu \ Index: cddl/lib/libzutil/Makefile =================================================================== --- /dev/null +++ cddl/lib/libzutil/Makefile @@ -0,0 +1,42 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzutil +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzutil/os/freebsd +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/os/freebsd/zfs + +LIB= zutil +LIBADD= avl tpool +PACKAGE= runtime + +INCS = zutil_import.h + +SRCS = \ + zutil_device_path.c \ + zutil_import.c \ + zutil_import.h \ + zutil_nicenum.c \ + zutil_pool.c + +SRCS += \ + zutil_device_path_os.c \ + zutil_import_os.c \ + zutil_compat.c + +SRCS += zfs_ioctl_compat.c + + +WARNS?= 2 +CSTD= c99 + +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzutil +CFLAGS+= -DHAVE_ISSETUGID -DIN_BASE +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + +.include Index: cddl/sbin/zfs/Makefile =================================================================== --- cddl/sbin/zfs/Makefile +++ cddl/sbin/zfs/Makefile @@ -1,27 +1,35 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zfs +.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/zfs +.PATH: ${SRCTOP}/sys/contrib/openzfs/module/os/freebsd/spl PACKAGE= runtime PROG= zfs -MAN= zfs.8 zfs-program.8 -SRCS= zfs_main.c zfs_iter.c +MAN= +#MAN= zfs.8 zfs-program.8 +SRCS = \ + zfs_iter.c \ + zfs_iter.h \ + zfs_main.c \ + zfs_util.h \ + zfs_project.c \ + zfs_projectutil.h -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libumem/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h -LIBADD= jail nvpair uutil zfs_core zfs +CFLAGS+= 
-I${SRCTOP}/sys/modules/zfs +LIBADD= jail avl nvpair geom uutil zfs_core spl tpool zutil zfs m +LDADD+= -pthread .include Index: cddl/sbin/zpool/Makefile =================================================================== --- cddl/sbin/zpool/Makefile +++ cddl/sbin/zpool/Makefile @@ -1,32 +1,37 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zpool -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/stat/common -.PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs +.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/zpool +.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/zpool/os/freebsd + PACKAGE= runtime PROG= zpool -MAN= zpool.8 zpool-features.7 -SRCS= zpool_main.c zpool_vdev.c zpool_iter.c zpool_util.c zfs_comutil.c -SRCS+= timestamp.c +#MAN= zpool.8 zpool-features.7 +MAN= + +SRCS = \ + zpool_iter.c \ + zpool_main.c \ + zpool_util.c \ + zpool_util.h \ + zpool_vdev.c + + +SRCS += zpool_vdev_os.c -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libumem/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/cmd/stat/common - -LIBADD= geom nvpair uutil zfs +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/cmd/zpool +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h +CFLAGS+= -DSYSCONFDIR=\"/etc\" +LIBADD= geom nvpair uutil zfs zutil avl spl tpool zfs_core m +LDADD+= -pthread .include Index: cddl/usr.bin/Makefile =================================================================== --- cddl/usr.bin/Makefile +++ cddl/usr.bin/Makefile @@ -8,6 +8,7 @@ ctfmerge \ ${_zinject} \ ${_zlook} \ + ${_zstream} \ ${_zstreamdump} \ ${_ztest} @@ -18,7 +19,7 @@ #_zlook= zlook .if ${MK_LIBTHR} != "no" _ztest= ztest -_zstreamdump = zstreamdump +_zstream = zstream .endif .endif Index: cddl/usr.bin/ctfconvert/Makefile =================================================================== --- cddl/usr.bin/ctfconvert/Makefile +++ cddl/usr.bin/ctfconvert/Makefile @@ -27,6 +27,12 @@ traverse.c \ util.c +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR} \ @@ -35,8 +41,9 @@ 
-I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/common \ -I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/cvt \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common +CFLAGS+= -DHAVE_ISSETUGID -LIBADD= dwarf elf z pthread +LIBADD= spl dwarf elf z pthread HAS_TESTS= SUBDIR.${MK_TESTS}+= tests Index: cddl/usr.bin/ctfconvert/Makefile.depend =================================================================== --- cddl/usr.bin/ctfconvert/Makefile.depend +++ cddl/usr.bin/ctfconvert/Makefile.depend @@ -5,6 +5,7 @@ gnu/lib/csu \ include \ include/xlocale \ + cddl/lib/libspl \ lib/${CSU_DIR} \ lib/libc \ lib/libcompiler_rt \ Index: cddl/usr.bin/ctfdump/Makefile =================================================================== --- cddl/usr.bin/ctfdump/Makefile +++ cddl/usr.bin/ctfdump/Makefile @@ -8,6 +8,13 @@ symbol.c \ utils.c +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${OPENSOLARIS_USR_DISTDIR} \ -I${OPENSOLARIS_SYS_DISTDIR} \ -I${OPENSOLARIS_USR_DISTDIR}/head \ @@ -16,6 +23,7 @@ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/common \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common +CFLAGS+= -DHAVE_ISSETUGID LIBADD= elf z Index: cddl/usr.bin/ctfmerge/Makefile =================================================================== --- cddl/usr.bin/ctfmerge/Makefile +++ cddl/usr.bin/ctfmerge/Makefile @@ -24,6 +24,13 @@ WARNS?= 1 + +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR} \ @@ -32,7 +39,8 @@ -I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/common \ -I${OPENSOLARIS_USR_DISTDIR}/tools/ctf/cvt \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common +CFLAGS+= -DHAVE_ISSETUGID -LIBADD= elf z pthread +LIBADD= spl elf z pthread .include Index: cddl/usr.bin/zinject/Makefile =================================================================== --- cddl/usr.bin/zinject/Makefile +++ cddl/usr.bin/zinject/Makefile @@ -1,24 +1,24 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zinject +.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/zinject PROG= zinject +INCS= zinject.h SRCS= zinject.c translate.c MAN= -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= 
-I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs/ -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h -LIBADD= geom m nvpair umem uutil zfs_core zfs zpool +LIBADD= geom m nvpair umem uutil avl spl zfs_core zfs zutil zpool .include Index: cddl/usr.bin/zstream/Makefile =================================================================== --- /dev/null +++ cddl/usr.bin/zstream/Makefile @@ -0,0 +1,27 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/zstream + +PROG= zstream +INCS= zstream.h +SRCS= zstream.c zstream_dump.c zstream_redup.c \ + zstream_token.c + + +MAN= + +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + +LIBADD= geom m nvpair umem uutil avl spl zfs_core zfs zutil zpool + +.include Index: cddl/usr.bin/zstreamdump/Makefile =================================================================== --- cddl/usr.bin/zstreamdump/Makefile +++ cddl/usr.bin/zstreamdump/Makefile @@ -16,7 +16,7 @@ CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -LIBADD= m nvpair umem zpool zfs pthread z avl +LIBADD= m spl nvpair umem zpool zfs pthread z avl CSTD= c99 Index: cddl/usr.bin/ztest/Makefile =================================================================== --- cddl/usr.bin/ztest/Makefile +++ cddl/usr.bin/ztest/Makefile @@ -1,30 +1,29 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/ztest +.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/ztest PROG= ztest MAN= -WARNS?= 0 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head - -LIBADD= geom m nvpair umem zpool pthread avl zfs_core zfs uutil +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + +LIBADD= geom m nvpair umem zpool pthread avl zfs_core spl zutil zfs uutil icp CSTD= c99 # Since there are many asserts in this program, it makes no sense to compile # it 
without debugging. -CFLAGS+= -g -DDEBUG=1 -Wno-format +CFLAGS+= -g -DDEBUG=1 -Wno-format -DZFS_DEBUG=1 CFLAGS.gcc+= -fms-extensions HAS_TESTS= Index: cddl/usr.sbin/dtrace/Makefile =================================================================== --- cddl/usr.sbin/dtrace/Makefile +++ cddl/usr.sbin/dtrace/Makefile @@ -10,6 +10,13 @@ WARNS?= 1 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR}/head \ @@ -17,12 +24,13 @@ -I${OPENSOLARIS_USR_DISTDIR}/lib/libproc/common \ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common \ -I${OPENSOLARIS_SYS_DISTDIR}/compat +CFLAGS+= -DHAVE_ISSETUGID # Optional debugging stuff... #CFLAGS+= -DNEED_ERRLOC #YFLAGS+= -d -LIBADD= dtrace ctf elf proc +LIBADD= dtrace ctf elf proc spl .if ${MK_DTRACE_TESTS} != "no" SUBDIR+= tests Index: cddl/usr.sbin/lockstat/Makefile =================================================================== --- cddl/usr.sbin/lockstat/Makefile +++ cddl/usr.sbin/lockstat/Makefile @@ -8,6 +8,14 @@ WARNS?= 1 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR}/head \ @@ -16,6 +24,7 @@ -I${OPENSOLARIS_SYS_DISTDIR}/uts/common \ -I${OPENSOLARIS_SYS_DISTDIR}/compat \ -I${SRCTOP}/sys +CFLAGS+= -DHAVE_ISSETUGID CFLAGS+= -DNEED_ERRLOC -g Index: cddl/usr.sbin/plockstat/Makefile =================================================================== --- cddl/usr.sbin/plockstat/Makefile +++ cddl/usr.sbin/plockstat/Makefile @@ -8,6 +8,13 @@ WARNS?= 1 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris \ -I${SRCTOP}/cddl/compat/opensolaris/include \ -I${OPENSOLARIS_USR_DISTDIR}/head \ @@ -17,6 +24,7 @@ -I${OPENSOLARIS_SYS_DISTDIR}/compat \ -I${SRCTOP}/cddl/lib/libdtrace \ -I${SRCTOP}/sys +CFLAGS+= -DHAVE_ISSETUGID LIBADD= dtrace proc Index: cddl/usr.sbin/zdb/Makefile =================================================================== --- cddl/usr.sbin/zdb/Makefile +++ cddl/usr.sbin/zdb/Makefile @@ -1,33 +1,28 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zdb +.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/zdb PROG= zdb -MAN= zdb.8 +#MAN= zdb.8 +MAN= +INCS= zdb.h SRCS= zdb.c zdb_il.c WARNS?= 2 CSTD= c99 - -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= 
-I${SRCTOP}/cddl/contrib/opensolaris/lib/libcmdutils -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head - -LIBADD= nvpair umem uutil zfs zpool +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +LIBADD= nvpair umem uutil zfs spl avl zutil zpool CFLAGS.gcc+= -fms-extensions # Since there are many asserts in this program, it makes no sense to compile # it without debugging. -CFLAGS+= -g -DDEBUG=1 +CFLAGS+= -g -DDEBUG=1 -DZFS_DEBUG=1 .include Index: cddl/usr.sbin/zfsd/Makefile.common =================================================================== --- cddl/usr.sbin/zfsd/Makefile.common +++ cddl/usr.sbin/zfsd/Makefile.common @@ -10,29 +10,24 @@ zpool_list.cc \ zfsd_main.cc -WARNS?= 3 +WARNS?= 2 # Ignore warnings about Solaris specific pragmas. IGNORE_PRAGMA= YES -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -INCFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -INCFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -INCFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libumem/common -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -INCFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -INCFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -INCFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -INCFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -INCFLAGS+= -I${SRCTOP}/cddl/usr.sbin +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -I${SRCTOP}/cddl/usr.sbin -CFLAGS+= -DNEED_SOLARIS_BOOLEAN ${INCFLAGS} +# use issetugid(2) +CFLAGS+= -D_MACHINE_FLOAT_H_ -DHAVE_ISSETUGID -LIBADD+= devdctl zfs zfs_core util geom bsdxml sbuf nvpair uutil +LIBADD+= devdctl zfs zfs_core util geom bsdxml sbuf nvpair avl uutil zutil cscope: find ${.CURDIR} -type f -a \( -name "*.[ch]" -o -name "*.cc" \) \ Index: cddl/usr.sbin/zfsd/callout.cc =================================================================== --- cddl/usr.sbin/zfsd/callout.cc +++ cddl/usr.sbin/zfsd/callout.cc @@ -39,6 +39,7 @@ * timer services built on top of the POSIX interval timer. 
*/ +#include #include #include Index: cddl/usr.sbin/zfsd/case_file.cc =================================================================== --- cddl/usr.sbin/zfsd/case_file.cc +++ cddl/usr.sbin/zfsd/case_file.cc @@ -39,11 +39,13 @@ * accumulate in order to mark a device as degraded. */ #include +#include #include #include #include +#include #include #include #include @@ -75,7 +77,6 @@ __FBSDID("$FreeBSD$"); /*============================ Namespace Control =============================*/ -using std::auto_ptr; using std::hex; using std::ifstream; using std::stringstream; @@ -239,8 +240,6 @@ { ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID); zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front()); - zpool_boot_label_t boot_type; - uint64_t boot_size; if (pool == NULL || !RefreshVdevState()) { /* @@ -333,13 +332,7 @@ } /* Write a label on the newly inserted disk. */ - if (zpool_is_bootable(pool)) - boot_type = ZPOOL_COPY_BOOT_LABEL; - else - boot_type = ZPOOL_NO_BOOT_LABEL; - boot_size = zpool_get_prop_int(pool, ZPOOL_PROP_BOOTSIZE, NULL); - if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str(), - boot_type, boot_size, NULL) != 0) { + if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) { syslog(LOG_ERR, "Replace vdev(%s/%s) by physical path (label): %s: %s\n", zpool_get_name(pool), VdevGUIDString().c_str(), @@ -1118,7 +1111,7 @@ nvlist_free(newvd); retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot, - /*replace*/B_TRUE) == 0); + /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0); if (retval) syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n", poolname, oldstr.c_str(), path); Index: cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc =================================================================== --- cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc +++ cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc @@ -30,6 +30,7 @@ * Authors: Alan Somers (Spectra Logic Corporation) */ #include +#include #include #include Index: cddl/usr.sbin/zfsd/vdev.cc =================================================================== --- cddl/usr.sbin/zfsd/vdev.cc +++ cddl/usr.sbin/zfsd/vdev.cc @@ -39,6 +39,7 @@ */ #include #include +#include #include #include Index: cddl/usr.sbin/zfsd/vdev_iterator.cc =================================================================== --- cddl/usr.sbin/zfsd/vdev_iterator.cc +++ cddl/usr.sbin/zfsd/vdev_iterator.cc @@ -38,6 +38,7 @@ * Implementation of the VdevIterator class. 
*/ #include +#include #include #include Index: cddl/usr.sbin/zfsd/zfsd.cc =================================================================== --- cddl/usr.sbin/zfsd/zfsd.cc +++ cddl/usr.sbin/zfsd/zfsd.cc @@ -42,10 +42,12 @@ */ #include +#include #include #include #include +#include #include #include #include Index: cddl/usr.sbin/zfsd/zfsd_event.cc =================================================================== --- cddl/usr.sbin/zfsd/zfsd_event.cc +++ cddl/usr.sbin/zfsd/zfsd_event.cc @@ -34,6 +34,7 @@ * \file zfsd_event.cc */ #include +#include #include #include #include @@ -41,6 +42,7 @@ #include #include +#include /* * Undefine flush, defined by cpufunc.h on sparc64, because it conflicts with * C++ flush methods */ @@ -190,7 +192,8 @@ if (poolName != NULL) free(poolName); - nlabels = zpool_read_all_labels(devFd, &devLabel); + if (zpool_read_label(devFd, &devLabel, &nlabels) != 0) + return (NULL); /* * If we find a disk with fewer than the maximum number of * labels, it might be the whole disk of a partitioned disk Index: cddl/usr.sbin/zfsd/zfsd_exception.cc =================================================================== --- cddl/usr.sbin/zfsd/zfsd_exception.cc +++ cddl/usr.sbin/zfsd/zfsd_exception.cc @@ -36,6 +36,7 @@ * Implementation of the ZfsdException class. */ #include +#include #include #include Index: cddl/usr.sbin/zfsd/zpool_list.cc =================================================================== --- cddl/usr.sbin/zfsd/zpool_list.cc +++ cddl/usr.sbin/zfsd/zpool_list.cc @@ -38,6 +38,7 @@ * Implementation of the ZpoolList class. */ #include +#include #include #include Index: cddl/usr.sbin/zhack/Makefile =================================================================== --- cddl/usr.sbin/zhack/Makefile +++ cddl/usr.sbin/zhack/Makefile @@ -1,6 +1,6 @@ # $FreeBSD$ -.PATH: ${SRCTOP}/cddl/contrib/opensolaris/cmd/zhack +.PATH: ${SRCTOP}/sys/contrib/openzfs/cmd/zhack PROG= zhack MAN= @@ -8,20 +8,20 @@ WARNS?= 0 CSTD= c99 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +WARNS?= 2 +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libuutil/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head - -LIBADD= nvpair zfs zpool +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + + +LIBADD= nvpair zfs spl zutil zpool CFLAGS+= -DDEBUG=1 #DEBUG_FLAGS+= -g Index: include/Makefile =================================================================== --- include/Makefile +++ include/Makefile @@ -244,7 +244,7 @@ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 teken.h \ ${SDESTDIR}${INCLUDEDIR}/teken .if ${MK_CDDL} != 
"no" - cd ${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/sys; \ + cd ${SRCTOP}/sys/contrib/openzfs/include/sys; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 nvpair.h \ ${SDESTDIR}${INCLUDEDIR}/sys .endif @@ -377,7 +377,7 @@ done .if ${MK_CDDL} != "no" ${INSTALL_SYMLINK} ${TAG_ARGS} \ - ../../../sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h \ + ../../../sys/contrib/openzfs/include/sys/nvpair.h \ ${SDESTDIR}${INCLUDEDIR}/sys .endif .if ${MK_MLX5TOOL} != "no" Index: lib/libbe/Makefile =================================================================== --- lib/libbe/Makefile +++ lib/libbe/Makefile @@ -16,19 +16,18 @@ IGNORE_PRAGMA= yes LIBADD+= zfs -LIBADD+= nvpair - -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris +LIBADD+= nvpair spl + +CFLAGS+= -DIN_BASE -DHAVE_RPC_TYPES +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head - -CFLAGS+= -DNEED_SOLARIS_BOOLEAN +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID + HAS_TESTS= YES SUBDIR.${MK_TESTS}+= tests Index: lib/libbe/be.c =================================================================== --- lib/libbe/be.c +++ lib/libbe/be.c @@ -35,10 +35,13 @@ #include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -993,12 +996,8 @@ ZFS_TYPE_FILESYSTEM)) == NULL) return (set_error(lbh, BE_ERR_ZFSOPEN)); - /* recurse, nounmount, forceunmount */ - struct renameflags flags = { - .nounmount = 1, - }; - err = zfs_rename(zfs_hdl, NULL, full_new, flags); + err = zfs_rename(zfs_hdl, full_new, B_FALSE, B_FALSE); zfs_close(zfs_hdl); if (err != 0) @@ -1025,7 +1024,7 @@ if ((zfs = zfs_open(lbh->lzh, buf, ZFS_TYPE_DATASET)) == NULL) return (set_error(lbh, BE_ERR_ZFSOPEN)); - err = zfs_send_one(zfs, NULL, fd, flags); + err = zfs_send_one(zfs, NULL, fd, &flags, /* redactbook */ NULL); zfs_close(zfs); return (err); Index: lib/libbe/tests/Makefile =================================================================== --- lib/libbe/tests/Makefile +++ lib/libbe/tests/Makefile @@ -8,14 +8,19 @@ SRCS_target_prog= target_prog.c BINDIR_target_prog= ${TESTSDIR} -LIBADD+= zfs -LIBADD+= nvpair -LIBADD+= be +LIBADD+= zfs \ + spl \ + nvpair \ + be \ CFLAGS+= -I${SRCTOP}/lib/libbe -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common - -CFLAGS+= -DNEED_SOLARIS_BOOLEAN +CFLAGS+= -DIN_BASE -DHAVE_RPC_TYPES +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID .include Index: 
lib/libproc/Makefile =================================================================== --- lib/libproc/Makefile +++ lib/libproc/Makefile @@ -29,6 +29,13 @@ .if ${MK_CDDL} != "no" LIBADD+= ctf IGNORE_PRAGMA= YES +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID -DHAVE_BOOLEAN CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libctf/common \ -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common \ -I${SRCTOP}/sys/cddl/compat/opensolaris Index: lib/libproc/proc_bkpt.c =================================================================== --- lib/libproc/proc_bkpt.c +++ lib/libproc/proc_bkpt.c @@ -112,7 +112,7 @@ return (-1); } - DPRINTFX("adding breakpoint at 0x%lx", address); + DPRINTFX("adding breakpoint at 0x%lx", (unsigned long)address); stopped = 0; if (phdl->status != PS_STOP) { @@ -173,7 +173,7 @@ return (-1); } - DPRINTFX("removing breakpoint at 0x%lx", address); + DPRINTFX("removing breakpoint at 0x%lx", (unsigned long)address); stopped = 0; if (phdl->status != PS_STOP) { Index: lib/libproc/proc_sym.c =================================================================== --- lib/libproc/proc_sym.c +++ lib/libproc/proc_sym.c @@ -307,7 +307,7 @@ */ if (data->d_size < sizeof(crc) + 1) { DPRINTFX("ERROR: debuglink section is too small (%zd bytes)", - data->d_size); + (ssize_t)data->d_size); goto internal; } if (strnlen(data->d_buf, data->d_size) >= data->d_size - sizeof(crc)) { @@ -510,7 +510,7 @@ int error; if ((mapping = _proc_addr2map(p, addr)) == NULL) { - DPRINTFX("ERROR: proc_addr2map failed to resolve 0x%jx", addr); + DPRINTFX("ERROR: proc_addr2map failed to resolve 0x%jx", (uintmax_t)addr); return (-1); } if (open_object(mapping) != 0) { Index: lib/libprocstat/libprocstat.c =================================================================== --- lib/libprocstat/libprocstat.c +++ lib/libprocstat/libprocstat.c @@ -70,6 +70,7 @@ #include #define _KERNEL #include +#include #include #include #include Index: lib/libprocstat/zfs/Makefile =================================================================== --- lib/libprocstat/zfs/Makefile +++ lib/libprocstat/zfs/Makefile @@ -6,15 +6,19 @@ OBJS= zfs_defs.o WARNS?= 1 -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/common/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head -CFLAGS+= -I${.CURDIR:H} -CFLAGS+= -DNEED_SOLARIS_BOOLEAN + +CFLAGS+= -DIN_BASE -D__KERNEL__ -D_KERNEL -I. -I${.CURDIR} +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs +CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include + +CFLAGS+= -I${SRCTOP}/sys -I. -I.. 
+CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID -D_SYS_VMEM_H_ -UKLD_TIED -DKLD_MODULE + CFLAGS+= -fno-builtin -nostdlib all: ${OBJS} Index: lib/libprocstat/zfs_defs.c =================================================================== --- lib/libprocstat/zfs_defs.c +++ lib/libprocstat/zfs_defs.c @@ -26,13 +26,9 @@ */ #include +#include __FBSDID("$FreeBSD$"); -/* Pretend we are kernel to get the same binary layout. */ -#define _KERNEL - -/* A hack to deal with kpilite.h. */ -#define KLD_MODULE /* * Prevent some headers from getting included and fake some types @@ -41,14 +37,40 @@ */ #define _OPENSOLARIS_SYS_PATHNAME_H_ #define _OPENSOLARIS_SYS_POLICY_H_ -#define _OPENSOLARIS_SYS_VNODE_H_ #define _VNODE_PAGER_ -typedef struct vnode vnode_t; -typedef struct vattr vattr_t; -typedef struct xvattr xvattr_t; -typedef struct vsecattr vsecattr_t; -typedef enum vtype vtype_t; + +enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, + VMARKER }; + +/* + * Vnode attributes. A field value of VNOVAL represents a field whose value + * is unavailable (getattr) or which is not to be changed (setattr). + */ +struct vattr { + enum vtype va_type; /* vnode type (for create) */ + u_short va_mode; /* files access mode and type */ + u_short va_padding0; + uid_t va_uid; /* owner user id */ + gid_t va_gid; /* owner group id */ + nlink_t va_nlink; /* number of references to file */ + dev_t va_fsid; /* filesystem id */ + ino_t va_fileid; /* file id */ + u_quad_t va_size; /* file size in bytes */ + long va_blocksize; /* blocksize preferred for i/o */ + struct timespec va_atime; /* time of last access */ + struct timespec va_mtime; /* time of last modification */ + struct timespec va_ctime; /* time file changed */ + struct timespec va_birthtime; /* time file created */ + u_long va_gen; /* generation number of file */ + u_long va_flags; /* flags defined for file */ + dev_t va_rdev; /* device the special file represents */ + u_quad_t va_bytes; /* bytes of disk space held by file */ + u_quad_t va_filerev; /* file modification number */ + u_int va_vaflags; /* operations flags, see below */ + long va_spare; /* remain quad aligned */ +}; + #include #include Index: libexec/rc/rc.d/zfs =================================================================== --- libexec/rc/rc.d/zfs +++ libexec/rc/rc.d/zfs @@ -25,6 +25,13 @@ zfs_start_main() { + local cachefile + + for cachefile in /boot/zfs/zpool.cache /etc/zfs/zpool.cache; do + if [ -r $cachefile ]; then + zpool import -c $cachefile -a + fi + done zfs mount -va zfs share -a if [ ! 
-r /etc/zfs/exports ]; then Index: rescue/rescue/Makefile =================================================================== --- rescue/rescue/Makefile +++ rescue/rescue/Makefile @@ -129,7 +129,7 @@ CRUNCH_LIBS+= -l80211 -lalias -lcam -lncursesw -ldevstat -lipsec -llzma .if ${MK_ZFS} != "no" CRUNCH_LIBS+= -lavl -lzpool -lzfs_core -lzfs -lnvpair -lpthread -luutil -lumem -CRUNCH_LIBS+= -lbe +CRUNCH_LIBS+= -lbe -lzutil -ltpool -lspl -licp_rescue .else # liblzma needs pthread CRUNCH_LIBS+= -lpthread Index: sbin/bectl/Makefile =================================================================== --- sbin/bectl/Makefile +++ sbin/bectl/Makefile @@ -7,16 +7,22 @@ SRCS= bectl.c bectl_jail.c bectl_list.c -LIBADD+= be -LIBADD+= jail -LIBADD+= nvpair -LIBADD+= util - -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common - -CFLAGS+= -DNEED_SOLARIS_BOOLEAN +LIBADD+= be \ + jail \ + nvpair \ + spl \ + util \ + +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/ +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h HAS_TESTS= yes SUBDIR.${MK_TESTS}+= tests Index: sbin/bectl/Makefile.depend =================================================================== --- sbin/bectl/Makefile.depend +++ sbin/bectl/Makefile.depend @@ -4,6 +4,7 @@ DIRDEPS = \ cddl/lib/libavl \ cddl/lib/libnvpair \ + cddl/lib/libspl \ cddl/lib/libumem \ cddl/lib/libuutil \ cddl/lib/libzfs \ Index: sbin/bectl/bectl.c =================================================================== --- sbin/bectl/bectl.c +++ sbin/bectl/bectl.c @@ -60,6 +60,8 @@ libbe_handle_t *be; +int aok; + int usage(bool explicit) { Index: sbin/zfsbootcfg/Makefile =================================================================== --- sbin/zfsbootcfg/Makefile +++ sbin/zfsbootcfg/Makefile @@ -2,7 +2,7 @@ # $FreeBSD$ PROG= zfsbootcfg -WARNS?= 1 +WARNS?= 2 MAN= zfsbootcfg.8 LIBADD+=zfs @@ -11,17 +11,16 @@ LIBADD+=uutil LIBADD+=geom +CFLAGS+= -DIN_BASE +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include -CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzfs_core/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair -CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs -CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common -CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head - -CFLAGS+= -DNEED_SOLARIS_BOOLEAN +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -DHAVE_ISSETUGID +CFLAGS+= -include 
${SRCTOP}/sys/modules/zfs/zfs_config.h .include Index: share/mk/bsd.libnames.mk =================================================================== --- share/mk/bsd.libnames.mk +++ share/mk/bsd.libnames.mk @@ -80,6 +80,7 @@ LIBIBNETDISC?= ${LIBDESTDIR}${LIBDIR_BASE}/libibnetdisc.a LIBIBUMAD?= ${LIBDESTDIR}${LIBDIR_BASE}/libibumad.a LIBIBVERBS?= ${LIBDESTDIR}${LIBDIR_BASE}/libibverbs.a +LIBICP?= ${LIBDESTDIR}${LIBDIR_BASE}/libicp.a LIBIPSEC?= ${LIBDESTDIR}${LIBDIR_BASE}/libipsec.a LIBIPT?= ${LIBDESTDIR}${LIBDIR_BASE}/libipt.a LIBJAIL?= ${LIBDESTDIR}${LIBDIR_BASE}/libjail.a @@ -135,6 +136,7 @@ LIBSBUF?= ${LIBDESTDIR}${LIBDIR_BASE}/libsbuf.a LIBSDP?= ${LIBDESTDIR}${LIBDIR_BASE}/libsdp.a LIBSMB?= ${LIBDESTDIR}${LIBDIR_BASE}/libsmb.a +LIBSPL?= ${LIBDESTDIR}${LIBDIR_BASE}/libspl.a LIBSSL?= ${LIBDESTDIR}${LIBDIR_BASE}/libssl.a LIBSSP_NONSHARED?= ${LIBDESTDIR}${LIBDIR_BASE}/libssp_nonshared.a LIBSTATS?= ${LIBDESTDIR}${LIBDIR_BASE}/libstats.a @@ -146,6 +148,7 @@ LIBTERMCAPW?= ${LIBDESTDIR}${LIBDIR_BASE}/libtermcapw.a LIBTERMLIB?= "don't use LIBTERMLIB, use LIBTERMCAP" LIBTINFO?= "don't use LIBTINFO, use LIBNCURSES" +LIBTPOOL?= ${LIBDESTDIR}${LIBDIR_BASE}/libtpool.a LIBUFS?= ${LIBDESTDIR}${LIBDIR_BASE}/libufs.a LIBUGIDFW?= ${LIBDESTDIR}${LIBDIR_BASE}/libugidfw.a LIBULOG?= ${LIBDESTDIR}${LIBDIR_BASE}/libulog.a @@ -166,6 +169,7 @@ LIBZFS?= ${LIBDESTDIR}${LIBDIR_BASE}/libzfs.a LIBZFS_CORE?= ${LIBDESTDIR}${LIBDIR_BASE}/libzfs_core.a LIBZPOOL?= ${LIBDESTDIR}${LIBDIR_BASE}/libzpool.a +LIBZUTIL?= ${LIBDESTDIR}${LIBDIR_BASE}/libzutil.a # enforce the 2 -lpthread and -lc to always be the last in that exact order .if defined(LDADD) Index: share/mk/src.libnames.mk =================================================================== --- share/mk/src.libnames.mk +++ share/mk/src.libnames.mk @@ -124,6 +124,7 @@ heimntlm \ heimsqlite \ hx509 \ + icp \ ipsec \ ipt \ jail \ @@ -171,6 +172,7 @@ sdp \ sm \ smb \ + spl \ ssl \ ssp_nonshared \ stats \ @@ -180,6 +182,7 @@ tacplus \ termcap \ termcapw \ + tpool \ ufs \ ugidfw \ ulog \ @@ -198,6 +201,7 @@ zfs_core \ zfs \ zpool \ + zutil .if ${MK_BLACKLIST} != "no" _LIBRARIES+= \ @@ -354,9 +358,10 @@ _DP_ucl= m _DP_vmmapi= util _DP_opencsd= cxxrt -_DP_ctf= z +_DP_ctf= spl z _DP_dtrace= ctf elf proc pthread rtld_db _DP_xo= util +_DP_ztest= geom m nvpair umem zpool pthread avl zfs_core spl zutil zfs uutil icp # The libc dependencies are not strictly needed but are defined to make the # assert happy. 
_DP_c= compiler_rt @@ -374,11 +379,14 @@ _DP_ulog= md _DP_fifolog= z _DP_ipf= kvm -_DP_zfs= md pthread umem util uutil m nvpair avl bsdxml geom nvpair z \ - zfs_core +_DP_tpool= spl +_DP_uutil= avl spl +_DP_zfs= md pthread umem util uutil m avl bsdxml geom nvpair \ + z zfs_core zutil _DP_zfs_core= nvpair -_DP_zpool= md pthread z nvpair avl umem -_DP_be= zfs nvpair +_DP_zpool= md pthread z icp spl nvpair avl umem +_DP_zutil= avl tpool +_DP_be= zfs spl nvpair # OFED support .if ${MK_OFED} != "no" @@ -582,12 +590,15 @@ LIBAVLDIR= ${OBJTOP}/cddl/lib/libavl LIBCTFDIR= ${OBJTOP}/cddl/lib/libctf LIBDTRACEDIR= ${OBJTOP}/cddl/lib/libdtrace +LIBICPDIR= ${OBJTOP}/cddl/lib/libicp LIBNVPAIRDIR= ${OBJTOP}/cddl/lib/libnvpair LIBUMEMDIR= ${OBJTOP}/cddl/lib/libumem LIBUUTILDIR= ${OBJTOP}/cddl/lib/libuutil LIBZFSDIR= ${OBJTOP}/cddl/lib/libzfs LIBZFS_COREDIR= ${OBJTOP}/cddl/lib/libzfs_core LIBZPOOLDIR= ${OBJTOP}/cddl/lib/libzpool +LIBZUTILDIR= ${OBJTOP}/cddl/lib/libzutil +LIBTPOOLDIR= ${OBJTOP}/cddl/lib/libtpool # OFED support LIBCXGB4DIR= ${OBJTOP}/lib/ofed/libcxgb4 @@ -653,6 +664,7 @@ LIBPANELDIR= ${OBJTOP}/lib/ncurses/panel LIBPANELWDIR= ${OBJTOP}/lib/ncurses/panelw LIBCRYPTODIR= ${OBJTOP}/secure/lib/libcrypto +LIBSPLDIR= ${OBJTOP}/cddl/lib/libspl LIBSSHDIR= ${OBJTOP}/secure/lib/libssh LIBSSLDIR= ${OBJTOP}/secure/lib/libssl LIBTEKENDIR= ${OBJTOP}/sys/teken/libteken Index: sys/cddl/compat/opensolaris/kern/opensolaris.c =================================================================== --- sys/cddl/compat/opensolaris/kern/opensolaris.c +++ sys/cddl/compat/opensolaris/kern/opensolaris.c @@ -37,6 +37,8 @@ #include #include +extern struct opensolaris_utsname utsname; + cpu_core_t cpu_core[MAXCPU]; kmutex_t cpu_lock; solaris_cpu_t solaris_cpu[MAXCPU]; @@ -82,7 +84,6 @@ switch (type) { case MOD_LOAD: - utsname.nodename = prison0.pr_hostname; break; case MOD_UNLOAD: Index: sys/cddl/compat/opensolaris/sys/acl.h =================================================================== --- sys/cddl/compat/opensolaris/sys/acl.h +++ /dev/null @@ -1,39 +0,0 @@ -/*- - * Copyright (c) 2008, 2009 Edward Tomasz Napierała - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef OPENSOLARIS_SYS_ACL_H -#define OPENSOLARIS_SYS_ACL_H - -#include_next - -struct acl; - -void aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp); -int acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries); - -#endif /* OPENSOLARIS_SYS_ACL_H */ Index: sys/cddl/compat/opensolaris/sys/file.h =================================================================== --- sys/cddl/compat/opensolaris/sys/file.h +++ /dev/null @@ -1,64 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_FILE_H_ -#define _OPENSOLARIS_SYS_FILE_H_ - -#include_next - -#define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */ - -#ifdef _KERNEL -typedef struct file file_t; - -#include - -static __inline file_t * -getf(int fd, cap_rights_t *rightsp) -{ - struct file *fp; - - if (fget(curthread, fd, rightsp, &fp) == 0) - return (fp); - return (NULL); -} - -static __inline void -releasef(int fd) -{ - struct file *fp; - - /* No CAP_ rights required, as we're only releasing. */ - if (fget(curthread, fd, &cap_no_rights, &fp) == 0) { - fdrop(fp, curthread); - fdrop(fp, curthread); - } -} -#endif /* _KERNEL */ - -#endif /* !_OPENSOLARIS_SYS_FILE_H_ */ Index: sys/cddl/compat/opensolaris/sys/kobj.h =================================================================== --- sys/cddl/compat/opensolaris/sys/kobj.h +++ /dev/null @@ -1,60 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_KOBJ_H_ -#define _OPENSOLARIS_SYS_KOBJ_H_ - -#include -#include -#include_next -#ifdef AT_UID -#undef AT_UID -#endif -#ifdef AT_GID -#undef AT_GID -#endif -#include - -#define KM_NOWAIT 0x01 -#define KM_TMP 0x02 - -void kobj_free(void *address, size_t size); -void *kobj_alloc(size_t size, int flag); -void *kobj_zalloc(size_t size, int flag); - -struct _buf { - void *ptr; - int mounted; -}; - -struct _buf *kobj_open_file(const char *path); -int kobj_get_filesize(struct _buf *file, uint64_t *size); -int kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off); -void kobj_close_file(struct _buf *file); - -#endif /* _OPENSOLARIS_SYS_KOBJ_H_ */ Index: sys/cddl/compat/opensolaris/sys/lock.h =================================================================== --- sys/cddl/compat/opensolaris/sys/lock.h +++ /dev/null @@ -1,45 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_LOCK_H_ -#define _OPENSOLARIS_SYS_LOCK_H_ - -#include_next - -#ifdef _KERNEL - -#define LO_ALLMASK (LO_INITIALIZED | LO_WITNESS | LO_QUIET | \ - LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE | \ - LO_DUPOK | LO_CLASSMASK | LO_NOPROFILE) -#define LO_EXPECTED (LO_INITIALIZED | LO_WITNESS | LO_RECURSABLE | \ - LO_SLEEPABLE | LO_UPGRADABLE | LO_DUPOK | \ - /* sx lock class */(2 << LO_CLASSSHIFT)) - -#endif /* defined(_KERNEL) */ - -#endif /* _OPENSOLARIS_SYS_LOCK_H_ */ Index: sys/cddl/compat/opensolaris/sys/misc.h =================================================================== --- sys/cddl/compat/opensolaris/sys/misc.h +++ sys/cddl/compat/opensolaris/sys/misc.h @@ -55,7 +55,6 @@ }; extern char hw_serial[11]; -extern struct opensolaris_utsname utsname; #endif #endif /* _OPENSOLARIS_SYS_MISC_H_ */ Index: sys/cddl/compat/opensolaris/sys/mman.h =================================================================== --- sys/cddl/compat/opensolaris/sys/mman.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (C) 2007 John Birrell - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - * - */ - -#ifndef _COMPAT_OPENSOLARIS_SYS_MMAN_H_ -#define _COMPAT_OPENSOLARIS_SYS_MMAN_H_ - -#include_next - -#define mmap64(_a,_b,_c,_d,_e,_f) mmap(_a,_b,_c,_d,_e,_f) - -#endif Index: sys/cddl/compat/opensolaris/sys/modctl.h =================================================================== --- sys/cddl/compat/opensolaris/sys/modctl.h +++ sys/cddl/compat/opensolaris/sys/modctl.h @@ -31,6 +31,7 @@ #define _COMPAT_OPENSOLARIS_SYS_MODCTL_H #include +#include #include typedef struct linker_file modctl_t; Index: sys/cddl/compat/opensolaris/sys/mount.h =================================================================== --- sys/cddl/compat/opensolaris/sys/mount.h +++ /dev/null @@ -1,41 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_MOUNT_H_ -#define _OPENSOLARIS_SYS_MOUNT_H_ - -#include - -#include_next - -#define MS_FORCE MNT_FORCE -#define MS_REMOUNT MNT_UPDATE - -typedef struct fid fid_t; - -#endif /* !_OPENSOLARIS_SYS_MOUNT_H_ */ Index: sys/cddl/compat/opensolaris/sys/mutex.h =================================================================== --- sys/cddl/compat/opensolaris/sys/mutex.h +++ /dev/null @@ -1,77 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_MUTEX_H_ -#define _OPENSOLARIS_SYS_MUTEX_H_ - -#ifdef _KERNEL - -#include -#include -#include_next -#include -#include - -typedef enum { - MUTEX_DEFAULT = 6 /* kernel default mutex */ -} kmutex_type_t; - -#define MUTEX_HELD(x) (mutex_owned(x)) -#define MUTEX_NOT_HELD(x) (!mutex_owned(x) || KERNEL_PANICKED()) - -typedef struct sx kmutex_t; - -#ifndef OPENSOLARIS_WITNESS -#define MUTEX_FLAGS (SX_DUPOK | SX_NEW | SX_NOWITNESS) -#else -#define MUTEX_FLAGS (SX_DUPOK | SX_NEW) -#endif - -#define mutex_init(lock, desc, type, arg) do { \ - const char *_name; \ - ASSERT((type) == 0 || (type) == MUTEX_DEFAULT); \ - KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \ - LO_EXPECTED, ("lock %s already initialized", #lock)); \ - for (_name = #lock; *_name != '\0'; _name++) { \ - if (*_name >= 'a' && *_name <= 'z') \ - break; \ - } \ - if (*_name == '\0') \ - _name = #lock; \ - sx_init_flags((lock), _name, MUTEX_FLAGS); \ -} while (0) -#define mutex_destroy(lock) sx_destroy(lock) -#define mutex_enter(lock) sx_xlock(lock) -#define mutex_tryenter(lock) sx_try_xlock(lock) -#define mutex_exit(lock) sx_xunlock(lock) -#define mutex_owned(lock) sx_xlocked(lock) -#define mutex_owner(lock) sx_xholder(lock) - -#endif /* _KERNEL */ - -#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ Index: sys/cddl/compat/opensolaris/sys/nvpair.h =================================================================== --- sys/cddl/compat/opensolaris/sys/nvpair.h +++ /dev/null @@ -1,230 +0,0 @@ -/*- - * Copyright (c) 2014 Sandvine Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_NVPAIR_H_ -#define _OPENSOLARIS_SYS_NVPAIR_H_ - -#ifdef _KERNEL - -/* - * Some of the symbols in the Illumos nvpair library conflict with symbols - * provided by nv(9), so we use this preprocessor hack to avoid the conflict. 
- * - * This list was generated by: - * cat nv.h nv_impl.h nvlist_* nvpair_impl.h | \ - * sed -nE 's/^[[:alnum:]_][[:alnum:]_ ]*[[:space:]]+[*]*([[:alnum:]_]+)\(.*$/#define \1 illumos_\1/p' | \ - * sort -u - */ -#define nvlist_add_binary illumos_nvlist_add_binary -#define nvlist_add_bool illumos_nvlist_add_bool -#define nvlist_add_bool_array illumos_nvlist_add_bool_array -#define nvlist_add_descriptor illumos_nvlist_add_descriptor -#define nvlist_add_descriptor_array illumos_nvlist_add_descriptor_array -#define nvlist_add_null illumos_nvlist_add_null -#define nvlist_add_number illumos_nvlist_add_number -#define nvlist_add_number_array illumos_nvlist_add_number_array -#define nvlist_add_nvlist illumos_nvlist_add_nvlist -#define nvlist_add_nvlist_array illumos_nvlist_add_nvlist_array -#define nvlist_add_nvpair illumos_nvlist_add_nvpair -#define nvlist_add_string illumos_nvlist_add_string -#define nvlist_add_string_array illumos_nvlist_add_string_array -#define nvlist_add_stringf illumos_nvlist_add_stringf -#define nvlist_add_stringv illumos_nvlist_add_stringv -#define nvlist_clone illumos_nvlist_clone -#define nvlist_create illumos_nvlist_create -#define nvlist_descriptors illumos_nvlist_descriptors -#define nvlist_destroy illumos_nvlist_destroy -#define nvlist_dump illumos_nvlist_dump -#define nvlist_empty illumos_nvlist_empty -#define nvlist_error illumos_nvlist_error -#define nvlist_exists illumos_nvlist_exists -#define nvlist_exists_binary illumos_nvlist_exists_binary -#define nvlist_exists_bool illumos_nvlist_exists_bool -#define nvlist_exists_bool_array illumos_nvlist_exists_bool_array -#define nvlist_exists_descriptor illumos_nvlist_exists_descriptor -#define nvlist_exists_descriptor_array illumos_nvlist_exists_descriptor_array -#define nvlist_exists_null illumos_nvlist_exists_null -#define nvlist_exists_number illumos_nvlist_exists_number -#define nvlist_exists_number_array illumos_nvlist_exists_number_array -#define nvlist_exists_nvlist illumos_nvlist_exists_nvlist -#define nvlist_exists_nvlist_array illumos_nvlist_exists_nvlist_array -#define nvlist_exists_string illumos_nvlist_exists_string -#define nvlist_exists_string_array illumos_nvlist_exists_string_array -#define nvlist_exists_type illumos_nvlist_exists_type -#define nvlist_fdump illumos_nvlist_fdump -#define nvlist_first_nvpair illumos_nvlist_first_nvpair -#define nvlist_flags illumos_nvlist_flags -#define nvlist_free illumos_nvlist_free -#define nvlist_free_binary illumos_nvlist_free_binary -#define nvlist_free_binary_array illumos_nvlist_free_binary_array -#define nvlist_free_bool illumos_nvlist_free_bool -#define nvlist_free_bool_array illumos_nvlist_free_bool_array -#define nvlist_free_descriptor illumos_nvlist_free_descriptor -#define nvlist_free_descriptor_array illumos_nvlist_free_descriptor_array -#define nvlist_free_null illumos_nvlist_free_null -#define nvlist_free_number illumos_nvlist_free_number -#define nvlist_free_number_array illumos_nvlist_free_number_array -#define nvlist_free_nvlist illumos_nvlist_free_nvlist -#define nvlist_free_nvlist_array illumos_nvlist_free_nvlist_array -#define nvlist_free_nvpair illumos_nvlist_free_nvpair -#define nvlist_free_string illumos_nvlist_free_string -#define nvlist_free_string_array illumos_nvlist_free_string_array -#define nvlist_free_type illumos_nvlist_free_type -#define nvlist_get_array_next illumos_nvlist_get_array_next -#define nvlist_get_binary illumos_nvlist_get_binary -#define nvlist_get_bool illumos_nvlist_get_bool -#define nvlist_get_bool_array 
illumos_nvlist_get_bool_array -#define nvlist_get_descriptor illumos_nvlist_get_descriptor -#define nvlist_get_descriptor_array illumos_nvlist_get_descriptor_array -#define nvlist_get_number illumos_nvlist_get_number -#define nvlist_get_number_array illumos_nvlist_get_number_array -#define nvlist_get_nvlist illumos_nvlist_get_nvlist -#define nvlist_get_nvpair illumos_nvlist_get_nvpair -#define nvlist_get_nvpair_parent illumos_nvlist_get_nvpair_parent -#define nvlist_get_pararr illumos_nvlist_get_pararr -#define nvlist_get_parent illumos_nvlist_get_parent -#define nvlist_get_string illumos_nvlist_get_string -#define nvlist_in_array illumos_nvlist_in_array -#define nvlist_move_binary illumos_nvlist_move_binary -#define nvlist_move_bool_array illumos_nvlist_move_bool_array -#define nvlist_move_descriptor illumos_nvlist_move_descriptor -#define nvlist_move_descriptor_array illumos_nvlist_move_descriptor_array -#define nvlist_move_number_array illumos_nvlist_move_number_array -#define nvlist_move_nvlist illumos_nvlist_move_nvlist -#define nvlist_move_nvlist_array illumos_nvlist_move_nvlist_array -#define nvlist_move_nvpair illumos_nvlist_move_nvpair -#define nvlist_move_string illumos_nvlist_move_string -#define nvlist_move_string_array illumos_nvlist_move_string_array -#define nvlist_ndescriptors illumos_nvlist_ndescriptors -#define nvlist_next illumos_nvlist_next -#define nvlist_next_nvpair illumos_nvlist_next_nvpair -#define nvlist_pack illumos_nvlist_pack -#define nvlist_prev_nvpair illumos_nvlist_prev_nvpair -#define nvlist_recv illumos_nvlist_recv -#define nvlist_remove_nvpair illumos_nvlist_remove_nvpair -#define nvlist_send illumos_nvlist_send -#define nvlist_set_array_next illumos_nvlist_set_array_next -#define nvlist_set_error illumos_nvlist_set_error -#define nvlist_set_flags illumos_nvlist_set_flags -#define nvlist_set_parent illumos_nvlist_set_parent -#define nvlist_size illumos_nvlist_size -#define nvlist_take_binary illumos_nvlist_take_binary -#define nvlist_take_bool illumos_nvlist_take_bool -#define nvlist_take_bool_array illumos_nvlist_take_bool_array -#define nvlist_take_descriptor illumos_nvlist_take_descriptor -#define nvlist_take_descriptor_array illumos_nvlist_take_descriptor_array -#define nvlist_take_number illumos_nvlist_take_number -#define nvlist_take_number_array illumos_nvlist_take_number_array -#define nvlist_take_nvlist illumos_nvlist_take_nvlist -#define nvlist_take_nvlist_array illumos_nvlist_take_nvlist_array -#define nvlist_take_nvpair illumos_nvlist_take_nvpair -#define nvlist_take_string illumos_nvlist_take_string -#define nvlist_take_string_array illumos_nvlist_take_string_array -#define nvlist_unpack illumos_nvlist_unpack -#define nvlist_unpack_header illumos_nvlist_unpack_header -#define nvlist_xfer illumos_nvlist_xfer -#define nvpair_assert illumos_nvpair_assert -#define nvpair_clone illumos_nvpair_clone -#define nvpair_create_binary illumos_nvpair_create_binary -#define nvpair_create_bool illumos_nvpair_create_bool -#define nvpair_create_bool_array illumos_nvpair_create_bool_array -#define nvpair_create_descriptor illumos_nvpair_create_descriptor -#define nvpair_create_descriptor_array illumos_nvpair_create_descriptor_array -#define nvpair_create_null illumos_nvpair_create_null -#define nvpair_create_number illumos_nvpair_create_number -#define nvpair_create_number_array illumos_nvpair_create_number_array -#define nvpair_create_nvlist illumos_nvpair_create_nvlist -#define nvpair_create_nvlist_array illumos_nvpair_create_nvlist_array -#define 
nvpair_create_string illumos_nvpair_create_string -#define nvpair_create_string_array illumos_nvpair_create_string_array -#define nvpair_create_stringf illumos_nvpair_create_stringf -#define nvpair_create_stringv illumos_nvpair_create_stringv -#define nvpair_free illumos_nvpair_free -#define nvpair_free_structure illumos_nvpair_free_structure -#define nvpair_get_binary illumos_nvpair_get_binary -#define nvpair_get_bool illumos_nvpair_get_bool -#define nvpair_get_bool_array illumos_nvpair_get_bool_array -#define nvpair_get_descriptor illumos_nvpair_get_descriptor -#define nvpair_get_descriptor_array illumos_nvpair_get_descriptor_array -#define nvpair_get_number illumos_nvpair_get_number -#define nvpair_get_number_array illumos_nvpair_get_number_array -#define nvpair_get_nvlist illumos_nvpair_get_nvlist -#define nvpair_get_string illumos_nvpair_get_string -#define nvpair_header_size illumos_nvpair_header_size -#define nvpair_init_datasize illumos_nvpair_init_datasize -#define nvpair_insert illumos_nvpair_insert -#define nvpair_move_binary illumos_nvpair_move_binary -#define nvpair_move_bool_array illumos_nvpair_move_bool_array -#define nvpair_move_descriptor illumos_nvpair_move_descriptor -#define nvpair_move_descriptor_array illumos_nvpair_move_descriptor_array -#define nvpair_move_number_array illumos_nvpair_move_number_array -#define nvpair_move_nvlist illumos_nvpair_move_nvlist -#define nvpair_move_nvlist_array illumos_nvpair_move_nvlist_array -#define nvpair_move_string illumos_nvpair_move_string -#define nvpair_move_string_array illumos_nvpair_move_string_array -#define nvpair_name illumos_nvpair_name -#define nvpair_next illumos_nvpair_next -#define nvpair_nvlist illumos_nvpair_nvlist -#define nvpair_pack_binary illumos_nvpair_pack_binary -#define nvpair_pack_bool illumos_nvpair_pack_bool -#define nvpair_pack_bool_array illumos_nvpair_pack_bool_array -#define nvpair_pack_descriptor illumos_nvpair_pack_descriptor -#define nvpair_pack_descriptor_array illumos_nvpair_pack_descriptor_array -#define nvpair_pack_header illumos_nvpair_pack_header -#define nvpair_pack_null illumos_nvpair_pack_null -#define nvpair_pack_number illumos_nvpair_pack_number -#define nvpair_pack_number_array illumos_nvpair_pack_number_array -#define nvpair_pack_nvlist_array_next illumos_nvpair_pack_nvlist_array_next -#define nvpair_pack_nvlist_up illumos_nvpair_pack_nvlist_up -#define nvpair_pack_string illumos_nvpair_pack_string -#define nvpair_pack_string_array illumos_nvpair_pack_string_array -#define nvpair_prev illumos_nvpair_prev -#define nvpair_remove illumos_nvpair_remove -#define nvpair_size illumos_nvpair_size -#define nvpair_type illumos_nvpair_type -#define nvpair_type_string illumos_nvpair_type_string -#define nvpair_unpack illumos_nvpair_unpack -#define nvpair_unpack_binary illumos_nvpair_unpack_binary -#define nvpair_unpack_bool illumos_nvpair_unpack_bool -#define nvpair_unpack_bool_array illumos_nvpair_unpack_bool_array -#define nvpair_unpack_descriptor illumos_nvpair_unpack_descriptor -#define nvpair_unpack_descriptor_array illumos_nvpair_unpack_descriptor_array -#define nvpair_unpack_header illumos_nvpair_unpack_header -#define nvpair_unpack_null illumos_nvpair_unpack_null -#define nvpair_unpack_number illumos_nvpair_unpack_number -#define nvpair_unpack_number_array illumos_nvpair_unpack_number_array -#define nvpair_unpack_nvlist illumos_nvpair_unpack_nvlist -#define nvpair_unpack_nvlist_array illumos_nvpair_unpack_nvlist_array -#define nvpair_unpack_string illumos_nvpair_unpack_string -#define 
nvpair_unpack_string_array illumos_nvpair_unpack_string_array - -#endif /* _KERNEL */ - -#include_next - -#endif Index: sys/cddl/compat/opensolaris/sys/param.h =================================================================== --- sys/cddl/compat/opensolaris/sys/param.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (C) 2007 John Birrell - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - * - */ - -#ifndef _COMPAT_OPENSOLARIS_SYS_PARAM_H_ -#define _COMPAT_OPENSOLARIS_SYS_PARAM_H_ - -#include_next - -#define PAGESIZE PAGE_SIZE - -#ifdef _KERNEL -#define ptob(x) ((uint64_t)(x) << PAGE_SHIFT) -#endif - -#endif Index: sys/cddl/compat/opensolaris/sys/proc.h =================================================================== --- sys/cddl/compat/opensolaris/sys/proc.h +++ /dev/null @@ -1,105 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_PROC_H_ -#define _OPENSOLARIS_SYS_PROC_H_ - -#include -#include -#include_next -#include -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL - -#define CPU curcpu -#define minclsyspri PRIBIO -#define maxclsyspri PVM -#define max_ncpus (mp_maxid + 1) -#define boot_max_ncpus (mp_maxid + 1) -#define syscid 1 - -#define TS_RUN 0 - -#define p0 proc0 - -#define t_did td_tid - -typedef short pri_t; -typedef struct thread _kthread; -typedef struct thread kthread_t; -typedef struct thread *kthread_id_t; -typedef struct proc proc_t; - -extern struct proc *system_proc; - -static __inline kthread_t * -do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, - size_t len, proc_t *pp, int state, pri_t pri) -{ - kthread_t *td = NULL; - proc_t **ppp; - int error; - - /* - * Be sure there are no surprises. - */ - ASSERT(stk == NULL); - ASSERT(len == 0); - ASSERT(state == TS_RUN); - ASSERT(pp != NULL); - - if (pp == &p0) - ppp = &system_proc; - else - ppp = &pp; - error = kproc_kthread_add(proc, arg, ppp, &td, RFSTOPPED, - stksize / PAGE_SIZE, "zfskern", "solthread %p", proc); - if (error == 0) { - thread_lock(td); - sched_prio(td, pri); - sched_add(td, SRQ_BORING); - } - return (td); -} - -#define thread_create(stk, stksize, proc, arg, len, pp, state, pri) \ - do_thread_create(stk, stksize, proc, arg, len, pp, state, pri) -#define thread_exit() kthread_exit() - -int uread(proc_t *, void *, size_t, uintptr_t); -int uwrite(proc_t *, void *, size_t, uintptr_t); - -#endif /* _KERNEL */ - -#endif /* _OPENSOLARIS_SYS_PROC_H_ */ Index: sys/cddl/compat/opensolaris/sys/stat.h =================================================================== --- sys/cddl/compat/opensolaris/sys/stat.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2007 John Birrell - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - * - */ - -#ifndef _COMPAT_OPENSOLARIS_SYS_STAT_H_ -#define _COMPAT_OPENSOLARIS_SYS_STAT_H_ - -#include_next - -#define stat64 stat - -#define MAXOFFSET_T OFF_MAX - -#ifndef _KERNEL -#include - -static __inline int -fstat64(int fd, struct stat *sb) -{ - int ret; - - ret = fstat(fd, sb); - if (ret == 0) { - if (S_ISCHR(sb->st_mode)) - (void)ioctl(fd, DIOCGMEDIASIZE, &sb->st_size); - } - return (ret); -} -#endif - -#endif /* !_COMPAT_OPENSOLARIS_SYS_STAT_H_ */ Index: sys/cddl/compat/opensolaris/sys/systm.h =================================================================== --- sys/cddl/compat/opensolaris/sys/systm.h +++ /dev/null @@ -1,47 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_SYSTM_H_ -#define _OPENSOLARIS_SYS_SYSTM_H_ - -#ifdef _KERNEL - -#include -#include_next - -#include - -#define PAGESIZE PAGE_SIZE -#define PAGEOFFSET (PAGESIZE - 1) -#define PAGEMASK (~PAGEOFFSET) - -#define delay(x) pause("soldelay", (x)) - -#endif /* _KERNEL */ - -#endif /* _OPENSOLARIS_SYS_SYSTM_H_ */ Index: sys/cddl/compat/opensolaris/sys/time.h =================================================================== --- sys/cddl/compat/opensolaris/sys/time.h +++ /dev/null @@ -1,94 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_TIME_H_ -#define _OPENSOLARIS_SYS_TIME_H_ - -#include_next - -#define SEC 1 -#define MILLISEC 1000 -#define MICROSEC 1000000 -#define NANOSEC 1000000000 -#define TIME_MAX LLONG_MAX - -#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) -#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) - -#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) -#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) - -#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) -#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) - -typedef longlong_t hrtime_t; - -#if defined(__i386__) || defined(__powerpc__) -#define TIMESPEC_OVERFLOW(ts) \ - ((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX) -#else -#define TIMESPEC_OVERFLOW(ts) \ - ((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX) -#endif - -#define SEC_TO_TICK(sec) ((sec) * hz) -#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) - -#ifdef _KERNEL -static __inline hrtime_t -gethrtime(void) { - - struct timespec ts; - hrtime_t nsec; - - getnanouptime(&ts); - nsec = (hrtime_t)ts.tv_sec * NANOSEC + ts.tv_nsec; - return (nsec); -} - -#define gethrestime_sec() (time_second) -#define gethrestime(ts) getnanotime(ts) -#define gethrtime_waitfree() gethrtime() - -extern int nsec_per_tick; /* nanoseconds per clock tick */ - -#define ddi_get_lbolt64() \ - (int64_t)(((getsbinuptime() >> 16) * hz) >> 16) -#define ddi_get_lbolt() (clock_t)ddi_get_lbolt64() - -#else - -static __inline hrtime_t gethrtime(void) { - struct timespec ts; - clock_gettime(CLOCK_UPTIME,&ts); - return (((u_int64_t) ts.tv_sec) * NANOSEC + ts.tv_nsec); -} - -#endif /* _KERNEL */ - -#endif /* !_OPENSOLARIS_SYS_TIME_H_ */ Index: sys/cddl/compat/opensolaris/sys/types.h =================================================================== --- sys/cddl/compat/opensolaris/sys/types.h +++ /dev/null @@ -1,101 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_TYPES_H_ -#define _OPENSOLARIS_SYS_TYPES_H_ - -/* - * This is a bag of dirty hacks to keep things compiling. - */ - -#include - -#ifdef _KERNEL -typedef int64_t clock_t; -#define _CLOCK_T_DECLARED -#endif - -#include_next - -#define MAXNAMELEN 256 - -typedef struct timespec timestruc_t; -typedef struct timespec timespec_t; -typedef u_int uint_t; -typedef u_char uchar_t; -typedef u_short ushort_t; -typedef u_long ulong_t; -typedef long long longlong_t; -typedef unsigned long long u_longlong_t; -#ifndef _OFF64_T_DECLARED -#define _OFF64_T_DECLARED -typedef off_t off64_t; -#endif -typedef id_t taskid_t; -typedef id_t projid_t; -typedef id_t poolid_t; -typedef id_t zoneid_t; -typedef id_t ctid_t; -typedef mode_t o_mode_t; -typedef uint64_t pgcnt_t; -typedef u_int minor_t; - -#ifdef _KERNEL - -#define B_FALSE 0 -#define B_TRUE 1 - -typedef short index_t; -typedef off_t offset_t; -#ifndef _PTRDIFF_T_DECLARED -typedef __ptrdiff_t ptrdiff_t; /* pointer difference */ -#define _PTRDIFF_T_DECLARED -#endif -typedef int64_t rlim64_t; -typedef int major_t; - -#else -#ifdef NEED_SOLARIS_BOOLEAN -#if defined(__XOPEN_OR_POSIX) -typedef enum { _B_FALSE, _B_TRUE } boolean_t; -#else -typedef enum { B_FALSE, B_TRUE } boolean_t; -#endif /* defined(__XOPEN_OR_POSIX) */ -#endif - -typedef longlong_t offset_t; -typedef u_longlong_t u_offset_t; -typedef uint64_t upad64_t; -typedef short pri_t; -typedef int32_t daddr32_t; -typedef int32_t time32_t; -typedef u_longlong_t diskaddr_t; - -#endif /* !_KERNEL */ - -#endif /* !_OPENSOLARIS_SYS_TYPES_H_ */ Index: sys/cddl/compat/opensolaris/sys/uio.h =================================================================== --- sys/cddl/compat/opensolaris/sys/uio.h +++ /dev/null @@ -1,89 +0,0 @@ -/*- - * Copyright (c) 2010 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_UIO_H_ -#define _OPENSOLARIS_SYS_UIO_H_ - -#include_next -#include - -#ifndef _KERNEL -#define FOF_OFFSET 1 /* Use the offset in uio argument */ - -struct uio { - struct iovec *uio_iov; - int uio_iovcnt; - off_t uio_offset; - int uio_resid; - enum uio_seg uio_segflg; - enum uio_rw uio_rw; - void *uio_td; -}; -#endif - -#define uio_loffset uio_offset - -typedef struct uio uio_t; -typedef struct iovec iovec_t; - -typedef enum xuio_type { - UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY -} xuio_type_t; - -typedef struct xuio { - uio_t xu_uio; - - /* Extended uio fields */ - enum xuio_type xu_type; /* What kind of uio structure? */ - union { - struct { - int xu_zc_rw; - void *xu_zc_priv; - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - -#ifdef BUILDING_ZFS -static __inline int -zfs_uiomove(void *cp, size_t n, enum uio_rw dir, uio_t *uio) -{ - - ASSERT(uio->uio_rw == dir); - return (uiomove(cp, (int)n, uio)); -} -#define uiomove(cp, n, dir, uio) zfs_uiomove((cp), (n), (dir), (uio)) - -int uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes); -void uioskip(uio_t *uiop, size_t n); -#endif /* BUILDING_ZFS */ - -#endif /* !_OPENSOLARIS_SYS_UIO_H_ */ Index: sys/cddl/compat/opensolaris/sys/vnode.h =================================================================== --- sys/cddl/compat/opensolaris/sys/vnode.h +++ /dev/null @@ -1,287 +0,0 @@ -/*- - * Copyright (c) 2007 Pawel Jakub Dawidek - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD$ - */ - -#ifndef _OPENSOLARIS_SYS_VNODE_H_ -#define _OPENSOLARIS_SYS_VNODE_H_ - -#ifdef _KERNEL - -struct vnode; -struct vattr; - -typedef struct vnode vnode_t; -typedef struct vattr vattr_t; -typedef enum vtype vtype_t; - -#include -enum symfollow { NO_FOLLOW = NOFOLLOW }; - -#include -#include_next -#include -#include -#include -#include -#include -#include - -typedef struct vop_vector vnodeops_t; -#define VOP_FID VOP_VPTOFH -#define vop_fid vop_vptofh -#define vop_fid_args vop_vptofh_args -#define a_fid a_fhp - -#define IS_XATTRDIR(dvp) (0) - -#define v_count v_usecount - -#define V_APPEND VAPPEND - -#define rootvfs (rootvnode == NULL ? NULL : rootvnode->v_mount) - -static __inline int -vn_is_readonly(vnode_t *vp) -{ - return (vp->v_mount->mnt_flag & MNT_RDONLY); -} -#define vn_vfswlock(vp) (0) -#define vn_vfsunlock(vp) do { } while (0) -#define vn_ismntpt(vp) ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) -#define vn_mountedvfs(vp) ((vp)->v_mountedhere) -#define vn_has_cached_data(vp) \ - ((vp)->v_object != NULL && \ - (vp)->v_object->resident_page_count > 0) -#define vn_exists(vp) do { } while (0) -#define vn_invalid(vp) do { } while (0) -#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) -#define vn_free(vp) do { } while (0) -#define vn_matchops(vp, vops) ((vp)->v_op == &(vops)) - -#define VN_HOLD(v) vref(v) -#define VN_RELE(v) vrele(v) -#define VN_URELE(v) vput(v) - -#define vnevent_create(vp, ct) do { } while (0) -#define vnevent_link(vp, ct) do { } while (0) -#define vnevent_remove(vp, dvp, name, ct) do { } while (0) -#define vnevent_rmdir(vp, dvp, name, ct) do { } while (0) -#define vnevent_rename_src(vp, dvp, name, ct) do { } while (0) -#define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0) -#define vnevent_rename_dest_dir(vp, ct) do { } while (0) - -#define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) -#define MANDMODE(mode) (0) -#define MANDLOCK(vp, mode) (0) -#define chklock(vp, op, offset, size, mode, ct) (0) -#define cleanlocks(vp, pid, foo) do { } while (0) -#define cleanshares(vp, pid) do { } while (0) - -/* - * We will use va_spare is place of Solaris' va_mask. - * This field is initialized in zfs_setattr(). - */ -#define va_mask va_spare -/* TODO: va_fileid is shorter than va_nodeid !!! */ -#define va_nodeid va_fileid -/* TODO: This field needs conversion! 
*/ -#define va_nblocks va_bytes -#define va_blksize va_blocksize -#define va_seq va_gen - -#define MAXOFFSET_T OFF_MAX -#define EXCL 0 - -#define ACCESSED (AT_ATIME) -#define STATE_CHANGED (AT_CTIME) -#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) - -static __inline void -vattr_init_mask(vattr_t *vap) -{ - - vap->va_mask = 0; - - if (vap->va_type != VNON) - vap->va_mask |= AT_TYPE; - if (vap->va_uid != (uid_t)VNOVAL) - vap->va_mask |= AT_UID; - if (vap->va_gid != (gid_t)VNOVAL) - vap->va_mask |= AT_GID; - if (vap->va_size != (u_quad_t)VNOVAL) - vap->va_mask |= AT_SIZE; - if (vap->va_atime.tv_sec != VNOVAL) - vap->va_mask |= AT_ATIME; - if (vap->va_mtime.tv_sec != VNOVAL) - vap->va_mask |= AT_MTIME; - if (vap->va_mode != (u_short)VNOVAL) - vap->va_mask |= AT_MODE; - if (vap->va_flags != VNOVAL) - vap->va_mask |= AT_XVATTR; -} - -#define FCREAT O_CREAT -#define FTRUNC O_TRUNC -#define FEXCL O_EXCL -#define FDSYNC FFSYNC -#define FRSYNC FFSYNC -#define FSYNC FFSYNC -#define FOFFMAX 0x00 -#define FIGNORECASE 0x00 - -static __inline int -vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode, - vnode_t **vpp, enum create crwhy, mode_t umask, struct vnode *startvp, - int fd) -{ - struct thread *td = curthread; - struct nameidata nd; - int error, operation; - - ASSERT(seg == UIO_SYSSPACE); - if ((filemode & FCREAT) != 0) { - ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX)); - ASSERT(crwhy == CRCREAT); - operation = CREATE; - } else { - ASSERT(filemode == (FREAD | FOFFMAX) || - filemode == (FREAD | FWRITE | FOFFMAX)); - ASSERT(crwhy == 0); - operation = LOOKUP; - } - ASSERT(umask == 0); - - pwd_ensure_dirs(); - - if (startvp != NULL) - vref(startvp); - NDINIT_ATVP(&nd, operation, 0, UIO_SYSSPACE, pnamep, startvp, td); - filemode |= O_NOFOLLOW; - error = vn_open_cred(&nd, &filemode, createmode, 0, td->td_ucred, NULL); - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error == 0) { - /* We just unlock so we hold a reference. 
*/ - VOP_UNLOCK(nd.ni_vp); - *vpp = nd.ni_vp; - } - return (error); -} - -static __inline int -zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, - vnode_t **vpp, enum create crwhy, mode_t umask) -{ - - return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy, - umask, NULL, -1)); -} -#define vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask) \ - zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask)) - -#define RLIM64_INFINITY 0 -static __inline int -zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len, - offset_t offset, enum uio_seg seg, int ioflag, int ulimit, cred_t *cr, - ssize_t *residp) -{ - struct thread *td = curthread; - int error; - ssize_t resid; - - ASSERT(ioflag == 0); - ASSERT(ulimit == RLIM64_INFINITY); - - if (rw == UIO_WRITE) { - ioflag = IO_SYNC; - } else { - ioflag = IO_DIRECT; - } - error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED, - &resid, td); - if (residp != NULL) - *residp = (ssize_t)resid; - return (error); -} -#define vn_rdwr(rw, vp, base, len, offset, seg, ioflag, ulimit, cr, residp) \ - zfs_vn_rdwr((rw), (vp), (base), (len), (offset), (seg), (ioflag), (ulimit), (cr), (residp)) - -static __inline int -zfs_vop_fsync(vnode_t *vp, int flag, cred_t *cr) -{ - struct mount *mp; - int error; - - ASSERT(flag == FSYNC); - - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) - goto drop; - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_FSYNC(vp, MNT_WAIT, curthread); - VOP_UNLOCK(vp); - vn_finished_write(mp); -drop: - return (error); -} -#define VOP_FSYNC(vp, flag, cr, ct) zfs_vop_fsync((vp), (flag), (cr)) - -static __inline int -zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) -{ - int error; - - ASSERT(count == 1); - ASSERT(offset == 0); - - error = vn_close(vp, flag, cr, curthread); - return (error); -} -#define VOP_CLOSE(vp, oflags, count, offset, cr, ct) \ - zfs_vop_close((vp), (oflags), (count), (offset), (cr)) - -static __inline int -vn_rename(char *from, char *to, enum uio_seg seg) -{ - - ASSERT(seg == UIO_SYSSPACE); - - return (kern_renameat(curthread, AT_FDCWD, from, AT_FDCWD, to, seg)); -} - -static __inline int -vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) -{ - - ASSERT(seg == UIO_SYSSPACE); - ASSERT(dirflag == RMFILE); - - return (kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, - 0)); -} - -#endif /* _KERNEL */ - -#endif /* _OPENSOLARIS_SYS_VNODE_H_ */ Index: sys/cddl/contrib/opensolaris/common/acl/acl_common.h =================================================================== --- sys/cddl/contrib/opensolaris/common/acl/acl_common.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - */ - -#ifndef _ACL_COMMON_H -#define _ACL_COMMON_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct trivial_acl { - uint32_t allow0; /* allow mask for bits only in owner */ - uint32_t deny1; /* deny mask for bits not in owner */ - uint32_t deny2; /* deny mask for bits not in group */ - uint32_t owner; /* allow mask matching mode */ - uint32_t group; /* allow mask matching mode */ - uint32_t everyone; /* allow mask matching mode */ -} trivial_acl_t; - -extern int acltrivial(const char *); -extern void adjust_ace_pair(ace_t *pair, mode_t mode); -extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t); -extern int ace_trivial(ace_t *acep, int aclcnt); -extern int ace_trivial_common(void *, int, - uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *, - uint32_t *mask)); -#if !defined(_KERNEL) -extern acl_t *acl_alloc(acl_type_t); -extern void acl_free(acl_t *aclp); -extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, - uid_t owner, gid_t group); -#endif /* !_KERNEL */ -void ksort(caddr_t v, int n, int s, int (*f)()); -int cmp2acls(void *a, void *b); -int acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count); -void acl_trivial_access_masks(mode_t mode, boolean_t isdir, - trivial_acl_t *masks); - -#ifdef __cplusplus -} -#endif - -#endif /* _ACL_COMMON_H */ Index: sys/cddl/contrib/opensolaris/common/acl/acl_common.c =================================================================== --- sys/cddl/contrib/opensolaris/common/acl/acl_common.c +++ /dev/null @@ -1,1765 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
- */ - -#include -#include -#include -#include -#if defined(_KERNEL) -#include -#include -#include -#include -#include -#else -#include -#include -#include -#include -#include -#include -#include -#include -#include -#define ASSERT assert -#endif - -#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \ - ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \ - ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL) - - -#define ACL_SYNCHRONIZE_SET_DENY 0x0000001 -#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002 -#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004 -#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008 - -#define ACL_WRITE_OWNER_SET_DENY 0x0000010 -#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020 -#define ACL_WRITE_OWNER_ERR_DENY 0x0000040 -#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080 - -#define ACL_DELETE_SET_DENY 0x0000100 -#define ACL_DELETE_SET_ALLOW 0x0000200 -#define ACL_DELETE_ERR_DENY 0x0000400 -#define ACL_DELETE_ERR_ALLOW 0x0000800 - -#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000 -#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000 -#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000 -#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000 - -#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000 -#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000 -#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000 -#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000 - -#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000 -#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000 -#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000 -#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000 - -#define ACL_READ_NAMED_READER_SET_DENY 0x1000000 -#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000 -#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000 -#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000 - - -#define ACE_VALID_MASK_BITS (\ - ACE_READ_DATA | \ - ACE_LIST_DIRECTORY | \ - ACE_WRITE_DATA | \ - ACE_ADD_FILE | \ - ACE_APPEND_DATA | \ - ACE_ADD_SUBDIRECTORY | \ - ACE_READ_NAMED_ATTRS | \ - ACE_WRITE_NAMED_ATTRS | \ - ACE_EXECUTE | \ - ACE_DELETE_CHILD | \ - ACE_READ_ATTRIBUTES | \ - ACE_WRITE_ATTRIBUTES | \ - ACE_DELETE | \ - ACE_READ_ACL | \ - ACE_WRITE_ACL | \ - ACE_WRITE_OWNER | \ - ACE_SYNCHRONIZE) - -#define ACE_MASK_UNDEFINED 0x80000000 - -#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \ - ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \ - ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \ - ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE) - -/* - * ACL conversion helpers - */ - -typedef enum { - ace_unused, - ace_user_obj, - ace_user, - ace_group, /* includes GROUP and GROUP_OBJ */ - ace_other_obj -} ace_to_aent_state_t; - -typedef struct acevals { - uid_t key; - avl_node_t avl; - uint32_t mask; - uint32_t allowed; - uint32_t denied; - int aent_type; -} acevals_t; - -typedef struct ace_list { - acevals_t user_obj; - avl_tree_t user; - int numusers; - acevals_t group_obj; - avl_tree_t group; - int numgroups; - acevals_t other_obj; - uint32_t acl_mask; - int hasmask; - int dfacl_flag; - ace_to_aent_state_t state; - int seen; /* bitmask of all aclent_t a_type values seen */ -} ace_list_t; - -/* - * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. 
- * v = Ptr to array/vector of objs - * n = # objs in the array - * s = size of each obj (must be multiples of a word size) - * f = ptr to function to compare two objs - * returns (-1 = less than, 0 = equal, 1 = greater than - */ -void -ksort(caddr_t v, int n, int s, int (*f)()) -{ - int g, i, j, ii; - unsigned int *p1, *p2; - unsigned int tmp; - - /* No work to do */ - if (v == NULL || n <= 1) - return; - - /* Sanity check on arguments */ - ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0); - ASSERT(s > 0); - for (g = n / 2; g > 0; g /= 2) { - for (i = g; i < n; i++) { - for (j = i - g; j >= 0 && - (*f)(v + j * s, v + (j + g) * s) == 1; - j -= g) { - p1 = (void *)(v + j * s); - p2 = (void *)(v + (j + g) * s); - for (ii = 0; ii < s / 4; ii++) { - tmp = *p1; - *p1++ = *p2; - *p2++ = tmp; - } - } - } - } -} - -/* - * Compare two acls, all fields. Returns: - * -1 (less than) - * 0 (equal) - * +1 (greater than) - */ -int -cmp2acls(void *a, void *b) -{ - aclent_t *x = (aclent_t *)a; - aclent_t *y = (aclent_t *)b; - - /* Compare types */ - if (x->a_type < y->a_type) - return (-1); - if (x->a_type > y->a_type) - return (1); - /* Equal types; compare id's */ - if (x->a_id < y->a_id) - return (-1); - if (x->a_id > y->a_id) - return (1); - /* Equal ids; compare perms */ - if (x->a_perm < y->a_perm) - return (-1); - if (x->a_perm > y->a_perm) - return (1); - /* Totally equal */ - return (0); -} - -/*ARGSUSED*/ -static void * -cacl_realloc(void *ptr, size_t size, size_t new_size) -{ -#if defined(_KERNEL) - void *tmp; - - tmp = kmem_alloc(new_size, KM_SLEEP); - (void) memcpy(tmp, ptr, (size < new_size) ? size : new_size); - kmem_free(ptr, size); - return (tmp); -#else - return (realloc(ptr, new_size)); -#endif -} - -static int -cacl_malloc(void **ptr, size_t size) -{ -#if defined(_KERNEL) - *ptr = kmem_zalloc(size, KM_SLEEP); - return (0); -#else - *ptr = calloc(1, size); - if (*ptr == NULL) - return (errno); - - return (0); -#endif -} - -/*ARGSUSED*/ -static void -cacl_free(void *ptr, size_t size) -{ -#if defined(_KERNEL) - kmem_free(ptr, size); -#else - free(ptr); -#endif -} - -#if !defined(_KERNEL) -acl_t * -acl_alloc(enum acl_type type) -{ - acl_t *aclp; - - if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0) - return (NULL); - - aclp->acl_aclp = NULL; - aclp->acl_cnt = 0; - - switch (type) { - case ACE_T: - aclp->acl_type = ACE_T; - aclp->acl_entry_size = sizeof (ace_t); - break; - case ACLENT_T: - aclp->acl_type = ACLENT_T; - aclp->acl_entry_size = sizeof (aclent_t); - break; - default: - acl_free(aclp); - aclp = NULL; - } - return (aclp); -} - -/* - * Free acl_t structure - */ -void -acl_free(acl_t *aclp) -{ - int acl_size; - - if (aclp == NULL) - return; - - if (aclp->acl_aclp) { - acl_size = aclp->acl_cnt * aclp->acl_entry_size; - cacl_free(aclp->acl_aclp, acl_size); - } - - cacl_free(aclp, sizeof (acl_t)); -} - -static uint32_t -access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow) -{ - uint32_t access_mask = 0; - int acl_produce; - int synchronize_set = 0, write_owner_set = 0; - int delete_set = 0, write_attrs_set = 0; - int read_named_set = 0, write_named_set = 0; - - acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW | - ACL_WRITE_ATTRS_OWNER_SET_ALLOW | - ACL_WRITE_ATTRS_WRITER_SET_DENY); - - if (isallow) { - synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW; - write_owner_set = ACL_WRITE_OWNER_SET_ALLOW; - delete_set = ACL_DELETE_SET_ALLOW; - if (hasreadperm) - read_named_set = ACL_READ_NAMED_READER_SET_ALLOW; - if (haswriteperm) - write_named_set = 
ACL_WRITE_NAMED_WRITER_SET_ALLOW; - if (isowner) - write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; - else if (haswriteperm) - write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; - } else { - - synchronize_set = ACL_SYNCHRONIZE_SET_DENY; - write_owner_set = ACL_WRITE_OWNER_SET_DENY; - delete_set = ACL_DELETE_SET_DENY; - if (hasreadperm) - read_named_set = ACL_READ_NAMED_READER_SET_DENY; - if (haswriteperm) - write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY; - if (isowner) - write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY; - else if (haswriteperm) - write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY; - else - /* - * If the entity is not the owner and does not - * have write permissions ACE_WRITE_ATTRIBUTES will - * always go in the DENY ACE. - */ - access_mask |= ACE_WRITE_ATTRIBUTES; - } - - if (acl_produce & synchronize_set) - access_mask |= ACE_SYNCHRONIZE; - if (acl_produce & write_owner_set) - access_mask |= ACE_WRITE_OWNER; - if (acl_produce & delete_set) - access_mask |= ACE_DELETE; - if (acl_produce & write_attrs_set) - access_mask |= ACE_WRITE_ATTRIBUTES; - if (acl_produce & read_named_set) - access_mask |= ACE_READ_NAMED_ATTRS; - if (acl_produce & write_named_set) - access_mask |= ACE_WRITE_NAMED_ATTRS; - - return (access_mask); -} - -/* - * Given an mode_t, convert it into an access_mask as used - * by nfsace, assuming aclent_t -> nfsace semantics. - */ -static uint32_t -mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow) -{ - uint32_t access = 0; - int haswriteperm = 0; - int hasreadperm = 0; - - if (isallow) { - haswriteperm = (mode & S_IWOTH); - hasreadperm = (mode & S_IROTH); - } else { - haswriteperm = !(mode & S_IWOTH); - hasreadperm = !(mode & S_IROTH); - } - - /* - * The following call takes care of correctly setting the following - * mask bits in the access_mask: - * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE, - * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS - */ - access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow); - - if (isallow) { - access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES; - if (isowner) - access |= ACE_WRITE_ACL; - } else { - if (! isowner) - access |= ACE_WRITE_ACL; - } - - /* read */ - if (mode & S_IROTH) { - access |= ACE_READ_DATA; - } - /* write */ - if (mode & S_IWOTH) { - access |= ACE_WRITE_DATA | - ACE_APPEND_DATA; - if (isdir) - access |= ACE_DELETE_CHILD; - } - /* exec */ - if (mode & S_IXOTH) { - access |= ACE_EXECUTE; - } - - return (access); -} - -/* - * Given an nfsace (presumably an ALLOW entry), make a - * corresponding DENY entry at the address given. - */ -static void -ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner) -{ - (void) memcpy(deny, allow, sizeof (ace_t)); - - deny->a_who = allow->a_who; - - deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE; - deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS; - if (isdir) - deny->a_access_mask ^= ACE_DELETE_CHILD; - - deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER | - ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS | - ACE_WRITE_NAMED_ATTRS); - deny->a_access_mask |= access_mask_set((allow->a_access_mask & - ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner, - B_FALSE); -} -/* - * Make an initial pass over an array of aclent_t's. Gather - * information such as an ACL_MASK (if any), number of users, - * number of groups, and whether the array needs to be sorted. 
- */ -static int -ln_aent_preprocess(aclent_t *aclent, int n, - int *hasmask, mode_t *mask, - int *numuser, int *numgroup, int *needsort) -{ - int error = 0; - int i; - int curtype = 0; - - *hasmask = 0; - *mask = 07; - *needsort = 0; - *numuser = 0; - *numgroup = 0; - - for (i = 0; i < n; i++) { - if (aclent[i].a_type < curtype) - *needsort = 1; - else if (aclent[i].a_type > curtype) - curtype = aclent[i].a_type; - if (aclent[i].a_type & USER) - (*numuser)++; - if (aclent[i].a_type & (GROUP | GROUP_OBJ)) - (*numgroup)++; - if (aclent[i].a_type & CLASS_OBJ) { - if (*hasmask) { - error = EINVAL; - goto out; - } else { - *hasmask = 1; - *mask = aclent[i].a_perm; - } - } - } - - if ((! *hasmask) && (*numuser + *numgroup > 1)) { - error = EINVAL; - goto out; - } - -out: - return (error); -} - -/* - * Convert an array of aclent_t into an array of nfsace entries, - * following POSIX draft -> nfsv4 conversion semantics as outlined in - * the IETF draft. - */ -static int -ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir) -{ - int error = 0; - mode_t mask; - int numuser, numgroup, needsort; - int resultsize = 0; - int i, groupi = 0, skip; - ace_t *acep, *result = NULL; - int hasmask; - - error = ln_aent_preprocess(aclent, n, &hasmask, &mask, - &numuser, &numgroup, &needsort); - if (error != 0) - goto out; - - /* allow + deny for each aclent */ - resultsize = n * 2; - if (hasmask) { - /* - * stick extra deny on the group_obj and on each - * user|group for the mask (the group_obj was added - * into the count for numgroup) - */ - resultsize += numuser + numgroup; - /* ... and don't count the mask itself */ - resultsize -= 2; - } - - /* sort the source if necessary */ - if (needsort) - ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls); - - if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0) - goto out; - - acep = result; - - for (i = 0; i < n; i++) { - /* - * don't process CLASS_OBJ (mask); mask was grabbed in - * ln_aent_preprocess() - */ - if (aclent[i].a_type & CLASS_OBJ) - continue; - - /* If we need an ACL_MASK emulator, prepend it now */ - if ((hasmask) && - (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) { - acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE; - acep->a_flags = 0; - if (aclent[i].a_type & GROUP_OBJ) { - acep->a_who = (uid_t)-1; - acep->a_flags |= - (ACE_IDENTIFIER_GROUP|ACE_GROUP); - } else if (aclent[i].a_type & USER) { - acep->a_who = aclent[i].a_id; - } else { - acep->a_who = aclent[i].a_id; - acep->a_flags |= ACE_IDENTIFIER_GROUP; - } - if (aclent[i].a_type & ACL_DEFAULT) { - acep->a_flags |= ACE_INHERIT_ONLY_ACE | - ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE; - } - /* - * Set the access mask for the prepended deny - * ace. To do this, we invert the mask (found - * in ln_aent_preprocess()) then convert it to an - * DENY ace access_mask. - */ - acep->a_access_mask = mode_to_ace_access((mask ^ 07), - isdir, 0, 0); - acep += 1; - } - - /* handle a_perm -> access_mask */ - acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm, - isdir, aclent[i].a_type & USER_OBJ, 1); - - /* emulate a default aclent */ - if (aclent[i].a_type & ACL_DEFAULT) { - acep->a_flags |= ACE_INHERIT_ONLY_ACE | - ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE; - } - - /* - * handle a_perm and a_id - * - * this must be done last, since it involves the - * corresponding deny aces, which are handled - * differently for each different a_type. 
- */ - if (aclent[i].a_type & USER_OBJ) { - acep->a_who = (uid_t)-1; - acep->a_flags |= ACE_OWNER; - ace_make_deny(acep, acep + 1, isdir, B_TRUE); - acep += 2; - } else if (aclent[i].a_type & USER) { - acep->a_who = aclent[i].a_id; - ace_make_deny(acep, acep + 1, isdir, B_FALSE); - acep += 2; - } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) { - if (aclent[i].a_type & GROUP_OBJ) { - acep->a_who = (uid_t)-1; - acep->a_flags |= ACE_GROUP; - } else { - acep->a_who = aclent[i].a_id; - } - acep->a_flags |= ACE_IDENTIFIER_GROUP; - /* - * Set the corresponding deny for the group ace. - * - * The deny aces go after all of the groups, unlike - * everything else, where they immediately follow - * the allow ace. - * - * We calculate "skip", the number of slots to - * skip ahead for the deny ace, here. - * - * The pattern is: - * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3 - * thus, skip is - * (2 * numgroup) - 1 - groupi - * (2 * numgroup) to account for MD + A - * - 1 to account for the fact that we're on the - * access (A), not the mask (MD) - * - groupi to account for the fact that we have - * passed up groupi number of MD's. - */ - skip = (2 * numgroup) - 1 - groupi; - ace_make_deny(acep, acep + skip, isdir, B_FALSE); - /* - * If we just did the last group, skip acep past - * all of the denies; else, just move ahead one. - */ - if (++groupi >= numgroup) - acep += numgroup + 1; - else - acep += 1; - } else if (aclent[i].a_type & OTHER_OBJ) { - acep->a_who = (uid_t)-1; - acep->a_flags |= ACE_EVERYONE; - ace_make_deny(acep, acep + 1, isdir, B_FALSE); - acep += 2; - } else { - error = EINVAL; - goto out; - } - } - - *acepp = result; - *rescount = resultsize; - -out: - if (error != 0) { - if ((result != NULL) && (resultsize > 0)) { - cacl_free(result, resultsize * sizeof (ace_t)); - } - } - - return (error); -} - -static int -convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir, - ace_t **retacep, int *retacecnt) -{ - ace_t *acep; - ace_t *dfacep; - int acecnt = 0; - int dfacecnt = 0; - int dfaclstart = 0; - int dfaclcnt = 0; - aclent_t *aclp; - int i; - int error; - int acesz, dfacesz; - - ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls); - - for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) { - if (aclp->a_type & ACL_DEFAULT) - break; - } - - if (i < aclcnt) { - dfaclstart = i; - dfaclcnt = aclcnt - i; - } - - if (dfaclcnt && !isdir) { - return (EINVAL); - } - - error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir); - if (error) - return (error); - - if (dfaclcnt) { - error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt, - &dfacep, &dfacecnt, isdir); - if (error) { - if (acep) { - cacl_free(acep, acecnt * sizeof (ace_t)); - } - return (error); - } - } - - if (dfacecnt != 0) { - acesz = sizeof (ace_t) * acecnt; - dfacesz = sizeof (ace_t) * dfacecnt; - acep = cacl_realloc(acep, acesz, acesz + dfacesz); - if (acep == NULL) - return (ENOMEM); - if (dfaclcnt) { - (void) memcpy(acep + acecnt, dfacep, dfacesz); - } - } - if (dfaclcnt) - cacl_free(dfacep, dfacecnt * sizeof (ace_t)); - - *retacecnt = acecnt + dfacecnt; - *retacep = acep; - return (0); -} - -static int -ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) -{ - int error = 0; - o_mode_t mode = 0; - uint32_t bits, wantbits; - - /* read */ - if (mask & ACE_READ_DATA) - mode |= S_IROTH; - - /* write */ - wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA); - if (isdir) - wantbits |= ACE_DELETE_CHILD; - bits = mask & wantbits; - if (bits != 0) { - if (bits != wantbits) { - error = ENOTSUP; - goto out; - } - mode |= 
S_IWOTH; - } - - /* exec */ - if (mask & ACE_EXECUTE) { - mode |= S_IXOTH; - } - - *modep = mode; - -out: - return (error); -} - -static void -acevals_init(acevals_t *vals, uid_t key) -{ - bzero(vals, sizeof (*vals)); - vals->allowed = ACE_MASK_UNDEFINED; - vals->denied = ACE_MASK_UNDEFINED; - vals->mask = ACE_MASK_UNDEFINED; - vals->key = key; -} - -static void -ace_list_init(ace_list_t *al, int dfacl_flag) -{ - acevals_init(&al->user_obj, 0); - acevals_init(&al->group_obj, 0); - acevals_init(&al->other_obj, 0); - al->numusers = 0; - al->numgroups = 0; - al->acl_mask = 0; - al->hasmask = 0; - al->state = ace_unused; - al->seen = 0; - al->dfacl_flag = dfacl_flag; -} - -/* - * Find or create an acevals holder for a given id and avl tree. - * - * Note that only one thread will ever touch these avl trees, so - * there is no need for locking. - */ -static acevals_t * -acevals_find(ace_t *ace, avl_tree_t *avl, int *num) -{ - acevals_t key, *rc; - avl_index_t where; - - key.key = ace->a_who; - rc = avl_find(avl, &key, &where); - if (rc != NULL) - return (rc); - - /* this memory is freed by ln_ace_to_aent()->ace_list_free() */ - if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0) - return (NULL); - - acevals_init(rc, ace->a_who); - avl_insert(avl, rc, where); - (*num)++; - - return (rc); -} - -static int -access_mask_check(ace_t *acep, int mask_bit, int isowner) -{ - int set_deny, err_deny; - int set_allow, err_allow; - int acl_consume; - int haswriteperm, hasreadperm; - - if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { - haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1; - hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1; - } else { - haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0; - hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 
1 : 0; - } - - acl_consume = (ACL_SYNCHRONIZE_ERR_DENY | - ACL_DELETE_ERR_DENY | - ACL_WRITE_OWNER_ERR_DENY | - ACL_WRITE_OWNER_ERR_ALLOW | - ACL_WRITE_ATTRS_OWNER_SET_ALLOW | - ACL_WRITE_ATTRS_OWNER_ERR_DENY | - ACL_WRITE_ATTRS_WRITER_SET_DENY | - ACL_WRITE_ATTRS_WRITER_ERR_ALLOW | - ACL_WRITE_NAMED_WRITER_ERR_DENY | - ACL_READ_NAMED_READER_ERR_DENY); - - if (mask_bit == ACE_SYNCHRONIZE) { - set_deny = ACL_SYNCHRONIZE_SET_DENY; - err_deny = ACL_SYNCHRONIZE_ERR_DENY; - set_allow = ACL_SYNCHRONIZE_SET_ALLOW; - err_allow = ACL_SYNCHRONIZE_ERR_ALLOW; - } else if (mask_bit == ACE_WRITE_OWNER) { - set_deny = ACL_WRITE_OWNER_SET_DENY; - err_deny = ACL_WRITE_OWNER_ERR_DENY; - set_allow = ACL_WRITE_OWNER_SET_ALLOW; - err_allow = ACL_WRITE_OWNER_ERR_ALLOW; - } else if (mask_bit == ACE_DELETE) { - set_deny = ACL_DELETE_SET_DENY; - err_deny = ACL_DELETE_ERR_DENY; - set_allow = ACL_DELETE_SET_ALLOW; - err_allow = ACL_DELETE_ERR_ALLOW; - } else if (mask_bit == ACE_WRITE_ATTRIBUTES) { - if (isowner) { - set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY; - err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY; - set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; - err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW; - } else if (haswriteperm) { - set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY; - err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY; - set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; - err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW; - } else { - if ((acep->a_access_mask & mask_bit) && - (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) { - return (ENOTSUP); - } - return (0); - } - } else if (mask_bit == ACE_READ_NAMED_ATTRS) { - if (!hasreadperm) - return (0); - - set_deny = ACL_READ_NAMED_READER_SET_DENY; - err_deny = ACL_READ_NAMED_READER_ERR_DENY; - set_allow = ACL_READ_NAMED_READER_SET_ALLOW; - err_allow = ACL_READ_NAMED_READER_ERR_ALLOW; - } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) { - if (!haswriteperm) - return (0); - - set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY; - err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY; - set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW; - err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW; - } else { - return (EINVAL); - } - - if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { - if (acl_consume & set_deny) { - if (!(acep->a_access_mask & mask_bit)) { - return (ENOTSUP); - } - } else if (acl_consume & err_deny) { - if (acep->a_access_mask & mask_bit) { - return (ENOTSUP); - } - } - } else { - /* ACE_ACCESS_ALLOWED_ACE_TYPE */ - if (acl_consume & set_allow) { - if (!(acep->a_access_mask & mask_bit)) { - return (ENOTSUP); - } - } else if (acl_consume & err_allow) { - if (acep->a_access_mask & mask_bit) { - return (ENOTSUP); - } - } - } - return (0); -} - -static int -ace_to_aent_legal(ace_t *acep) -{ - int error = 0; - int isowner; - - /* only ALLOW or DENY */ - if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) && - (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) { - error = ENOTSUP; - goto out; - } - - /* check for invalid flags */ - if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) { - error = EINVAL; - goto out; - } - - /* some flags are illegal */ - if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG | - ACE_FAILED_ACCESS_ACE_FLAG | - ACE_NO_PROPAGATE_INHERIT_ACE)) { - error = ENOTSUP; - goto out; - } - - /* check for invalid masks */ - if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) { - error = EINVAL; - goto out; - } - - if ((acep->a_flags & ACE_OWNER)) { - isowner = 1; - } else { - isowner = 0; - } - - error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner); - if (error) - goto out; - - error = access_mask_check(acep, 
ACE_WRITE_OWNER, isowner); - if (error) - goto out; - - error = access_mask_check(acep, ACE_DELETE, isowner); - if (error) - goto out; - - error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner); - if (error) - goto out; - - error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner); - if (error) - goto out; - - error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner); - if (error) - goto out; - - /* more detailed checking of masks */ - if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { - if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) { - error = ENOTSUP; - goto out; - } - if ((acep->a_access_mask & ACE_WRITE_DATA) && - (! (acep->a_access_mask & ACE_APPEND_DATA))) { - error = ENOTSUP; - goto out; - } - if ((! (acep->a_access_mask & ACE_WRITE_DATA)) && - (acep->a_access_mask & ACE_APPEND_DATA)) { - error = ENOTSUP; - goto out; - } - } - - /* ACL enforcement */ - if ((acep->a_access_mask & ACE_READ_ACL) && - (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) { - error = ENOTSUP; - goto out; - } - if (acep->a_access_mask & ACE_WRITE_ACL) { - if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) && - (isowner)) { - error = ENOTSUP; - goto out; - } - if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) && - (! isowner)) { - error = ENOTSUP; - goto out; - } - } - -out: - return (error); -} - -static int -ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) -{ - /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */ - if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) != - (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) { - return (ENOTSUP); - } - - return (ace_mask_to_mode(mask, modep, isdir)); -} - -static int -acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list, - uid_t owner, gid_t group, boolean_t isdir) -{ - int error; - uint32_t flips = ACE_POSIX_SUPPORTED_BITS; - - if (isdir) - flips |= ACE_DELETE_CHILD; - if (vals->allowed != (vals->denied ^ flips)) { - error = ENOTSUP; - goto out; - } - if ((list->hasmask) && (list->acl_mask != vals->mask) && - (vals->aent_type & (USER | GROUP | GROUP_OBJ))) { - error = ENOTSUP; - goto out; - } - error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir); - if (error != 0) - goto out; - dest->a_type = vals->aent_type; - if (dest->a_type & (USER | GROUP)) { - dest->a_id = vals->key; - } else if (dest->a_type & USER_OBJ) { - dest->a_id = owner; - } else if (dest->a_type & GROUP_OBJ) { - dest->a_id = group; - } else if (dest->a_type & OTHER_OBJ) { - dest->a_id = 0; - } else { - error = EINVAL; - goto out; - } - -out: - return (error); -} - - -static int -ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt, - uid_t owner, gid_t group, boolean_t isdir) -{ - int error = 0; - aclent_t *aent, *result = NULL; - acevals_t *vals; - int resultcount; - - if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) != - (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) { - error = ENOTSUP; - goto out; - } - if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) { - error = ENOTSUP; - goto out; - } - - resultcount = 3 + list->numusers + list->numgroups; - /* - * This must be the same condition as below, when we add the CLASS_OBJ - * (aka ACL mask) - */ - if ((list->hasmask) || (! 
list->dfacl_flag)) - resultcount += 1; - - if (cacl_malloc((void **)&result, - resultcount * sizeof (aclent_t)) != 0) { - error = ENOMEM; - goto out; - } - aent = result; - - /* USER_OBJ */ - if (!(list->user_obj.aent_type & USER_OBJ)) { - error = EINVAL; - goto out; - } - - error = acevals_to_aent(&list->user_obj, aent, list, owner, group, - isdir); - - if (error != 0) - goto out; - ++aent; - /* USER */ - vals = NULL; - for (vals = avl_first(&list->user); vals != NULL; - vals = AVL_NEXT(&list->user, vals)) { - if (!(vals->aent_type & USER)) { - error = EINVAL; - goto out; - } - error = acevals_to_aent(vals, aent, list, owner, group, - isdir); - if (error != 0) - goto out; - ++aent; - } - /* GROUP_OBJ */ - if (!(list->group_obj.aent_type & GROUP_OBJ)) { - error = EINVAL; - goto out; - } - error = acevals_to_aent(&list->group_obj, aent, list, owner, group, - isdir); - if (error != 0) - goto out; - ++aent; - /* GROUP */ - vals = NULL; - for (vals = avl_first(&list->group); vals != NULL; - vals = AVL_NEXT(&list->group, vals)) { - if (!(vals->aent_type & GROUP)) { - error = EINVAL; - goto out; - } - error = acevals_to_aent(vals, aent, list, owner, group, - isdir); - if (error != 0) - goto out; - ++aent; - } - /* - * CLASS_OBJ (aka ACL_MASK) - * - * An ACL_MASK is not fabricated if the ACL is a default ACL. - * This is to follow UFS's behavior. - */ - if ((list->hasmask) || (! list->dfacl_flag)) { - if (list->hasmask) { - uint32_t flips = ACE_POSIX_SUPPORTED_BITS; - if (isdir) - flips |= ACE_DELETE_CHILD; - error = ace_mask_to_mode(list->acl_mask ^ flips, - &aent->a_perm, isdir); - if (error != 0) - goto out; - } else { - /* fabricate the ACL_MASK from the group permissions */ - error = ace_mask_to_mode(list->group_obj.allowed, - &aent->a_perm, isdir); - if (error != 0) - goto out; - } - aent->a_id = 0; - aent->a_type = CLASS_OBJ | list->dfacl_flag; - ++aent; - } - /* OTHER_OBJ */ - if (!(list->other_obj.aent_type & OTHER_OBJ)) { - error = EINVAL; - goto out; - } - error = acevals_to_aent(&list->other_obj, aent, list, owner, group, - isdir); - if (error != 0) - goto out; - ++aent; - - *aclentp = result; - *aclcnt = resultcount; - -out: - if (error != 0) { - if (result != NULL) - cacl_free(result, resultcount * sizeof (aclent_t)); - } - - return (error); -} - - -/* - * free all data associated with an ace_list - */ -static void -ace_list_free(ace_list_t *al) -{ - acevals_t *node; - void *cookie; - - if (al == NULL) - return; - - cookie = NULL; - while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL) - cacl_free(node, sizeof (acevals_t)); - cookie = NULL; - while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL) - cacl_free(node, sizeof (acevals_t)); - - avl_destroy(&al->user); - avl_destroy(&al->group); - - /* free the container itself */ - cacl_free(al, sizeof (ace_list_t)); -} - -static int -acevals_compare(const void *va, const void *vb) -{ - const acevals_t *a = va, *b = vb; - - if (a->key == b->key) - return (0); - - if (a->key > b->key) - return (1); - - else - return (-1); -} - -/* - * Convert a list of ace_t entries to equivalent regular and default - * aclent_t lists. Return error (ENOTSUP) when conversion is not possible. 
- */ -static int -ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group, - aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt, - boolean_t isdir) -{ - int error = 0; - ace_t *acep; - uint32_t bits; - int i; - ace_list_t *normacl = NULL, *dfacl = NULL, *acl; - acevals_t *vals; - - *aclentp = NULL; - *aclcnt = 0; - *dfaclentp = NULL; - *dfaclcnt = 0; - - /* we need at least user_obj, group_obj, and other_obj */ - if (n < 6) { - error = ENOTSUP; - goto out; - } - if (ace == NULL) { - error = EINVAL; - goto out; - } - - error = cacl_malloc((void **)&normacl, sizeof (ace_list_t)); - if (error != 0) - goto out; - - avl_create(&normacl->user, acevals_compare, sizeof (acevals_t), - offsetof(acevals_t, avl)); - avl_create(&normacl->group, acevals_compare, sizeof (acevals_t), - offsetof(acevals_t, avl)); - - ace_list_init(normacl, 0); - - error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t)); - if (error != 0) - goto out; - - avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t), - offsetof(acevals_t, avl)); - avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t), - offsetof(acevals_t, avl)); - ace_list_init(dfacl, ACL_DEFAULT); - - /* process every ace_t... */ - for (i = 0; i < n; i++) { - acep = &ace[i]; - - /* rule out certain cases quickly */ - error = ace_to_aent_legal(acep); - if (error != 0) - goto out; - - /* - * Turn off these bits in order to not have to worry about - * them when doing the checks for compliments. - */ - acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE | - ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES | - ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS); - - /* see if this should be a regular or default acl */ - bits = acep->a_flags & - (ACE_INHERIT_ONLY_ACE | - ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE); - if (bits != 0) { - /* all or nothing on these inherit bits */ - if (bits != (ACE_INHERIT_ONLY_ACE | - ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) { - error = ENOTSUP; - goto out; - } - acl = dfacl; - } else { - acl = normacl; - } - - if ((acep->a_flags & ACE_OWNER)) { - if (acl->state > ace_user_obj) { - error = ENOTSUP; - goto out; - } - acl->state = ace_user_obj; - acl->seen |= USER_OBJ; - vals = &acl->user_obj; - vals->aent_type = USER_OBJ | acl->dfacl_flag; - } else if ((acep->a_flags & ACE_EVERYONE)) { - acl->state = ace_other_obj; - acl->seen |= OTHER_OBJ; - vals = &acl->other_obj; - vals->aent_type = OTHER_OBJ | acl->dfacl_flag; - } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) { - if (acl->state > ace_group) { - error = ENOTSUP; - goto out; - } - if ((acep->a_flags & ACE_GROUP)) { - acl->seen |= GROUP_OBJ; - vals = &acl->group_obj; - vals->aent_type = GROUP_OBJ | acl->dfacl_flag; - } else { - acl->seen |= GROUP; - vals = acevals_find(acep, &acl->group, - &acl->numgroups); - if (vals == NULL) { - error = ENOMEM; - goto out; - } - vals->aent_type = GROUP | acl->dfacl_flag; - } - acl->state = ace_group; - } else { - if (acl->state > ace_user) { - error = ENOTSUP; - goto out; - } - acl->state = ace_user; - acl->seen |= USER; - vals = acevals_find(acep, &acl->user, - &acl->numusers); - if (vals == NULL) { - error = ENOMEM; - goto out; - } - vals->aent_type = USER | acl->dfacl_flag; - } - - if (!(acl->state > ace_unused)) { - error = EINVAL; - goto out; - } - - if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { - /* no more than one allowed per aclent_t */ - if (vals->allowed != ACE_MASK_UNDEFINED) { - error = ENOTSUP; - goto out; - } - vals->allowed = acep->a_access_mask; - } else { - /* - * it's a DENY; if there 
was a previous DENY, it - * must have been an ACL_MASK. - */ - if (vals->denied != ACE_MASK_UNDEFINED) { - /* ACL_MASK is for USER and GROUP only */ - if ((acl->state != ace_user) && - (acl->state != ace_group)) { - error = ENOTSUP; - goto out; - } - - if (! acl->hasmask) { - acl->hasmask = 1; - acl->acl_mask = vals->denied; - /* check for mismatched ACL_MASK emulations */ - } else if (acl->acl_mask != vals->denied) { - error = ENOTSUP; - goto out; - } - vals->mask = vals->denied; - } - vals->denied = acep->a_access_mask; - } - } - - /* done collating; produce the aclent_t lists */ - if (normacl->state != ace_unused) { - error = ace_list_to_aent(normacl, aclentp, aclcnt, - owner, group, isdir); - if (error != 0) { - goto out; - } - } - if (dfacl->state != ace_unused) { - error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt, - owner, group, isdir); - if (error != 0) { - goto out; - } - } - -out: - if (normacl != NULL) - ace_list_free(normacl); - if (dfacl != NULL) - ace_list_free(dfacl); - - return (error); -} - -static int -convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir, - uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt) -{ - int error = 0; - aclent_t *aclentp, *dfaclentp; - int aclcnt, dfaclcnt; - int aclsz, dfaclsz; - - error = ln_ace_to_aent(acebufp, acecnt, owner, group, - &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir); - - if (error) - return (error); - - - if (dfaclcnt != 0) { - /* - * Slap aclentp and dfaclentp into a single array. - */ - aclsz = sizeof (aclent_t) * aclcnt; - dfaclsz = sizeof (aclent_t) * dfaclcnt; - aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz); - if (aclentp != NULL) { - (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz); - } else { - error = ENOMEM; - } - } - - if (aclentp) { - *retaclentp = aclentp; - *retaclcnt = aclcnt + dfaclcnt; - } - - if (dfaclentp) - cacl_free(dfaclentp, dfaclsz); - - return (error); -} - - -int -acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner, - gid_t group) -{ - int aclcnt; - void *acldata; - int error; - - /* - * See if we need to translate - */ - if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) || - (target_flavor == _ACL_ACLENT_ENABLED && - aclp->acl_type == ACLENT_T)) - return (0); - - if (target_flavor == -1) { - error = EINVAL; - goto out; - } - - if (target_flavor == _ACL_ACE_ENABLED && - aclp->acl_type == ACLENT_T) { - error = convert_aent_to_ace(aclp->acl_aclp, - aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt); - if (error) - goto out; - - } else if (target_flavor == _ACL_ACLENT_ENABLED && - aclp->acl_type == ACE_T) { - error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt, - isdir, owner, group, (aclent_t **)&acldata, &aclcnt); - if (error) - goto out; - } else { - error = ENOTSUP; - goto out; - } - - /* - * replace old acl with newly translated acl - */ - cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size); - aclp->acl_aclp = acldata; - aclp->acl_cnt = aclcnt; - if (target_flavor == _ACL_ACE_ENABLED) { - aclp->acl_type = ACE_T; - aclp->acl_entry_size = sizeof (ace_t); - } else { - aclp->acl_type = ACLENT_T; - aclp->acl_entry_size = sizeof (aclent_t); - } - return (0); - -out: - -#if !defined(_KERNEL) - errno = error; - return (-1); -#else - return (error); -#endif -} -#endif /* !_KERNEL */ - -#define SET_ACE(acl, index, who, mask, type, flags) { \ - acl[0][index].a_who = (uint32_t)who; \ - acl[0][index].a_type = type; \ - acl[0][index].a_flags = flags; \ - acl[0][index++].a_access_mask = mask; \ -} - -void 
-acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) -{ - uint32_t read_mask = ACE_READ_DATA; - uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; - uint32_t execute_mask = ACE_EXECUTE; - - (void) isdir; /* will need this later */ - - masks->deny1 = 0; - if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) - masks->deny1 |= read_mask; - if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) - masks->deny1 |= write_mask; - if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) - masks->deny1 |= execute_mask; - - masks->deny2 = 0; - if (!(mode & S_IRGRP) && (mode & S_IROTH)) - masks->deny2 |= read_mask; - if (!(mode & S_IWGRP) && (mode & S_IWOTH)) - masks->deny2 |= write_mask; - if (!(mode & S_IXGRP) && (mode & S_IXOTH)) - masks->deny2 |= execute_mask; - - masks->allow0 = 0; - if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) - masks->allow0 |= read_mask; - if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) - masks->allow0 |= write_mask; - if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) - masks->allow0 |= execute_mask; - - masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| - ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| - ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; - if (mode & S_IRUSR) - masks->owner |= read_mask; - if (mode & S_IWUSR) - masks->owner |= write_mask; - if (mode & S_IXUSR) - masks->owner |= execute_mask; - - masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| - ACE_SYNCHRONIZE; - if (mode & S_IRGRP) - masks->group |= read_mask; - if (mode & S_IWGRP) - masks->group |= write_mask; - if (mode & S_IXGRP) - masks->group |= execute_mask; - - masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| - ACE_SYNCHRONIZE; - if (mode & S_IROTH) - masks->everyone |= read_mask; - if (mode & S_IWOTH) - masks->everyone |= write_mask; - if (mode & S_IXOTH) - masks->everyone |= execute_mask; -} - -int -acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count) -{ - int index = 0; - int error; - trivial_acl_t masks; - - *count = 3; - acl_trivial_access_masks(mode, isdir, &masks); - - if (masks.allow0) - (*count)++; - if (masks.deny1) - (*count)++; - if (masks.deny2) - (*count)++; - - if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0) - return (error); - - if (masks.allow0) { - SET_ACE(acl, index, -1, masks.allow0, - ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER); - } - if (masks.deny1) { - SET_ACE(acl, index, -1, masks.deny1, - ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER); - } - if (masks.deny2) { - SET_ACE(acl, index, -1, masks.deny2, - ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP); - } - - SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE, - ACE_OWNER); - SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE, - ACE_IDENTIFIER_GROUP|ACE_GROUP); - SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE, - ACE_EVERYONE); - - return (0); -} - -/* - * ace_trivial: - * determine whether an ace_t acl is trivial - * - * Trivialness implies that the acl is composed of only - * owner, group, everyone entries. ACL can't - * have read_acl denied, and write_owner/write_acl/write_attributes - * can only be owner@ entry. 
- */ -int -ace_trivial_common(void *acep, int aclcnt, - uint64_t (*walk)(void *, uint64_t, int aclcnt, - uint16_t *, uint16_t *, uint32_t *)) -{ - uint16_t flags; - uint32_t mask; - uint16_t type; - uint64_t cookie = 0; - - while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) { - switch (flags & ACE_TYPE_FLAGS) { - case ACE_OWNER: - case ACE_GROUP|ACE_IDENTIFIER_GROUP: - case ACE_EVERYONE: - break; - default: - return (1); - - } - - if (flags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| - ACE_INHERIT_ONLY_ACE)) - return (1); - - /* - * Special check for some special bits - * - * Don't allow anybody to deny reading basic - * attributes or a files ACL. - */ - if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && - (type == ACE_ACCESS_DENIED_ACE_TYPE)) - return (1); - - /* - * Delete permissions are never set by default - */ - if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) - return (1); - /* - * only allow owner@ to have - * write_acl/write_owner/write_attributes/write_xattr/ - */ - if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && - (!(flags & ACE_OWNER) && (mask & - (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| - ACE_WRITE_NAMED_ATTRS)))) - return (1); - - } - return (0); -} - -uint64_t -ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags, - uint16_t *type, uint32_t *mask) -{ - ace_t *acep = datap; - - if (cookie >= aclcnt) - return (0); - - *flags = acep[cookie].a_flags; - *type = acep[cookie].a_type; - *mask = acep[cookie++].a_access_mask; - - return (cookie); -} - -int -ace_trivial(ace_t *acep, int aclcnt) -{ - return (ace_trivial_common(acep, aclcnt, ace_walk)); -} Index: sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S =================================================================== --- sys/cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S +++ /dev/null @@ -1,133 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - - .file "atomic.s" - -#define _ASM -#include - - /* - * NOTE: If atomic_dec_64 and atomic_dec_64_nv are ever - * separated, it is important to edit the libc i386 platform - * specific mapfile and remove the NODYNSORT attribute - * from atomic_dec_64_nv. 
- */ - ENTRY(atomic_dec_64) - ALTENTRY(atomic_dec_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi // %edi = target address - movl (%edi), %eax - movl 4(%edi), %edx // %edx:%eax = old value -1: - xorl %ebx, %ebx - xorl %ecx, %ecx - not %ecx - not %ebx // %ecx:%ebx = -1 - addl %eax, %ebx - adcl %edx, %ecx // add in the carry from inc - lock - cmpxchg8b (%edi) // try to stick it in - jne 1b - movl %ebx, %eax - movl %ecx, %edx // return new value - popl %ebx - popl %edi - ret - SET_SIZE(atomic_dec_64_nv) - SET_SIZE(atomic_dec_64) - - /* - * NOTE: If atomic_add_64 and atomic_add_64_nv are ever - * separated, it is important to edit the libc i386 platform - * specific mapfile and remove the NODYNSORT attribute - * from atomic_add_64_nv. - */ - ENTRY(atomic_add_64) - ALTENTRY(atomic_add_64_nv) - pushl %edi - pushl %ebx - movl 12(%esp), %edi // %edi = target address - movl (%edi), %eax - movl 4(%edi), %edx // %edx:%eax = old value -1: - movl 16(%esp), %ebx - movl 20(%esp), %ecx // %ecx:%ebx = delta - addl %eax, %ebx - adcl %edx, %ecx // %ecx:%ebx = new value - lock - cmpxchg8b (%edi) // try to stick it in - jne 1b - movl %ebx, %eax - movl %ecx, %edx // return new value - popl %ebx - popl %edi - ret - SET_SIZE(atomic_add_64_nv) - SET_SIZE(atomic_add_64) - - ENTRY(atomic_cas_64) - pushl %ebx - pushl %esi - movl 12(%esp), %esi - movl 16(%esp), %eax - movl 20(%esp), %edx - movl 24(%esp), %ebx - movl 28(%esp), %ecx - lock - cmpxchg8b (%esi) - popl %esi - popl %ebx - ret - SET_SIZE(atomic_cas_64) - - ENTRY(atomic_swap_64) - pushl %esi - pushl %ebx - movl 12(%esp), %esi - movl 16(%esp), %ebx - movl 20(%esp), %ecx - movl (%esi), %eax - movl 4(%esi), %edx // %edx:%eax = old value -1: - lock - cmpxchg8b (%esi) - jne 1b - popl %ebx - popl %esi - ret - SET_SIZE(atomic_swap_64) - - ENTRY(atomic_load_64) - pushl %esi - movl 8(%esp), %esi - movl %ebx, %eax // make old and new values equal, so that - movl %ecx, %edx // destination is never changed - lock - cmpxchg8b (%esi) - popl %esi - ret - SET_SIZE(atomic_load_64) Index: sys/cddl/contrib/opensolaris/common/avl/avl.c =================================================================== --- sys/cddl/contrib/opensolaris/common/avl/avl.c +++ /dev/null @@ -1,1063 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2014 by Delphix. All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - */ - -/* - * AVL - generic AVL tree implementation for kernel use - * - * A complete description of AVL trees can be found in many CS textbooks. - * - * Here is a very brief overview. 
An AVL tree is a binary search tree that is - * almost perfectly balanced. By "almost" perfectly balanced, we mean that at - * any given node, the left and right subtrees are allowed to differ in height - * by at most 1 level. - * - * This relaxation from a perfectly balanced binary tree allows doing - * insertion and deletion relatively efficiently. Searching the tree is - * still a fast operation, roughly O(log(N)). - * - * The key to insertion and deletion is a set of tree manipulations called - * rotations, which bring unbalanced subtrees back into the semi-balanced state. - * - * This implementation of AVL trees has the following peculiarities: - * - * - The AVL specific data structures are physically embedded as fields - * in the "using" data structures. To maintain generality the code - * must constantly translate between "avl_node_t *" and containing - * data structure "void *"s by adding/subtracting the avl_offset. - * - * - Since the AVL data is always embedded in other structures, there is - * no locking or memory allocation in the AVL routines. This must be - * provided for by the enclosing data structure's semantics. Typically, - * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of - * exclusive write lock. Other operations require a read lock. - * - * - The implementation uses iteration instead of explicit recursion, - * since it is intended to run on limited size kernel stacks. Since - * there is no recursion stack present to move "up" in the tree, - * there is an explicit "parent" link in the avl_node_t. - * - * - The left/right children pointers of a node are in an array. - * In the code, variables (instead of constants) are used to represent - * left and right indices. The implementation is written as if it only - * dealt with left handed manipulations. By changing the value assigned - * to "left", the code also works for right handed trees. The - * following variables/terms are frequently used: - * - * int left; // 0 when dealing with left children, - * // 1 for dealing with right children - * - * int left_heavy; // -1 when left subtree is taller at some node, - * // +1 when right subtree is taller - * - * int right; // will be the opposite of left (0 or 1) - * int right_heavy;// will be the opposite of left_heavy (-1 or 1) - * - * int direction; // 0 for "<" (ie. left child); 1 for ">" (right) - * - * Though it is a little more confusing to read the code, the approach - * allows using half as much code (and hence cache footprint) for tree - * manipulations and eliminates many conditional branches. - * - * - The avl_index_t is an opaque "cookie" used to find nodes at or - * adjacent to where a new value would be inserted in the tree. The value - * is a modified "avl_node_t *". The bottom bit (normally 0 for a - * pointer) is set to indicate if that the new node has a value greater - * than the value of the indicated "avl_node_t *". - * - * Note - in addition to userland (e.g. libavl and libutil) and the kernel - * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module, - * which each have their own compilation environments and subsequent - * requirements. Each of these environments must be considered when adding - * dependencies from avl.c. - */ - -#include -#include -#include -#include -#include - -/* - * Small arrays to translate between balance (or diff) values and child indices. - * - * Code that deals with binary tree data structures will randomly use - * left and right children when examining a tree. 
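The embedding scheme described above is easiest to see in a short usage sketch. Everything here except struct item and item_cmp() (illustrative names) is the interface defined in this file: the avl_node_t lives inside the caller's structure, avl_create() is told its offset, and avl_find() asserts that the comparator returns exactly -1, 0 or +1.

#include <sys/types.h>
#include <sys/avl.h>
#include <stddef.h>

struct item {
	uint64_t	key;
	avl_node_t	link;	/* embedded linkage, no separate allocation */
};

static int
item_cmp(const void *a, const void *b)
{
	const struct item *ia = a;
	const struct item *ib = b;

	if (ia->key < ib->key)
		return (-1);
	return (ia->key > ib->key ? 1 : 0);
}

static void
item_tree_init(avl_tree_t *t)
{
	avl_create(t, item_cmp, sizeof (struct item),
	    offsetof(struct item, link));
}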
C "if()" statements - * which evaluate randomly suffer from very poor hardware branch prediction. - * In this code we avoid some of the branch mispredictions by using the - * following translation arrays. They replace random branches with an - * additional memory reference. Since the translation arrays are both very - * small the data should remain efficiently in cache. - */ -static const int avl_child2balance[2] = {-1, 1}; -static const int avl_balance2child[] = {0, 0, 1}; - - -/* - * Walk from one node to the previous valued node (ie. an infix walk - * towards the left). At any given node we do one of 2 things: - * - * - If there is a left child, go to it, then to it's rightmost descendant. - * - * - otherwise we return through parent nodes until we've come from a right - * child. - * - * Return Value: - * NULL - if at the end of the nodes - * otherwise next node - */ -void * -avl_walk(avl_tree_t *tree, void *oldnode, int left) -{ - size_t off = tree->avl_offset; - avl_node_t *node = AVL_DATA2NODE(oldnode, off); - int right = 1 - left; - int was_child; - - - /* - * nowhere to walk to if tree is empty - */ - if (node == NULL) - return (NULL); - - /* - * Visit the previous valued node. There are two possibilities: - * - * If this node has a left child, go down one left, then all - * the way right. - */ - if (node->avl_child[left] != NULL) { - for (node = node->avl_child[left]; - node->avl_child[right] != NULL; - node = node->avl_child[right]) - ; - /* - * Otherwise, return thru left children as far as we can. - */ - } else { - for (;;) { - was_child = AVL_XCHILD(node); - node = AVL_XPARENT(node); - if (node == NULL) - return (NULL); - if (was_child == right) - break; - } - } - - return (AVL_NODE2DATA(node, off)); -} - -/* - * Return the lowest valued node in a tree or NULL. - * (leftmost child from root of tree) - */ -void * -avl_first(avl_tree_t *tree) -{ - avl_node_t *node; - avl_node_t *prev = NULL; - size_t off = tree->avl_offset; - - for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) - prev = node; - - if (prev != NULL) - return (AVL_NODE2DATA(prev, off)); - return (NULL); -} - -/* - * Return the highest valued node in a tree or NULL. - * (rightmost child from root of tree) - */ -void * -avl_last(avl_tree_t *tree) -{ - avl_node_t *node; - avl_node_t *prev = NULL; - size_t off = tree->avl_offset; - - for (node = tree->avl_root; node != NULL; node = node->avl_child[1]) - prev = node; - - if (prev != NULL) - return (AVL_NODE2DATA(prev, off)); - return (NULL); -} - -/* - * Access the node immediately before or after an insertion point. - * - * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child - * - * Return value: - * NULL: no node in the given direction - * "void *" of the found tree node - */ -void * -avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) -{ - int child = AVL_INDEX2CHILD(where); - avl_node_t *node = AVL_INDEX2NODE(where); - void *data; - size_t off = tree->avl_offset; - - if (node == NULL) { - ASSERT(tree->avl_root == NULL); - return (NULL); - } - data = AVL_NODE2DATA(node, off); - if (child != direction) - return (data); - - return (avl_walk(tree, data, direction)); -} - - -/* - * Search for the node which contains "value". The algorithm is a - * simple binary tree search. 
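A sketch of the usual find-or-insert idiom built on avl_find() and avl_insert(), reusing struct item from the sketch above: when the value is absent, the avl_index_t cookie filled in by avl_find() names the leaf position where avl_insert() must place the new node.

static struct item *
item_find_or_add(avl_tree_t *t, struct item *n)
{
	avl_index_t where;
	struct item *found;

	found = avl_find(t, n, &where);
	if (found != NULL)
		return (found);		/* already in the tree */

	avl_insert(t, n, where);	/* insert at the remembered leaf slot */
	return (n);
}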
- * - * return value: - * NULL: the value is not in the AVL tree - * *where (if not NULL) is set to indicate the insertion point - * "void *" of the found tree node - */ -void * -avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) -{ - avl_node_t *node; - avl_node_t *prev = NULL; - int child = 0; - int diff; - size_t off = tree->avl_offset; - - for (node = tree->avl_root; node != NULL; - node = node->avl_child[child]) { - - prev = node; - - diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); - ASSERT(-1 <= diff && diff <= 1); - if (diff == 0) { -#ifdef DEBUG - if (where != NULL) - *where = 0; -#endif - return (AVL_NODE2DATA(node, off)); - } - child = avl_balance2child[1 + diff]; - - } - - if (where != NULL) - *where = AVL_MKINDEX(prev, child); - - return (NULL); -} - - -/* - * Perform a rotation to restore balance at the subtree given by depth. - * - * This routine is used by both insertion and deletion. The return value - * indicates: - * 0 : subtree did not change height - * !0 : subtree was reduced in height - * - * The code is written as if handling left rotations, right rotations are - * symmetric and handled by swapping values of variables right/left[_heavy] - * - * On input balance is the "new" balance at "node". This value is either - * -2 or +2. - */ -static int -avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) -{ - int left = !(balance < 0); /* when balance = -2, left will be 0 */ - int right = 1 - left; - int left_heavy = balance >> 1; - int right_heavy = -left_heavy; - avl_node_t *parent = AVL_XPARENT(node); - avl_node_t *child = node->avl_child[left]; - avl_node_t *cright; - avl_node_t *gchild; - avl_node_t *gright; - avl_node_t *gleft; - int which_child = AVL_XCHILD(node); - int child_bal = AVL_XBALANCE(child); - - /* BEGIN CSTYLED */ - /* - * case 1 : node is overly left heavy, the left child is balanced or - * also left heavy. This requires the following rotation. - * - * (node bal:-2) - * / \ - * / \ - * (child bal:0 or -1) - * / \ - * / \ - * cright - * - * becomes: - * - * (child bal:1 or 0) - * / \ - * / \ - * (node bal:-1 or 0) - * / \ - * / \ - * cright - * - * we detect this situation by noting that child's balance is not - * right_heavy. - */ - /* END CSTYLED */ - if (child_bal != right_heavy) { - - /* - * compute new balance of nodes - * - * If child used to be left heavy (now balanced) we reduced - * the height of this sub-tree -- used in "return...;" below - */ - child_bal += right_heavy; /* adjust towards right */ - - /* - * move "cright" to be node's left child - */ - cright = child->avl_child[right]; - node->avl_child[left] = cright; - if (cright != NULL) { - AVL_SETPARENT(cright, node); - AVL_SETCHILD(cright, left); - } - - /* - * move node to be child's right child - */ - child->avl_child[right] = node; - AVL_SETBALANCE(node, -child_bal); - AVL_SETCHILD(node, right); - AVL_SETPARENT(node, child); - - /* - * update the pointer into this subtree - */ - AVL_SETBALANCE(child, child_bal); - AVL_SETCHILD(child, which_child); - AVL_SETPARENT(child, parent); - if (parent != NULL) - parent->avl_child[which_child] = child; - else - tree->avl_root = child; - - return (child_bal == 0); - } - - /* BEGIN CSTYLED */ - /* - * case 2 : When node is left heavy, but child is right heavy we use - * a different rotation. - * - * (node b:-2) - * / \ - * / \ - * / \ - * (child b:+1) - * / \ - * / \ - * (gchild b: != 0) - * / \ - * / \ - * gleft gright - * - * becomes: - * - * (gchild b:0) - * / \ - * / \ - * / \ - * (child b:?) (node b:?) 
- * / \ / \ - * / \ / \ - * gleft gright - * - * computing the new balances is more complicated. As an example: - * if gchild was right_heavy, then child is now left heavy - * else it is balanced - */ - /* END CSTYLED */ - gchild = child->avl_child[right]; - gleft = gchild->avl_child[left]; - gright = gchild->avl_child[right]; - - /* - * move gright to left child of node and - * - * move gleft to right child of node - */ - node->avl_child[left] = gright; - if (gright != NULL) { - AVL_SETPARENT(gright, node); - AVL_SETCHILD(gright, left); - } - - child->avl_child[right] = gleft; - if (gleft != NULL) { - AVL_SETPARENT(gleft, child); - AVL_SETCHILD(gleft, right); - } - - /* - * move child to left child of gchild and - * - * move node to right child of gchild and - * - * fixup parent of all this to point to gchild - */ - balance = AVL_XBALANCE(gchild); - gchild->avl_child[left] = child; - AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0)); - AVL_SETPARENT(child, gchild); - AVL_SETCHILD(child, left); - - gchild->avl_child[right] = node; - AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0)); - AVL_SETPARENT(node, gchild); - AVL_SETCHILD(node, right); - - AVL_SETBALANCE(gchild, 0); - AVL_SETPARENT(gchild, parent); - AVL_SETCHILD(gchild, which_child); - if (parent != NULL) - parent->avl_child[which_child] = gchild; - else - tree->avl_root = gchild; - - return (1); /* the new tree is always shorter */ -} - - -/* - * Insert a new node into an AVL tree at the specified (from avl_find()) place. - * - * Newly inserted nodes are always leaf nodes in the tree, since avl_find() - * searches out to the leaf positions. The avl_index_t indicates the node - * which will be the parent of the new node. - * - * After the node is inserted, a single rotation further up the tree may - * be necessary to maintain an acceptable AVL balance. - */ -void -avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) -{ - avl_node_t *node; - avl_node_t *parent = AVL_INDEX2NODE(where); - int old_balance; - int new_balance; - int which_child = AVL_INDEX2CHILD(where); - size_t off = tree->avl_offset; - - ASSERT(tree); -#ifdef _LP64 - ASSERT(((uintptr_t)new_data & 0x7) == 0); -#endif - - node = AVL_DATA2NODE(new_data, off); - - /* - * First, add the node to the tree at the indicated position. - */ - ++tree->avl_numnodes; - - node->avl_child[0] = NULL; - node->avl_child[1] = NULL; - - AVL_SETCHILD(node, which_child); - AVL_SETBALANCE(node, 0); - AVL_SETPARENT(node, parent); - if (parent != NULL) { - ASSERT(parent->avl_child[which_child] == NULL); - parent->avl_child[which_child] = node; - } else { - ASSERT(tree->avl_root == NULL); - tree->avl_root = node; - } - /* - * Now, back up the tree modifying the balance of all nodes above the - * insertion point. If we get to a highly unbalanced ancestor, we - * need to do a rotation. If we back out of the tree we are done. - * If we brought any subtree into perfect balance (0), we are also done. - */ - for (;;) { - node = parent; - if (node == NULL) - return; - - /* - * Compute the new balance - */ - old_balance = AVL_XBALANCE(node); - new_balance = old_balance + avl_child2balance[which_child]; - - /* - * If we introduced equal balance, then we are done immediately - */ - if (new_balance == 0) { - AVL_SETBALANCE(node, 0); - return; - } - - /* - * If both old and new are not zero we went - * from -1 to -2 balance, do a rotation. 
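A worked example of this bookkeeping, using the item helpers sketched earlier (t is assumed freshly created and empty). Adding keys in ascending order drives one node's balance from 0 to +1 and then toward +2, at which point the single-rotation case (case 1, mirrored to the right side) runs and the middle key becomes the root.

static void
item_rotation_demo(avl_tree_t *t, struct item n[3])
{
	n[0].key = 1;
	n[1].key = 2;
	n[2].key = 3;

	avl_add(t, &n[0]);	/* root = 1, balance 0                    */
	avl_add(t, &n[1]);	/* 2 becomes right child, balance(1) = +1 */
	avl_add(t, &n[2]);	/*
				 * balance(1) would reach +2, so a single
				 * rotation runs: 2 is now the root with
				 * children 1 and 3, all balances 0.
				 */
}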
- */ - if (old_balance != 0) - break; - - AVL_SETBALANCE(node, new_balance); - parent = AVL_XPARENT(node); - which_child = AVL_XCHILD(node); - } - - /* - * perform a rotation to fix the tree and return - */ - (void) avl_rotation(tree, node, new_balance); -} - -/* - * Insert "new_data" in "tree" in the given "direction" either after or - * before (AVL_AFTER, AVL_BEFORE) the data "here". - * - * Insertions can only be done at empty leaf points in the tree, therefore - * if the given child of the node is already present we move to either - * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since - * every other node in the tree is a leaf, this always works. - * - * To help developers using this interface, we assert that the new node - * is correctly ordered at every step of the way in DEBUG kernels. - */ -void -avl_insert_here( - avl_tree_t *tree, - void *new_data, - void *here, - int direction) -{ - avl_node_t *node; - int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ -#ifdef DEBUG - int diff; -#endif - - ASSERT(tree != NULL); - ASSERT(new_data != NULL); - ASSERT(here != NULL); - ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER); - - /* - * If corresponding child of node is not NULL, go to the neighboring - * node and reverse the insertion direction. - */ - node = AVL_DATA2NODE(here, tree->avl_offset); - -#ifdef DEBUG - diff = tree->avl_compar(new_data, here); - ASSERT(-1 <= diff && diff <= 1); - ASSERT(diff != 0); - ASSERT(diff > 0 ? child == 1 : child == 0); -#endif - - if (node->avl_child[child] != NULL) { - node = node->avl_child[child]; - child = 1 - child; - while (node->avl_child[child] != NULL) { -#ifdef DEBUG - diff = tree->avl_compar(new_data, - AVL_NODE2DATA(node, tree->avl_offset)); - ASSERT(-1 <= diff && diff <= 1); - ASSERT(diff != 0); - ASSERT(diff > 0 ? child == 1 : child == 0); -#endif - node = node->avl_child[child]; - } -#ifdef DEBUG - diff = tree->avl_compar(new_data, - AVL_NODE2DATA(node, tree->avl_offset)); - ASSERT(-1 <= diff && diff <= 1); - ASSERT(diff != 0); - ASSERT(diff > 0 ? child == 1 : child == 0); -#endif - } - ASSERT(node->avl_child[child] == NULL); - - avl_insert(tree, new_data, AVL_MKINDEX(node, child)); -} - -/* - * Add a new node to an AVL tree. - */ -void -avl_add(avl_tree_t *tree, void *new_node) -{ - avl_index_t where; - - /* - * This is unfortunate. We want to call panic() here, even for - * non-DEBUG kernels. In userland, however, we can't depend on anything - * in libc or else the rtld build process gets confused. - * Thankfully, rtld provides us with its own assfail() so we can use - * that here. We use assfail() directly to get a nice error message - * in the core - much like what panic() does for crashdumps. - */ - if (avl_find(tree, new_node, &where) != NULL) -#ifdef _KERNEL - panic("avl_find() succeeded inside avl_add()"); -#else - (void) assfail("avl_find() succeeded inside avl_add()", - __FILE__, __LINE__); -#endif - avl_insert(tree, new_node, where); -} - -/* - * Delete a node from the AVL tree. Deletion is similar to insertion, but - * with 2 complications. - * - * First, we may be deleting an interior node. Consider the following subtree: - * - * d c c - * / \ / \ / \ - * b e b e b e - * / \ / \ / - * a c a a - * - * When we are deleting node (d), we find and bring up an adjacent valued leaf - * node, say (c), to take the interior node's place. In the code this is - * handled by temporarily swapping (d) and (c) in the tree and then using - * common code to delete (d) from the leaf position. 
- * - * Secondly, an interior deletion from a deep tree may require more than one - * rotation to fix the balance. This is handled by moving up the tree through - * parents and applying rotations as needed. The return value from - * avl_rotation() is used to detect when a subtree did not change overall - * height due to a rotation. - */ -void -avl_remove(avl_tree_t *tree, void *data) -{ - avl_node_t *delete; - avl_node_t *parent; - avl_node_t *node; - avl_node_t tmp; - int old_balance; - int new_balance; - int left; - int right; - int which_child; - size_t off = tree->avl_offset; - - ASSERT(tree); - - delete = AVL_DATA2NODE(data, off); - - /* - * Deletion is easiest with a node that has at most 1 child. - * We swap a node with 2 children with a sequentially valued - * neighbor node. That node will have at most 1 child. Note this - * has no effect on the ordering of the remaining nodes. - * - * As an optimization, we choose the greater neighbor if the tree - * is right heavy, otherwise the left neighbor. This reduces the - * number of rotations needed. - */ - if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) { - - /* - * choose node to swap from whichever side is taller - */ - old_balance = AVL_XBALANCE(delete); - left = avl_balance2child[old_balance + 1]; - right = 1 - left; - - /* - * get to the previous value'd node - * (down 1 left, as far as possible right) - */ - for (node = delete->avl_child[left]; - node->avl_child[right] != NULL; - node = node->avl_child[right]) - ; - - /* - * create a temp placeholder for 'node' - * move 'node' to delete's spot in the tree - */ - tmp = *node; - - *node = *delete; - if (node->avl_child[left] == node) - node->avl_child[left] = &tmp; - - parent = AVL_XPARENT(node); - if (parent != NULL) - parent->avl_child[AVL_XCHILD(node)] = node; - else - tree->avl_root = node; - AVL_SETPARENT(node->avl_child[left], node); - AVL_SETPARENT(node->avl_child[right], node); - - /* - * Put tmp where node used to be (just temporary). - * It always has a parent and at most 1 child. - */ - delete = &tmp; - parent = AVL_XPARENT(delete); - parent->avl_child[AVL_XCHILD(delete)] = delete; - which_child = (delete->avl_child[1] != 0); - if (delete->avl_child[which_child] != NULL) - AVL_SETPARENT(delete->avl_child[which_child], delete); - } - - - /* - * Here we know "delete" is at least partially a leaf node. It can - * be easily removed from the tree. - */ - ASSERT(tree->avl_numnodes > 0); - --tree->avl_numnodes; - parent = AVL_XPARENT(delete); - which_child = AVL_XCHILD(delete); - if (delete->avl_child[0] != NULL) - node = delete->avl_child[0]; - else - node = delete->avl_child[1]; - - /* - * Connect parent directly to node (leaving out delete). - */ - if (node != NULL) { - AVL_SETPARENT(node, parent); - AVL_SETCHILD(node, which_child); - } - if (parent == NULL) { - tree->avl_root = node; - return; - } - parent->avl_child[which_child] = node; - - - /* - * Since the subtree is now shorter, begin adjusting parent balances - * and performing any needed rotations. - */ - do { - - /* - * Move up the tree and adjust the balance - * - * Capture the parent and which_child values for the next - * iteration before any rotations occur. - */ - node = parent; - old_balance = AVL_XBALANCE(node); - new_balance = old_balance - avl_child2balance[which_child]; - parent = AVL_XPARENT(node); - which_child = AVL_XCHILD(node); - - /* - * If a node was in perfect balance but isn't anymore then - * we can stop, since the height didn't change above this point - * due to a deletion. 
- */ - if (old_balance == 0) { - AVL_SETBALANCE(node, new_balance); - break; - } - - /* - * If the new balance is zero, we don't need to rotate - * else - * need a rotation to fix the balance. - * If the rotation doesn't change the height - * of the sub-tree we have finished adjusting. - */ - if (new_balance == 0) - AVL_SETBALANCE(node, new_balance); - else if (!avl_rotation(tree, node, new_balance)) - break; - } while (parent != NULL); -} - -#define AVL_REINSERT(tree, obj) \ - avl_remove((tree), (obj)); \ - avl_add((tree), (obj)) - -boolean_t -avl_update_lt(avl_tree_t *t, void *obj) -{ - void *neighbor; - - ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || - (t->avl_compar(obj, neighbor) <= 0)); - - neighbor = AVL_PREV(t, obj); - if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { - AVL_REINSERT(t, obj); - return (B_TRUE); - } - - return (B_FALSE); -} - -boolean_t -avl_update_gt(avl_tree_t *t, void *obj) -{ - void *neighbor; - - ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || - (t->avl_compar(obj, neighbor) >= 0)); - - neighbor = AVL_NEXT(t, obj); - if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { - AVL_REINSERT(t, obj); - return (B_TRUE); - } - - return (B_FALSE); -} - -boolean_t -avl_update(avl_tree_t *t, void *obj) -{ - void *neighbor; - - neighbor = AVL_PREV(t, obj); - if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { - AVL_REINSERT(t, obj); - return (B_TRUE); - } - - neighbor = AVL_NEXT(t, obj); - if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { - AVL_REINSERT(t, obj); - return (B_TRUE); - } - - return (B_FALSE); -} - -void -avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) -{ - avl_node_t *temp_node; - ulong_t temp_numnodes; - - ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); - ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); - ASSERT3U(tree1->avl_size, ==, tree2->avl_size); - - temp_node = tree1->avl_root; - temp_numnodes = tree1->avl_numnodes; - tree1->avl_root = tree2->avl_root; - tree1->avl_numnodes = tree2->avl_numnodes; - tree2->avl_root = temp_node; - tree2->avl_numnodes = temp_numnodes; -} - -/* - * initialize a new AVL tree - */ -void -avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), - size_t size, size_t offset) -{ - ASSERT(tree); - ASSERT(compar); - ASSERT(size > 0); - ASSERT(size >= offset + sizeof (avl_node_t)); -#ifdef _LP64 - ASSERT((offset & 0x7) == 0); -#endif - - tree->avl_compar = compar; - tree->avl_root = NULL; - tree->avl_numnodes = 0; - tree->avl_size = size; - tree->avl_offset = offset; -} - -/* - * Delete a tree. - */ -/* ARGSUSED */ -void -avl_destroy(avl_tree_t *tree) -{ - ASSERT(tree); - ASSERT(tree->avl_numnodes == 0); - ASSERT(tree->avl_root == NULL); -} - - -/* - * Return the number of nodes in an AVL tree. - */ -ulong_t -avl_numnodes(avl_tree_t *tree) -{ - ASSERT(tree); - return (tree->avl_numnodes); -} - -boolean_t -avl_is_empty(avl_tree_t *tree) -{ - ASSERT(tree); - return (tree->avl_numnodes == 0); -} - -#define CHILDBIT (1L) - -/* - * Post-order tree walk used to visit all tree nodes and destroy the tree - * in post order. This is used for destroying a tree without paying any cost - * for rebalancing it. - * - * example: - * - * void *cookie = NULL; - * my_data_t *node; - * - * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) - * free(node); - * avl_destroy(tree); - * - * The cookie is really an avl_node_t to the current node's parent and - * an indication of which child you looked at last. 
- * - * On input, a cookie value of CHILDBIT indicates the tree is done. - */ -void * -avl_destroy_nodes(avl_tree_t *tree, void **cookie) -{ - avl_node_t *node; - avl_node_t *parent; - int child; - void *first; - size_t off = tree->avl_offset; - - /* - * Initial calls go to the first node or it's right descendant. - */ - if (*cookie == NULL) { - first = avl_first(tree); - - /* - * deal with an empty tree - */ - if (first == NULL) { - *cookie = (void *)CHILDBIT; - return (NULL); - } - - node = AVL_DATA2NODE(first, off); - parent = AVL_XPARENT(node); - goto check_right_side; - } - - /* - * If there is no parent to return to we are done. - */ - parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT); - if (parent == NULL) { - if (tree->avl_root != NULL) { - ASSERT(tree->avl_numnodes == 1); - tree->avl_root = NULL; - tree->avl_numnodes = 0; - } - return (NULL); - } - - /* - * Remove the child pointer we just visited from the parent and tree. - */ - child = (uintptr_t)(*cookie) & CHILDBIT; - parent->avl_child[child] = NULL; - ASSERT(tree->avl_numnodes > 1); - --tree->avl_numnodes; - - /* - * If we just did a right child or there isn't one, go up to parent. - */ - if (child == 1 || parent->avl_child[1] == NULL) { - node = parent; - parent = AVL_XPARENT(parent); - goto done; - } - - /* - * Do parent's right child, then leftmost descendent. - */ - node = parent->avl_child[1]; - while (node->avl_child[0] != NULL) { - parent = node; - node = node->avl_child[0]; - } - - /* - * If here, we moved to a left child. It may have one - * child on the right (when balance == +1). - */ -check_right_side: - if (node->avl_child[1] != NULL) { - ASSERT(AVL_XBALANCE(node) == 1); - parent = node; - node = node->avl_child[1]; - ASSERT(node->avl_child[0] == NULL && - node->avl_child[1] == NULL); - } else { - ASSERT(AVL_XBALANCE(node) <= 0); - } - -done: - if (parent == NULL) { - *cookie = (void *)CHILDBIT; - ASSERT(node == tree->avl_root); - } else { - *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node)); - } - - return (AVL_NODE2DATA(node, off)); -} Index: sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c =================================================================== --- sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c +++ /dev/null @@ -1,512 +0,0 @@ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#include -#ifndef _KERNEL -#include -#else -#include -#include -#include -#include -#endif - -/* - * "Force" nvlist wrapper. - * - * These functions wrap the nvlist_* functions with assertions that assume - * the operation is successful. 
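A sketch of the difference this makes for callers, with "guid" as an illustrative property name: the checked interface returns an error and fills an out parameter, while the fnvlist_* wrapper asserts success and returns the value directly.

static uint64_t
get_guid_checked(nvlist_t *nvl)
{
	uint64_t guid = 0;

	if (nvlist_lookup_uint64(nvl, "guid", &guid) != 0)
		return (0);		/* caller decides how to recover */
	return (guid);
}

static uint64_t
get_guid_forced(nvlist_t *nvl)
{
	return (fnvlist_lookup_uint64(nvl, "guid"));	/* VERIFYs success */
}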
This allows the caller's code to be much - * more readable, especially for the fnvlist_lookup_* and fnvpair_value_* - * functions, which can return the requested value (rather than filling in - * a pointer). - * - * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate - * with KM_SLEEP. - * - * More wrappers should be added as needed -- for example - * nvlist_lookup_*_array and nvpair_value_*_array. - */ - -nvlist_t * -fnvlist_alloc(void) -{ - nvlist_t *nvl; - VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)); - return (nvl); -} - -void -fnvlist_free(nvlist_t *nvl) -{ - nvlist_free(nvl); -} - -size_t -fnvlist_size(nvlist_t *nvl) -{ - size_t size; - VERIFY0(nvlist_size(nvl, &size, NV_ENCODE_NATIVE)); - return (size); -} - -/* - * Returns allocated buffer of size *sizep. Caller must free the buffer with - * fnvlist_pack_free(). - */ -char * -fnvlist_pack(nvlist_t *nvl, size_t *sizep) -{ - char *packed = 0; - VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, - KM_SLEEP), ==, 0); - return (packed); -} - -/*ARGSUSED*/ -void -fnvlist_pack_free(char *pack, size_t size) -{ -#ifdef _KERNEL - kmem_free(pack, size); -#else - free(pack); -#endif -} - -nvlist_t * -fnvlist_unpack(char *buf, size_t buflen) -{ - nvlist_t *rv; - VERIFY0(nvlist_unpack(buf, buflen, &rv, KM_SLEEP)); - return (rv); -} - -nvlist_t * -fnvlist_dup(nvlist_t *nvl) -{ - nvlist_t *rv; - VERIFY0(nvlist_dup(nvl, &rv, KM_SLEEP)); - return (rv); -} - -void -fnvlist_merge(nvlist_t *dst, nvlist_t *src) -{ - VERIFY0(nvlist_merge(dst, src, KM_SLEEP)); -} - -size_t -fnvlist_num_pairs(nvlist_t *nvl) -{ - size_t count = 0; - nvpair_t *pair; - - for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL; - pair = nvlist_next_nvpair(nvl, pair)) - count++; - return (count); -} - -void -fnvlist_add_boolean(nvlist_t *nvl, const char *name) -{ - VERIFY0(nvlist_add_boolean(nvl, name)); -} - -void -fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) -{ - VERIFY0(nvlist_add_boolean_value(nvl, name, val)); -} - -void -fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) -{ - VERIFY0(nvlist_add_byte(nvl, name, val)); -} - -void -fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) -{ - VERIFY0(nvlist_add_int8(nvl, name, val)); -} - -void -fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) -{ - VERIFY0(nvlist_add_uint8(nvl, name, val)); -} - -void -fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) -{ - VERIFY0(nvlist_add_int16(nvl, name, val)); -} - -void -fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) -{ - VERIFY0(nvlist_add_uint16(nvl, name, val)); -} - -void -fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) -{ - VERIFY0(nvlist_add_int32(nvl, name, val)); -} - -void -fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) -{ - VERIFY0(nvlist_add_uint32(nvl, name, val)); -} - -void -fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) -{ - VERIFY0(nvlist_add_int64(nvl, name, val)); -} - -void -fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) -{ - VERIFY0(nvlist_add_uint64(nvl, name, val)); -} - -void -fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val) -{ - VERIFY0(nvlist_add_string(nvl, name, val)); -} - -void -fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) -{ - VERIFY0(nvlist_add_nvlist(nvl, name, val)); -} - -void -fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair) -{ - VERIFY0(nvlist_add_nvpair(nvl, pair)); -} - -void 
-fnvlist_add_boolean_array(nvlist_t *nvl, const char *name, - boolean_t *val, uint_t n) -{ - VERIFY0(nvlist_add_boolean_array(nvl, name, val, n)); -} - -void -fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n) -{ - VERIFY0(nvlist_add_byte_array(nvl, name, val, n)); -} - -void -fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n) -{ - VERIFY0(nvlist_add_int8_array(nvl, name, val, n)); -} - -void -fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n) -{ - VERIFY0(nvlist_add_uint8_array(nvl, name, val, n)); -} - -void -fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n) -{ - VERIFY0(nvlist_add_int16_array(nvl, name, val, n)); -} - -void -fnvlist_add_uint16_array(nvlist_t *nvl, const char *name, - uint16_t *val, uint_t n) -{ - VERIFY0(nvlist_add_uint16_array(nvl, name, val, n)); -} - -void -fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n) -{ - VERIFY0(nvlist_add_int32_array(nvl, name, val, n)); -} - -void -fnvlist_add_uint32_array(nvlist_t *nvl, const char *name, - uint32_t *val, uint_t n) -{ - VERIFY0(nvlist_add_uint32_array(nvl, name, val, n)); -} - -void -fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n) -{ - VERIFY0(nvlist_add_int64_array(nvl, name, val, n)); -} - -void -fnvlist_add_uint64_array(nvlist_t *nvl, const char *name, - uint64_t *val, uint_t n) -{ - VERIFY0(nvlist_add_uint64_array(nvl, name, val, n)); -} - -void -fnvlist_add_string_array(nvlist_t *nvl, const char *name, - char * const *val, uint_t n) -{ - VERIFY0(nvlist_add_string_array(nvl, name, val, n)); -} - -void -fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name, - nvlist_t **val, uint_t n) -{ - VERIFY0(nvlist_add_nvlist_array(nvl, name, val, n)); -} - -void -fnvlist_remove(nvlist_t *nvl, const char *name) -{ - VERIFY0(nvlist_remove_all(nvl, name)); -} - -void -fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair) -{ - VERIFY0(nvlist_remove_nvpair(nvl, pair)); -} - -nvpair_t * -fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name) -{ - nvpair_t *rv; - VERIFY0(nvlist_lookup_nvpair(nvl, name, &rv)); - return (rv); -} - -/* returns B_TRUE if the entry exists */ -boolean_t -fnvlist_lookup_boolean(nvlist_t *nvl, const char *name) -{ - return (nvlist_lookup_boolean(nvl, name) == 0); -} - -boolean_t -fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name) -{ - boolean_t rv; - VERIFY0(nvlist_lookup_boolean_value(nvl, name, &rv)); - return (rv); -} - -uchar_t -fnvlist_lookup_byte(nvlist_t *nvl, const char *name) -{ - uchar_t rv; - VERIFY0(nvlist_lookup_byte(nvl, name, &rv)); - return (rv); -} - -int8_t -fnvlist_lookup_int8(nvlist_t *nvl, const char *name) -{ - int8_t rv; - VERIFY0(nvlist_lookup_int8(nvl, name, &rv)); - return (rv); -} - -int16_t -fnvlist_lookup_int16(nvlist_t *nvl, const char *name) -{ - int16_t rv; - VERIFY0(nvlist_lookup_int16(nvl, name, &rv)); - return (rv); -} - -int32_t -fnvlist_lookup_int32(nvlist_t *nvl, const char *name) -{ - int32_t rv; - VERIFY0(nvlist_lookup_int32(nvl, name, &rv)); - return (rv); -} - -int64_t -fnvlist_lookup_int64(nvlist_t *nvl, const char *name) -{ - int64_t rv; - VERIFY0(nvlist_lookup_int64(nvl, name, &rv)); - return (rv); -} - -uint8_t -fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name) -{ - uint8_t rv; - VERIFY0(nvlist_lookup_uint8(nvl, name, &rv)); - return (rv); -} - -uint16_t -fnvlist_lookup_uint16(nvlist_t *nvl, const char *name) -{ - uint16_t rv; - VERIFY0(nvlist_lookup_uint16(nvl, name, 
&rv)); - return (rv); -} - -uint32_t -fnvlist_lookup_uint32(nvlist_t *nvl, const char *name) -{ - uint32_t rv; - VERIFY0(nvlist_lookup_uint32(nvl, name, &rv)); - return (rv); -} - -uint64_t -fnvlist_lookup_uint64(nvlist_t *nvl, const char *name) -{ - uint64_t rv; - VERIFY0(nvlist_lookup_uint64(nvl, name, &rv)); - return (rv); -} - -char * -fnvlist_lookup_string(nvlist_t *nvl, const char *name) -{ - char *rv; - VERIFY0(nvlist_lookup_string(nvl, name, &rv)); - return (rv); -} - -nvlist_t * -fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name) -{ - nvlist_t *rv; - VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv)); - return (rv); -} - -boolean_t -fnvpair_value_boolean_value(nvpair_t *nvp) -{ - boolean_t rv; - VERIFY0(nvpair_value_boolean_value(nvp, &rv)); - return (rv); -} - -uchar_t -fnvpair_value_byte(nvpair_t *nvp) -{ - uchar_t rv; - VERIFY0(nvpair_value_byte(nvp, &rv)); - return (rv); -} - -int8_t -fnvpair_value_int8(nvpair_t *nvp) -{ - int8_t rv; - VERIFY0(nvpair_value_int8(nvp, &rv)); - return (rv); -} - -int16_t -fnvpair_value_int16(nvpair_t *nvp) -{ - int16_t rv; - VERIFY0(nvpair_value_int16(nvp, &rv)); - return (rv); -} - -int32_t -fnvpair_value_int32(nvpair_t *nvp) -{ - int32_t rv; - VERIFY0(nvpair_value_int32(nvp, &rv)); - return (rv); -} - -int64_t -fnvpair_value_int64(nvpair_t *nvp) -{ - int64_t rv; - VERIFY0(nvpair_value_int64(nvp, &rv)); - return (rv); -} - -uint8_t -fnvpair_value_uint8_t(nvpair_t *nvp) -{ - uint8_t rv; - VERIFY0(nvpair_value_uint8(nvp, &rv)); - return (rv); -} - -uint16_t -fnvpair_value_uint16(nvpair_t *nvp) -{ - uint16_t rv; - VERIFY0(nvpair_value_uint16(nvp, &rv)); - return (rv); -} - -uint32_t -fnvpair_value_uint32(nvpair_t *nvp) -{ - uint32_t rv; - VERIFY0(nvpair_value_uint32(nvp, &rv)); - return (rv); -} - -uint64_t -fnvpair_value_uint64(nvpair_t *nvp) -{ - uint64_t rv; - VERIFY0(nvpair_value_uint64(nvp, &rv)); - return (rv); -} - -char * -fnvpair_value_string(nvpair_t *nvp) -{ - char *rv; - VERIFY0(nvpair_value_string(nvp, &rv)); - return (rv); -} - -nvlist_t * -fnvpair_value_nvlist(nvpair_t *nvp) -{ - nvlist_t *rv; - VERIFY0(nvpair_value_nvlist(nvp, &rv)); - return (rv); -} Index: sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c =================================================================== --- sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c +++ /dev/null @@ -1,3600 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 
- */ - -#include -#include -#include -#include -#include - -#if defined(_KERNEL) && !defined(_BOOT) -#include -#include -#else -#include -#include -#include -#include -#endif - -#ifndef offsetof -#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) -#endif -#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++ - -#if defined(__FreeBSD__) && !defined(_KERNEL) -/* - * libnvpair is the lowest commen denominator for ZFS related libraries, - * defining aok here makes it usable by all ZFS related libraries - */ -int aok; -#endif - -/* - * nvpair.c - Provides kernel & userland interfaces for manipulating - * name-value pairs. - * - * Overview Diagram - * - * +--------------+ - * | nvlist_t | - * |--------------| - * | nvl_version | - * | nvl_nvflag | - * | nvl_priv -+-+ - * | nvl_flag | | - * | nvl_pad | | - * +--------------+ | - * V - * +--------------+ last i_nvp in list - * | nvpriv_t | +---------------------> - * |--------------| | - * +--+- nvp_list | | +------------+ - * | | nvp_last -+--+ + nv_alloc_t | - * | | nvp_curr | |------------| - * | | nvp_nva -+----> | nva_ops | - * | | nvp_stat | | nva_arg | - * | +--------------+ +------------+ - * | - * +-------+ - * V - * +---------------------+ +-------------------+ - * | i_nvp_t | +-->| i_nvp_t | +--> - * |---------------------| | |-------------------| | - * | nvi_next -+--+ | nvi_next -+--+ - * | nvi_prev (NULL) | <----+ nvi_prev | - * | . . . . . . . . . . | | . . . . . . . . . | - * | nvp (nvpair_t) | | nvp (nvpair_t) | - * | - nvp_size | | - nvp_size | - * | - nvp_name_sz | | - nvp_name_sz | - * | - nvp_value_elem | | - nvp_value_elem | - * | - nvp_type | | - nvp_type | - * | - data ... | | - data ... | - * +---------------------+ +-------------------+ - * - * - * - * +---------------------+ +---------------------+ - * | i_nvp_t | +--> +-->| i_nvp_t (last) | - * |---------------------| | | |---------------------| - * | nvi_next -+--+ ... --+ | nvi_next (NULL) | - * <-+- nvi_prev |<-- ... <----+ nvi_prev | - * | . . . . . . . . . | | . . . . . . . . . | - * | nvp (nvpair_t) | | nvp (nvpair_t) | - * | - nvp_size | | - nvp_size | - * | - nvp_name_sz | | - nvp_name_sz | - * | - nvp_value_elem | | - nvp_value_elem | - * | - DATA_TYPE_NVLIST | | - nvp_type | - * | - data (embedded) | | - data ... | - * | nvlist name | +---------------------+ - * | +--------------+ | - * | | nvlist_t | | - * | |--------------| | - * | | nvl_version | | - * | | nvl_nvflag | | - * | | nvl_priv --+---+----> - * | | nvl_flag | | - * | | nvl_pad | | - * | +--------------+ | - * +---------------------+ - * - * - * N.B. 
nvpair_t may be aligned on 4 byte boundary, so +4 will - * allow value to be aligned on 8 byte boundary - * - * name_len is the length of the name string including the null terminator - * so it must be >= 1 - */ -#define NVP_SIZE_CALC(name_len, data_len) \ - (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len)) - -static int i_get_value_size(data_type_t type, const void *data, uint_t nelem); -static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, - uint_t nelem, const void *data); - -#define NV_STAT_EMBEDDED 0x1 -#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp)) -#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp)) - -#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz)) -#define NVPAIR2I_NVP(nvp) \ - ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp))) - -#ifdef _KERNEL -int nvpair_max_recursion = 20; -#else -int nvpair_max_recursion = 100; -#endif - -uint64_t nvlist_hashtable_init_size = (1 << 4); - -int -nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...) -{ - va_list valist; - int err = 0; - - nva->nva_ops = nvo; - nva->nva_arg = NULL; - - va_start(valist, nvo); - if (nva->nva_ops->nv_ao_init != NULL) - err = nva->nva_ops->nv_ao_init(nva, valist); - va_end(valist); - - return (err); -} - -void -nv_alloc_reset(nv_alloc_t *nva) -{ - if (nva->nva_ops->nv_ao_reset != NULL) - nva->nva_ops->nv_ao_reset(nva); -} - -void -nv_alloc_fini(nv_alloc_t *nva) -{ - if (nva->nva_ops->nv_ao_fini != NULL) - nva->nva_ops->nv_ao_fini(nva); -} - -nv_alloc_t * -nvlist_lookup_nv_alloc(nvlist_t *nvl) -{ - nvpriv_t *priv; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (NULL); - - return (priv->nvp_nva); -} - -static void * -nv_mem_zalloc(nvpriv_t *nvp, size_t size) -{ - nv_alloc_t *nva = nvp->nvp_nva; - void *buf; - - if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL) - bzero(buf, size); - - return (buf); -} - -static void -nv_mem_free(nvpriv_t *nvp, void *buf, size_t size) -{ - nv_alloc_t *nva = nvp->nvp_nva; - - nva->nva_ops->nv_ao_free(nva, buf, size); -} - -static void -nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat) -{ - bzero(priv, sizeof (nvpriv_t)); - - priv->nvp_nva = nva; - priv->nvp_stat = stat; -} - -static nvpriv_t * -nv_priv_alloc(nv_alloc_t *nva) -{ - nvpriv_t *priv; - - /* - * nv_mem_alloc() cannot called here because it needs the priv - * argument. - */ - if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL) - return (NULL); - - nv_priv_init(priv, nva, 0); - - return (priv); -} - -/* - * Embedded lists need their own nvpriv_t's. We create a new - * nvpriv_t using the parameters and allocator from the parent - * list's nvpriv_t. 
- */ -static nvpriv_t * -nv_priv_alloc_embedded(nvpriv_t *priv) -{ - nvpriv_t *emb_priv; - - if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL) - return (NULL); - - nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED); - - return (emb_priv); -} - -static int -nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets) -{ - ASSERT3P(priv->nvp_hashtable, ==, NULL); - ASSERT0(priv->nvp_nbuckets); - ASSERT0(priv->nvp_nentries); - - i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *)); - if (tab == NULL) - return (ENOMEM); - - priv->nvp_hashtable = tab; - priv->nvp_nbuckets = buckets; - return (0); -} - -static void -nvt_tab_free(nvpriv_t *priv) -{ - i_nvp_t **tab = priv->nvp_hashtable; - if (tab == NULL) { - ASSERT0(priv->nvp_nbuckets); - ASSERT0(priv->nvp_nentries); - return; - } - - nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *)); - - priv->nvp_hashtable = NULL; - priv->nvp_nbuckets = 0; - priv->nvp_nentries = 0; -} - -static uint32_t -nvt_hash(const char *p) -{ - uint32_t g, hval = 0; - - while (*p) { - hval = (hval << 4) + *p++; - if ((g = (hval & 0xf0000000)) != 0) - hval ^= g >> 24; - hval &= ~g; - } - return (hval); -} - -static boolean_t -nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag) -{ - boolean_t match = B_FALSE; - if (nvflag & NV_UNIQUE_NAME_TYPE) { - if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 && - NVP_TYPE(nvp1) == NVP_TYPE(nvp2)) - match = B_TRUE; - } else { - ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME); - if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0) - match = B_TRUE; - } - return (match); -} - -static nvpair_t * -nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - ASSERT(priv != NULL); - - i_nvp_t **tab = priv->nvp_hashtable; - - if (tab == NULL) { - ASSERT3P(priv->nvp_list, ==, NULL); - ASSERT0(priv->nvp_nbuckets); - ASSERT0(priv->nvp_nentries); - return (NULL); - } else { - ASSERT(priv->nvp_nbuckets != 0); - } - - uint64_t hash = nvt_hash(name); - uint64_t index = hash & (priv->nvp_nbuckets - 1); - - ASSERT3U(index, <, priv->nvp_nbuckets); - i_nvp_t *entry = tab[index]; - - for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) { - if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 && - (type == DATA_TYPE_DONTCARE || - NVP_TYPE(&e->nvi_nvp) == type)) - return (&e->nvi_nvp); - } - return (NULL); -} - -static nvpair_t * -nvt_lookup_name(nvlist_t *nvl, const char *name) -{ - return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE)); -} - -static int -nvt_resize(nvpriv_t *priv, uint32_t new_size) -{ - i_nvp_t **tab = priv->nvp_hashtable; - - /* - * Migrate all the entries from the current table - * to a newly-allocated table with the new size by - * re-adjusting the pointers of their entries. 
- */ - uint32_t size = priv->nvp_nbuckets; - uint32_t new_mask = new_size - 1; - ASSERT(((new_size) & ((new_size) - 1)) == 0); - - i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *)); - if (new_tab == NULL) - return (ENOMEM); - - uint32_t nentries = 0; - for (uint32_t i = 0; i < size; i++) { - i_nvp_t *next, *e = tab[i]; - - while (e != NULL) { - next = e->nvi_hashtable_next; - - uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp)); - uint32_t index = hash & new_mask; - - e->nvi_hashtable_next = new_tab[index]; - new_tab[index] = e; - nentries++; - - e = next; - } - tab[i] = NULL; - } - ASSERT3U(nentries, ==, priv->nvp_nentries); - - nvt_tab_free(priv); - - priv->nvp_hashtable = new_tab; - priv->nvp_nbuckets = new_size; - priv->nvp_nentries = nentries; - - return (0); -} - -static boolean_t -nvt_needs_togrow(nvpriv_t *priv) -{ - /* - * Grow only when we have more elements than buckets - * and the # of buckets doesn't overflow. - */ - return (priv->nvp_nentries > priv->nvp_nbuckets && - (UINT32_MAX >> 1) >= priv->nvp_nbuckets); -} - -/* - * Allocate a new table that's twice the size of the old one, - * and migrate all the entries from the old one to the new - * one by re-adjusting their pointers. - */ -static int -nvt_grow(nvpriv_t *priv) -{ - uint32_t current_size = priv->nvp_nbuckets; - /* ensure we won't overflow */ - ASSERT3U(UINT32_MAX >> 1, >=, current_size); - return (nvt_resize(priv, current_size << 1)); -} - -static boolean_t -nvt_needs_toshrink(nvpriv_t *priv) -{ - /* - * Shrink only when the # of elements is less than or - * equal to 1/4 the # of buckets. Never shrink less than - * nvlist_hashtable_init_size. - */ - ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size); - if (priv->nvp_nbuckets == nvlist_hashtable_init_size) - return (B_FALSE); - return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2)); -} - -/* - * Allocate a new table that's half the size of the old one, - * and migrate all the entries from the old one to the new - * one by re-adjusting their pointers. - */ -static int -nvt_shrink(nvpriv_t *priv) -{ - uint32_t current_size = priv->nvp_nbuckets; - /* ensure we won't overflow */ - ASSERT3U(current_size, >=, nvlist_hashtable_init_size); - return (nvt_resize(priv, current_size >> 1)); -} - -static int -nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - - if (nvt_needs_toshrink(priv)) { - int err = nvt_shrink(priv); - if (err != 0) - return (err); - } - i_nvp_t **tab = priv->nvp_hashtable; - - char *name = NVP_NAME(nvp); - uint64_t hash = nvt_hash(name); - uint64_t index = hash & (priv->nvp_nbuckets - 1); - - ASSERT3U(index, <, priv->nvp_nbuckets); - i_nvp_t *bucket = tab[index]; - - for (i_nvp_t *prev = NULL, *e = bucket; - e != NULL; prev = e, e = e->nvi_hashtable_next) { - if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_flag)) { - if (prev != NULL) { - prev->nvi_hashtable_next = - e->nvi_hashtable_next; - } else { - ASSERT3P(e, ==, bucket); - tab[index] = e->nvi_hashtable_next; - } - e->nvi_hashtable_next = NULL; - priv->nvp_nentries--; - break; - } - } - - return (0); -} - -static int -nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - - /* initialize nvpair table now if it doesn't exist. 
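Worked numbers for the grow and shrink thresholds above, assuming the default nvlist_hashtable_init_size of 16 buckets:

	once the list holds more pairs than buckets (17 pairs in 16 buckets),
	the next nvt_add_nvpair() calls nvt_grow():      16 -> 32 buckets

	once the count has fallen back to 8 (one quarter of 32),
	the next nvt_remove_nvpair() calls nvt_shrink(): 32 -> 16 buckets

	the table never shrinks below the initial 16 buckets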
*/ - if (priv->nvp_hashtable == NULL) { - int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size); - if (err != 0) - return (err); - } - - /* - * if we don't allow duplicate entries, make sure to - * unlink any existing entries from the table. - */ - if (nvl->nvl_nvflag != 0) { - int err = nvt_remove_nvpair(nvl, nvp); - if (err != 0) - return (err); - } - - if (nvt_needs_togrow(priv)) { - int err = nvt_grow(priv); - if (err != 0) - return (err); - } - i_nvp_t **tab = priv->nvp_hashtable; - - char *name = NVP_NAME(nvp); - uint64_t hash = nvt_hash(name); - uint64_t index = hash & (priv->nvp_nbuckets - 1); - - ASSERT3U(index, <, priv->nvp_nbuckets); - i_nvp_t *bucket = tab[index]; - - /* insert link at the beginning of the bucket */ - i_nvp_t *new_entry = NVPAIR2I_NVP(nvp); - ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL); - new_entry->nvi_hashtable_next = bucket; - tab[index] = new_entry; - - priv->nvp_nentries++; - return (0); -} - -static void -nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv) -{ - nvl->nvl_version = NV_VERSION; - nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE); - nvl->nvl_priv = (uint64_t)(uintptr_t)priv; - nvl->nvl_flag = 0; - nvl->nvl_pad = 0; -} - -uint_t -nvlist_nvflag(nvlist_t *nvl) -{ - return (nvl->nvl_nvflag); -} - -/* - * nvlist_alloc - Allocate nvlist. - */ -/*ARGSUSED1*/ -int -nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) -{ -#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xalloc(nvlp, nvflag, - (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); -#else - return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep)); -#endif -} - -int -nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva) -{ - nvpriv_t *priv; - - if (nvlp == NULL || nva == NULL) - return (EINVAL); - - if ((priv = nv_priv_alloc(nva)) == NULL) - return (ENOMEM); - - if ((*nvlp = nv_mem_zalloc(priv, - NV_ALIGN(sizeof (nvlist_t)))) == NULL) { - nv_mem_free(priv, priv, sizeof (nvpriv_t)); - return (ENOMEM); - } - - nvlist_init(*nvlp, nvflag, priv); - - return (0); -} - -/* - * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair. - */ -static nvpair_t * -nvp_buf_alloc(nvlist_t *nvl, size_t len) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *buf; - nvpair_t *nvp; - size_t nvsize; - - /* - * Allocate the buffer - */ - nvsize = len + offsetof(i_nvp_t, nvi_nvp); - - if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL) - return (NULL); - - nvp = &buf->nvi_nvp; - nvp->nvp_size = len; - - return (nvp); -} - -/* - * nvp_buf_free - de-Allocate an i_nvp_t. - */ -static void -nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp); - - nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize); -} - -/* - * nvp_buf_link - link a new nv pair into the nvlist. - */ -static void -nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr = NVPAIR2I_NVP(nvp); - - /* Put element at end of nvlist */ - if (priv->nvp_list == NULL) { - priv->nvp_list = priv->nvp_last = curr; - } else { - curr->nvi_prev = priv->nvp_last; - priv->nvp_last->nvi_next = curr; - priv->nvp_last = curr; - } -} - -/* - * nvp_buf_unlink - unlink an removed nvpair out of the nvlist. 
- */ -static void -nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr = NVPAIR2I_NVP(nvp); - - /* - * protect nvlist_next_nvpair() against walking on freed memory. - */ - if (priv->nvp_curr == curr) - priv->nvp_curr = curr->nvi_next; - - if (curr == priv->nvp_list) - priv->nvp_list = curr->nvi_next; - else - curr->nvi_prev->nvi_next = curr->nvi_next; - - if (curr == priv->nvp_last) - priv->nvp_last = curr->nvi_prev; - else - curr->nvi_next->nvi_prev = curr->nvi_prev; -} - -/* - * take a nvpair type and number of elements and make sure the are valid - */ -static int -i_validate_type_nelem(data_type_t type, uint_t nelem) -{ - switch (type) { - case DATA_TYPE_BOOLEAN: - if (nelem != 0) - return (EINVAL); - break; - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - case DATA_TYPE_INT16: - case DATA_TYPE_UINT16: - case DATA_TYPE_INT32: - case DATA_TYPE_UINT32: - case DATA_TYPE_INT64: - case DATA_TYPE_UINT64: - case DATA_TYPE_STRING: - case DATA_TYPE_HRTIME: - case DATA_TYPE_NVLIST: -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: -#endif - if (nelem != 1) - return (EINVAL); - break; - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_BYTE_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - case DATA_TYPE_STRING_ARRAY: - case DATA_TYPE_NVLIST_ARRAY: - /* we allow arrays with 0 elements */ - break; - default: - return (EINVAL); - } - return (0); -} - -/* - * Verify nvp_name_sz and check the name string length. - */ -static int -i_validate_nvpair_name(nvpair_t *nvp) -{ - if ((nvp->nvp_name_sz <= 0) || - (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0))) - return (EFAULT); - - /* verify the name string, make sure its terminated */ - if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0') - return (EFAULT); - - return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT); -} - -static int -i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data) -{ - switch (type) { - case DATA_TYPE_BOOLEAN_VALUE: - if (*(boolean_t *)data != B_TRUE && - *(boolean_t *)data != B_FALSE) - return (EINVAL); - break; - case DATA_TYPE_BOOLEAN_ARRAY: { - int i; - - for (i = 0; i < nelem; i++) - if (((boolean_t *)data)[i] != B_TRUE && - ((boolean_t *)data)[i] != B_FALSE) - return (EINVAL); - break; - } - default: - break; - } - - return (0); -} - -/* - * This function takes a pointer to what should be a nvpair and it's size - * and then verifies that all the nvpair fields make sense and can be - * trusted. This function is used when decoding packed nvpairs. - */ -static int -i_validate_nvpair(nvpair_t *nvp) -{ - data_type_t type = NVP_TYPE(nvp); - int size1, size2; - - /* verify nvp_name_sz, check the name string length */ - if (i_validate_nvpair_name(nvp) != 0) - return (EFAULT); - - if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0) - return (EFAULT); - - /* - * verify nvp_type, nvp_value_elem, and also possibly - * verify string values and get the value size. 
- */ - size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp)); - size1 = nvp->nvp_size - NVP_VALOFF(nvp); - if (size2 < 0 || size1 != NV_ALIGN(size2)) - return (EFAULT); - - return (0); -} - -static int -nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl) -{ - nvpriv_t *priv; - i_nvp_t *curr; - - if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL) - return (EINVAL); - - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { - nvpair_t *nvp = &curr->nvi_nvp; - int err; - - if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp), - NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0) - return (err); - } - - return (0); -} - -/* - * Frees all memory allocated for an nvpair (like embedded lists) with - * the exception of the nvpair buffer itself. - */ -static void -nvpair_free(nvpair_t *nvp) -{ - switch (NVP_TYPE(nvp)) { - case DATA_TYPE_NVLIST: - nvlist_free(EMBEDDED_NVL(nvp)); - break; - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); - int i; - - for (i = 0; i < NVP_NELEM(nvp); i++) - nvlist_free(nvlp[i]); - break; - } - default: - break; - } -} - -/* - * nvlist_free - free an unpacked nvlist - */ -void -nvlist_free(nvlist_t *nvl) -{ - nvpriv_t *priv; - i_nvp_t *curr; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return; - - /* - * Unpacked nvlist are linked through i_nvp_t - */ - curr = priv->nvp_list; - while (curr != NULL) { - nvpair_t *nvp = &curr->nvi_nvp; - curr = curr->nvi_next; - - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - } - - if (!(priv->nvp_stat & NV_STAT_EMBEDDED)) - nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t))); - else - nvl->nvl_priv = 0; - - nvt_tab_free(priv); - nv_mem_free(priv, priv, sizeof (nvpriv_t)); -} - -static int -nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr; - - if (nvp == NULL) - return (0); - - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) - if (&curr->nvi_nvp == nvp) - return (1); - - return (0); -} - -/* - * Make a copy of nvlist - */ -/*ARGSUSED1*/ -int -nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag) -{ -#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xdup(nvl, nvlp, - (kmflag == KM_SLEEP ? 
nv_alloc_sleep : nv_alloc_nosleep))); -#else - return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep)); -#endif -} - -int -nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva) -{ - int err; - nvlist_t *ret; - - if (nvl == NULL || nvlp == NULL) - return (EINVAL); - - if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0) - return (err); - - if ((err = nvlist_copy_pairs(nvl, ret)) != 0) - nvlist_free(ret); - else - *nvlp = ret; - - return (err); -} - -/* - * Remove all with matching name - */ -int -nvlist_remove_all(nvlist_t *nvl, const char *name) -{ - int error = ENOENT; - - if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) - return (EINVAL); - - nvpair_t *nvp; - while ((nvp = nvt_lookup_name(nvl, name)) != NULL) { - VERIFY0(nvlist_remove_nvpair(nvl, nvp)); - error = 0; - } - - return (error); -} - -/* - * Remove first one with matching name and type - */ -int -nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) -{ - if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) - return (EINVAL); - - nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); - if (nvp == NULL) - return (ENOENT); - - return (nvlist_remove_nvpair(nvl, nvp)); -} - -int -nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - if (nvl == NULL || nvp == NULL) - return (EINVAL); - - int err = nvt_remove_nvpair(nvl, nvp); - if (err != 0) - return (err); - - nvp_buf_unlink(nvl, nvp); - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (0); -} - -/* - * This function calculates the size of an nvpair value. - * - * The data argument controls the behavior in case of the data types - * DATA_TYPE_STRING and - * DATA_TYPE_STRING_ARRAY - * Is data == NULL then the size of the string(s) is excluded. - */ -static int -i_get_value_size(data_type_t type, const void *data, uint_t nelem) -{ - uint64_t value_sz; - - if (i_validate_type_nelem(type, nelem) != 0) - return (-1); - - /* Calculate required size for holding value */ - switch (type) { - case DATA_TYPE_BOOLEAN: - value_sz = 0; - break; - case DATA_TYPE_BOOLEAN_VALUE: - value_sz = sizeof (boolean_t); - break; - case DATA_TYPE_BYTE: - value_sz = sizeof (uchar_t); - break; - case DATA_TYPE_INT8: - value_sz = sizeof (int8_t); - break; - case DATA_TYPE_UINT8: - value_sz = sizeof (uint8_t); - break; - case DATA_TYPE_INT16: - value_sz = sizeof (int16_t); - break; - case DATA_TYPE_UINT16: - value_sz = sizeof (uint16_t); - break; - case DATA_TYPE_INT32: - value_sz = sizeof (int32_t); - break; - case DATA_TYPE_UINT32: - value_sz = sizeof (uint32_t); - break; - case DATA_TYPE_INT64: - value_sz = sizeof (int64_t); - break; - case DATA_TYPE_UINT64: - value_sz = sizeof (uint64_t); - break; -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: - value_sz = sizeof (double); - break; -#endif - case DATA_TYPE_STRING: - if (data == NULL) - value_sz = 0; - else - value_sz = strlen(data) + 1; - break; - case DATA_TYPE_BOOLEAN_ARRAY: - value_sz = (uint64_t)nelem * sizeof (boolean_t); - break; - case DATA_TYPE_BYTE_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uchar_t); - break; - case DATA_TYPE_INT8_ARRAY: - value_sz = (uint64_t)nelem * sizeof (int8_t); - break; - case DATA_TYPE_UINT8_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint8_t); - break; - case DATA_TYPE_INT16_ARRAY: - value_sz = (uint64_t)nelem * sizeof (int16_t); - break; - case DATA_TYPE_UINT16_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint16_t); - break; - case DATA_TYPE_INT32_ARRAY: - value_sz = (uint64_t)nelem * sizeof (int32_t); - break; - case DATA_TYPE_UINT32_ARRAY: - value_sz = (uint64_t)nelem * sizeof 
(uint32_t); - break; - case DATA_TYPE_INT64_ARRAY: - value_sz = (uint64_t)nelem * sizeof (int64_t); - break; - case DATA_TYPE_UINT64_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint64_t); - break; - case DATA_TYPE_STRING_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint64_t); - - if (data != NULL) { - char *const *strs = data; - uint_t i; - - /* no alignment requirement for strings */ - for (i = 0; i < nelem; i++) { - if (strs[i] == NULL) - return (-1); - value_sz += strlen(strs[i]) + 1; - } - } - break; - case DATA_TYPE_HRTIME: - value_sz = sizeof (hrtime_t); - break; - case DATA_TYPE_NVLIST: - value_sz = NV_ALIGN(sizeof (nvlist_t)); - break; - case DATA_TYPE_NVLIST_ARRAY: - value_sz = (uint64_t)nelem * sizeof (uint64_t) + - (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t)); - break; - default: - return (-1); - } - - return (value_sz > INT32_MAX ? -1 : (int)value_sz); -} - -static int -nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl) -{ - nvpriv_t *priv; - int err; - - if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t) - nvl->nvl_priv)) == NULL) - return (ENOMEM); - - nvlist_init(emb_nvl, onvl->nvl_nvflag, priv); - - if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) { - nvlist_free(emb_nvl); - emb_nvl->nvl_priv = 0; - } - - return (err); -} - -/* - * nvlist_add_common - Add new pair to nvlist - */ -static int -nvlist_add_common(nvlist_t *nvl, const char *name, - data_type_t type, uint_t nelem, const void *data) -{ - nvpair_t *nvp; - uint_t i; - - int nvp_sz, name_sz, value_sz; - int err = 0; - - if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) - return (EINVAL); - - if (nelem != 0 && data == NULL) - return (EINVAL); - - /* - * Verify type and nelem and get the value size. - * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY - * is the size of the string(s) included. - */ - if ((value_sz = i_get_value_size(type, data, nelem)) < 0) - return (EINVAL); - - if (i_validate_nvpair_value(type, nelem, data) != 0) - return (EINVAL); - - /* - * If we're adding an nvlist or nvlist array, ensure that we are not - * adding the input nvlist to itself, which would cause recursion, - * and ensure that no NULL nvlist pointers are present. 
- */ - switch (type) { - case DATA_TYPE_NVLIST: - if (data == nvl || data == NULL) - return (EINVAL); - break; - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **onvlp = (nvlist_t **)data; - for (i = 0; i < nelem; i++) { - if (onvlp[i] == nvl || onvlp[i] == NULL) - return (EINVAL); - } - break; - } - default: - break; - } - - /* calculate sizes of the nvpair elements and the nvpair itself */ - name_sz = strlen(name) + 1; - if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * 8 - 1)) - return (EINVAL); - - nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); - - if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL) - return (ENOMEM); - - ASSERT(nvp->nvp_size == nvp_sz); - nvp->nvp_name_sz = name_sz; - nvp->nvp_value_elem = nelem; - nvp->nvp_type = type; - bcopy(name, NVP_NAME(nvp), name_sz); - - switch (type) { - case DATA_TYPE_BOOLEAN: - break; - case DATA_TYPE_STRING_ARRAY: { - char *const *strs = data; - char *buf = NVP_VALUE(nvp); - char **cstrs = (void *)buf; - - /* skip pre-allocated space for pointer array */ - buf += nelem * sizeof (uint64_t); - for (i = 0; i < nelem; i++) { - int slen = strlen(strs[i]) + 1; - bcopy(strs[i], buf, slen); - cstrs[i] = buf; - buf += slen; - } - break; - } - case DATA_TYPE_NVLIST: { - nvlist_t *nnvl = EMBEDDED_NVL(nvp); - nvlist_t *onvl = (nvlist_t *)data; - - if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) { - nvp_buf_free(nvl, nvp); - return (err); - } - break; - } - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **onvlp = (nvlist_t **)data; - nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); - nvlist_t *embedded = (nvlist_t *) - ((uintptr_t)nvlp + nelem * sizeof (uint64_t)); - - for (i = 0; i < nelem; i++) { - if ((err = nvlist_copy_embedded(nvl, - onvlp[i], embedded)) != 0) { - /* - * Free any successfully created lists - */ - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (err); - } - - nvlp[i] = embedded++; - } - break; - } - default: - bcopy(data, NVP_VALUE(nvp), value_sz); - } - - /* if unique name, remove before add */ - if (nvl->nvl_nvflag & NV_UNIQUE_NAME) - (void) nvlist_remove_all(nvl, name); - else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) - (void) nvlist_remove(nvl, name, type); - - err = nvt_add_nvpair(nvl, nvp); - if (err != 0) { - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (err); - } - nvp_buf_link(nvl, nvp); - - return (0); -} - -int -nvlist_add_boolean(nvlist_t *nvl, const char *name) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL)); -} - -int -nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val)); -} - -int -nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val)); -} - -int -nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val)); -} - -int -nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val)); -} - -int -nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val)); -} - -int -nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val)); -} - -int -nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val)); -} - -int -nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) -{ - 
return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val)); -} - -int -nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val)); -} - -int -nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); -} - -#if !defined(_KERNEL) -int -nvlist_add_double(nvlist_t *nvl, const char *name, double val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val)); -} -#endif - -int -nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val)); -} - -int -nvlist_add_boolean_array(nvlist_t *nvl, const char *name, - boolean_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); -} - -int -nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); -} - -int -nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); -} - -int -nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); -} - -int -nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); -} - -int -nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); -} - -int -nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); -} - -int -nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); -} - -int -nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); -} - -int -nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); -} - -int -nvlist_add_string_array(nvlist_t *nvl, const char *name, - char *const *a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); -} - -int -nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val)); -} - -int -nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); -} - -int -nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n) -{ - return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); -} - -/* reading name-value pairs */ -nvpair_t * -nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv; - i_nvp_t *curr; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (NULL); - - curr = NVPAIR2I_NVP(nvp); - - /* - * Ensure that nvp is a valid nvpair on this nvlist. - * NB: nvp_curr is used only as a hint so that we don't always - * have to walk the list to determine if nvp is still on the list. 
- */ - if (nvp == NULL) - curr = priv->nvp_list; - else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) - curr = curr->nvi_next; - else - curr = NULL; - - priv->nvp_curr = curr; - - return (curr != NULL ? &curr->nvi_nvp : NULL); -} - -nvpair_t * -nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - nvpriv_t *priv; - i_nvp_t *curr; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (NULL); - - curr = NVPAIR2I_NVP(nvp); - - if (nvp == NULL) - curr = priv->nvp_last; - else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) - curr = curr->nvi_prev; - else - curr = NULL; - - priv->nvp_curr = curr; - - return (curr != NULL ? &curr->nvi_nvp : NULL); -} - -boolean_t -nvlist_empty(nvlist_t *nvl) -{ - nvpriv_t *priv; - - if (nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (B_TRUE); - - return (priv->nvp_list == NULL); -} - -char * -nvpair_name(nvpair_t *nvp) -{ - return (NVP_NAME(nvp)); -} - -data_type_t -nvpair_type(nvpair_t *nvp) -{ - return (NVP_TYPE(nvp)); -} - -int -nvpair_type_is_array(nvpair_t *nvp) -{ - data_type_t type = NVP_TYPE(nvp); - - if ((type == DATA_TYPE_BYTE_ARRAY) || - (type == DATA_TYPE_INT8_ARRAY) || - (type == DATA_TYPE_UINT8_ARRAY) || - (type == DATA_TYPE_INT16_ARRAY) || - (type == DATA_TYPE_UINT16_ARRAY) || - (type == DATA_TYPE_INT32_ARRAY) || - (type == DATA_TYPE_UINT32_ARRAY) || - (type == DATA_TYPE_INT64_ARRAY) || - (type == DATA_TYPE_UINT64_ARRAY) || - (type == DATA_TYPE_BOOLEAN_ARRAY) || - (type == DATA_TYPE_STRING_ARRAY) || - (type == DATA_TYPE_NVLIST_ARRAY)) - return (1); - return (0); - -} - -static int -nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) -{ - if (nvp == NULL || nvpair_type(nvp) != type) - return (EINVAL); - - /* - * For non-array types, we copy the data. - * For array types (including string), we set a pointer. 
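For reference, nvlist_next_nvpair() together with nvpair_name(), nvpair_type() and the typed nvpair_value_*() accessors is how callers walk an nvlist; a short sketch (not code from this file):

	#include <stdio.h>
	#include <libnvpair.h>

	/* Walk every pair in an nvlist, printing names and any uint64 values. */
	static void
	walk_example(nvlist_t *nvl)
	{
		nvpair_t *nvp = NULL;

		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
			printf("%s (type %d)\n", nvpair_name(nvp),
			    (int)nvpair_type(nvp));

			if (nvpair_type(nvp) == DATA_TYPE_UINT64) {
				uint64_t v;

				if (nvpair_value_uint64(nvp, &v) == 0)
					printf("  value = %llu\n",
					    (unsigned long long)v);
			}
		}
	}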
- */ - switch (type) { - case DATA_TYPE_BOOLEAN: - if (nelem != NULL) - *nelem = 0; - break; - - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - case DATA_TYPE_INT16: - case DATA_TYPE_UINT16: - case DATA_TYPE_INT32: - case DATA_TYPE_UINT32: - case DATA_TYPE_INT64: - case DATA_TYPE_UINT64: - case DATA_TYPE_HRTIME: -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: -#endif - if (data == NULL) - return (EINVAL); - bcopy(NVP_VALUE(nvp), data, - (size_t)i_get_value_size(type, NULL, 1)); - if (nelem != NULL) - *nelem = 1; - break; - - case DATA_TYPE_NVLIST: - case DATA_TYPE_STRING: - if (data == NULL) - return (EINVAL); - *(void **)data = (void *)NVP_VALUE(nvp); - if (nelem != NULL) - *nelem = 1; - break; - - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_BYTE_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - case DATA_TYPE_STRING_ARRAY: - case DATA_TYPE_NVLIST_ARRAY: - if (nelem == NULL || data == NULL) - return (EINVAL); - if ((*nelem = NVP_NELEM(nvp)) != 0) - *(void **)data = (void *)NVP_VALUE(nvp); - else - *(void **)data = NULL; - break; - - default: - return (ENOTSUP); - } - - return (0); -} - -static int -nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, - uint_t *nelem, void *data) -{ - if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) - return (EINVAL); - - if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) - return (ENOTSUP); - - nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); - if (nvp == NULL) - return (ENOENT); - - return (nvpair_value_common(nvp, type, nelem, data)); -} - -int -nvlist_lookup_boolean(nvlist_t *nvl, const char *name) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL)); -} - -int -nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val) -{ - return (nvlist_lookup_common(nvl, name, - DATA_TYPE_BOOLEAN_VALUE, NULL, val)); -} - -int -nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val)); -} - -int -nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val)); -} - -int -nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val)); -} - -int -nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val)); -} - -int -nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val)); -} - -int -nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val)); -} - -int -nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val)); -} - -int -nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val)); -} - -int -nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); -} - -#if !defined(_KERNEL) -int 
-nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val)); -} -#endif - -int -nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val)); -} - -int -nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val)); -} - -int -nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, - boolean_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, - DATA_TYPE_BOOLEAN_ARRAY, n, a)); -} - -int -nvlist_lookup_byte_array(nvlist_t *nvl, const char *name, - uchar_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); -} - -int -nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); -} - -int -nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, - uint8_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); -} - -int -nvlist_lookup_int16_array(nvlist_t *nvl, const char *name, - int16_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); -} - -int -nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, - uint16_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); -} - -int -nvlist_lookup_int32_array(nvlist_t *nvl, const char *name, - int32_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); -} - -int -nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, - uint32_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); -} - -int -nvlist_lookup_int64_array(nvlist_t *nvl, const char *name, - int64_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); -} - -int -nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, - uint64_t **a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); -} - -int -nvlist_lookup_string_array(nvlist_t *nvl, const char *name, - char ***a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); -} - -int -nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name, - nvlist_t ***a, uint_t *n) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); -} - -int -nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val) -{ - return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val)); -} - -int -nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) -{ - va_list ap; - char *name; - int noentok = (flag & NV_FLAG_NOENTOK ? 
1 : 0); - int ret = 0; - - va_start(ap, flag); - while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { - data_type_t type; - void *val; - uint_t *nelem; - - switch (type = va_arg(ap, data_type_t)) { - case DATA_TYPE_BOOLEAN: - ret = nvlist_lookup_common(nvl, name, type, NULL, NULL); - break; - - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - case DATA_TYPE_INT16: - case DATA_TYPE_UINT16: - case DATA_TYPE_INT32: - case DATA_TYPE_UINT32: - case DATA_TYPE_INT64: - case DATA_TYPE_UINT64: - case DATA_TYPE_HRTIME: - case DATA_TYPE_STRING: - case DATA_TYPE_NVLIST: -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: -#endif - val = va_arg(ap, void *); - ret = nvlist_lookup_common(nvl, name, type, NULL, val); - break; - - case DATA_TYPE_BYTE_ARRAY: - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - case DATA_TYPE_STRING_ARRAY: - case DATA_TYPE_NVLIST_ARRAY: - val = va_arg(ap, void *); - nelem = va_arg(ap, uint_t *); - ret = nvlist_lookup_common(nvl, name, type, nelem, val); - break; - - default: - ret = EINVAL; - } - - if (ret == ENOENT && noentok) - ret = 0; - } - va_end(ap); - - return (ret); -} - -/* - * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function - * returns zero and a pointer to the matching nvpair is returned in '*ret' - * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penitrate - * multiple levels of embedded nvlists, with 'sep' as the separator. As an - * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or - * "a.d[3].e[1]". This matches the C syntax for array embed (for convience, - * code also supports "a.d[3]e[1]" syntax). - * - * If 'ip' is non-NULL and the last name component is an array, return the - * value of the "...[index]" array index in *ip. For an array reference that - * is not indexed, *ip will be returned as -1. If there is a syntax error in - * 'name', and 'ep' is non-NULL then *ep will be set to point to the location - * inside the 'name' string where the syntax error was detected. - */ -static int -nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep, - nvpair_t **ret, int *ip, char **ep) -{ - nvpair_t *nvp; - const char *np; - char *sepp; - char *idxp, *idxep; - nvlist_t **nva; - long idx; - int n; - - if (ip) - *ip = -1; /* not indexed */ - if (ep) - *ep = NULL; - - if ((nvl == NULL) || (name == NULL)) - return (EINVAL); - - sepp = NULL; - idx = 0; - /* step through components of name */ - for (np = name; np && *np; np = sepp) { - /* ensure unique names */ - if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME)) - return (ENOTSUP); - - /* skip white space */ - skip_whitespace(np); - if (*np == 0) - break; - - /* set 'sepp' to end of current component 'np' */ - if (sep) - sepp = strchr(np, sep); - else - sepp = NULL; - - /* find start of next "[ index ]..." */ - idxp = strchr(np, '['); - - /* if sepp comes first, set idxp to NULL */ - if (sepp && idxp && (sepp < idxp)) - idxp = NULL; - - /* - * At this point 'idxp' is set if there is an index - * expected for the current component. 
- */ - if (idxp) { - /* set 'n' to length of current 'np' name component */ - n = idxp++ - np; - - /* keep sepp up to date for *ep use as we advance */ - skip_whitespace(idxp); - sepp = idxp; - - /* determine the index value */ -#if defined(_KERNEL) && !defined(_BOOT) - if (ddi_strtol(idxp, &idxep, 0, &idx)) - goto fail; -#else - idx = strtol(idxp, &idxep, 0); -#endif - if (idxep == idxp) - goto fail; - - /* keep sepp up to date for *ep use as we advance */ - sepp = idxep; - - /* skip white space index value and check for ']' */ - skip_whitespace(sepp); - if (*sepp++ != ']') - goto fail; - - /* for embedded arrays, support C syntax: "a[1].b" */ - skip_whitespace(sepp); - if (sep && (*sepp == sep)) - sepp++; - } else if (sepp) { - n = sepp++ - np; - } else { - n = strlen(np); - } - - /* trim trailing whitespace by reducing length of 'np' */ - if (n == 0) - goto fail; - for (n--; (np[n] == ' ') || (np[n] == '\t'); n--) - ; - n++; - - /* skip whitespace, and set sepp to NULL if complete */ - if (sepp) { - skip_whitespace(sepp); - if (*sepp == 0) - sepp = NULL; - } - - /* - * At this point: - * o 'n' is the length of current 'np' component. - * o 'idxp' is set if there was an index, and value 'idx'. - * o 'sepp' is set to the beginning of the next component, - * and set to NULL if we have no more components. - * - * Search for nvpair with matching component name. - */ - for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; - nvp = nvlist_next_nvpair(nvl, nvp)) { - - /* continue if no match on name */ - if (strncmp(np, nvpair_name(nvp), n) || - (strlen(nvpair_name(nvp)) != n)) - continue; - - /* if indexed, verify type is array oriented */ - if (idxp && !nvpair_type_is_array(nvp)) - goto fail; - - /* - * Full match found, return nvp and idx if this - * was the last component. - */ - if (sepp == NULL) { - if (ret) - *ret = nvp; - if (ip && idxp) - *ip = (int)idx; /* return index */ - return (0); /* found */ - } - - /* - * More components: current match must be - * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY - * to support going deeper. - */ - if (nvpair_type(nvp) == DATA_TYPE_NVLIST) { - nvl = EMBEDDED_NVL(nvp); - break; - } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) { - (void) nvpair_value_nvlist_array(nvp, - &nva, (uint_t *)&n); - if ((n < 0) || (idx >= n)) - goto fail; - nvl = nva[idx]; - break; - } - - /* type does not support more levels */ - goto fail; - } - if (nvp == NULL) - goto fail; /* 'name' not found */ - - /* search for match of next component in embedded 'nvl' list */ - } - -fail: if (ep && sepp) - *ep = sepp; - return (EINVAL); -} - -/* - * Return pointer to nvpair with specified 'name'. - */ -int -nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret) -{ - return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL)); -} - -/* - * Determine if named nvpair exists in nvlist (use embedded separator of '.' - * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed - * description. 
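For reference, the dotted/indexed syntax described above is exposed through nvlist_lookup_nvpair_embedded_index(); a sketch with hypothetical pair names (the list must use NV_UNIQUE_NAME, and *ip comes back as -1 when the final component is not an array reference):

	#include <stdio.h>
	#include <libnvpair.h>

	/* Resolve "pool.children[1].guid" inside a nested nvlist. */
	static void
	embedded_lookup_example(nvlist_t *root)
	{
		nvpair_t *nvp;
		uint64_t guid;
		int idx;

		if (nvlist_lookup_nvpair_embedded_index(root,
		    "pool.children[1].guid", &nvp, &idx, NULL) == 0 &&
		    nvpair_value_uint64(nvp, &guid) == 0)
			printf("guid = %llu (idx %d)\n",
			    (unsigned long long)guid, idx);
	}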
- */ -int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl, - const char *name, nvpair_t **ret, int *ip, char **ep) -{ - return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep)); -} - -boolean_t -nvlist_exists(nvlist_t *nvl, const char *name) -{ - nvpriv_t *priv; - nvpair_t *nvp; - i_nvp_t *curr; - - if (name == NULL || nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (B_FALSE); - - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { - nvp = &curr->nvi_nvp; - - if (strcmp(name, NVP_NAME(nvp)) == 0) - return (B_TRUE); - } - - return (B_FALSE); -} - -int -nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); -} - -int -nvpair_value_byte(nvpair_t *nvp, uchar_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val)); -} - -int -nvpair_value_int8(nvpair_t *nvp, int8_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val)); -} - -int -nvpair_value_uint8(nvpair_t *nvp, uint8_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val)); -} - -int -nvpair_value_int16(nvpair_t *nvp, int16_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val)); -} - -int -nvpair_value_uint16(nvpair_t *nvp, uint16_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val)); -} - -int -nvpair_value_int32(nvpair_t *nvp, int32_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val)); -} - -int -nvpair_value_uint32(nvpair_t *nvp, uint32_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val)); -} - -int -nvpair_value_int64(nvpair_t *nvp, int64_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val)); -} - -int -nvpair_value_uint64(nvpair_t *nvp, uint64_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val)); -} - -#if !defined(_KERNEL) -int -nvpair_value_double(nvpair_t *nvp, double *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val)); -} -#endif - -int -nvpair_value_string(nvpair_t *nvp, char **val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val)); -} - -int -nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val)); -} - -int -nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val)); -} - -int -nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val)); -} - -int -nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val)); -} - -int -nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val)); -} - -int -nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val)); -} - -int -nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val)); -} - -int -nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val)); -} - -int -nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, 
DATA_TYPE_UINT32_ARRAY, nelem, val)); -} - -int -nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val)); -} - -int -nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val)); -} - -int -nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val)); -} - -int -nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem) -{ - return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val)); -} - -int -nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val) -{ - return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val)); -} - -/* - * Add specified pair to the list. - */ -int -nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) -{ - if (nvl == NULL || nvp == NULL) - return (EINVAL); - - return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp), - NVP_NELEM(nvp), NVP_VALUE(nvp))); -} - -/* - * Merge the supplied nvlists and put the result in dst. - * The merged list will contain all names specified in both lists, - * the values are taken from nvl in the case of duplicates. - * Return 0 on success. - */ -/*ARGSUSED*/ -int -nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag) -{ - if (nvl == NULL || dst == NULL) - return (EINVAL); - - if (dst != nvl) - return (nvlist_copy_pairs(nvl, dst)); - - return (0); -} - -/* - * Encoding related routines - */ -#define NVS_OP_ENCODE 0 -#define NVS_OP_DECODE 1 -#define NVS_OP_GETSIZE 2 - -typedef struct nvs_ops nvs_ops_t; - -typedef struct { - int nvs_op; - const nvs_ops_t *nvs_ops; - void *nvs_private; - nvpriv_t *nvs_priv; - int nvs_recursion; -} nvstream_t; - -/* - * nvs operations are: - * - nvs_nvlist - * encoding / decoding of a nvlist header (nvlist_t) - * calculates the size used for header and end detection - * - * - nvs_nvpair - * responsible for the first part of encoding / decoding of an nvpair - * calculates the decoded size of an nvpair - * - * - nvs_nvp_op - * second part of encoding / decoding of an nvpair - * - * - nvs_nvp_size - * calculates the encoding size of an nvpair - * - * - nvs_nvl_fini - * encodes the end detection mark (zeros). 
- */ -struct nvs_ops { - int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *); - int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *); - int (*nvs_nvp_op)(nvstream_t *, nvpair_t *); - int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *); - int (*nvs_nvl_fini)(nvstream_t *); -}; - -typedef struct { - char nvh_encoding; /* nvs encoding method */ - char nvh_endian; /* nvs endian */ - char nvh_reserved1; /* reserved for future use */ - char nvh_reserved2; /* reserved for future use */ -} nvs_header_t; - -static int -nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr; - - /* - * Walk nvpair in list and encode each nvpair - */ - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) - if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0) - return (EFAULT); - - return (nvs->nvs_ops->nvs_nvl_fini(nvs)); -} - -static int -nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) -{ - nvpair_t *nvp; - size_t nvsize; - int err; - - /* - * Get decoded size of next pair in stream, alloc - * memory for nvpair_t, then decode the nvpair - */ - while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) { - if (nvsize == 0) /* end of list */ - break; - - /* make sure len makes sense */ - if (nvsize < NVP_SIZE_CALC(1, 0)) - return (EFAULT); - - if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL) - return (ENOMEM); - - if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) { - nvp_buf_free(nvl, nvp); - return (err); - } - - if (i_validate_nvpair(nvp) != 0) { - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (EFAULT); - } - - err = nvt_add_nvpair(nvl, nvp); - if (err != 0) { - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - return (err); - } - nvp_buf_link(nvl, nvp); - } - return (err); -} - -static int -nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) -{ - nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; - i_nvp_t *curr; - uint64_t nvsize = *buflen; - size_t size; - - /* - * Get encoded size of nvpairs in nvlist - */ - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { - if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0) - return (EINVAL); - - if ((nvsize += size) > INT32_MAX) - return (EINVAL); - } - - *buflen = nvsize; - return (0); -} - -static int -nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) -{ - int err; - - if (nvl->nvl_priv == 0) - return (EFAULT); - - /* - * Perform the operation, starting with header, then each nvpair - */ - if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0) - return (err); - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - err = nvs_encode_pairs(nvs, nvl); - break; - - case NVS_OP_DECODE: - err = nvs_decode_pairs(nvs, nvl); - break; - - case NVS_OP_GETSIZE: - err = nvs_getsize_pairs(nvs, nvl, buflen); - break; - - default: - err = EINVAL; - } - - return (err); -} - -static int -nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: { - int err; - - if (nvs->nvs_recursion >= nvpair_max_recursion) - return (EINVAL); - nvs->nvs_recursion++; - err = nvs_operation(nvs, embedded, NULL); - nvs->nvs_recursion--; - return (err); - } - case NVS_OP_DECODE: { - nvpriv_t *priv; - int err; - - if (embedded->nvl_version != NV_VERSION) - return (ENOTSUP); - - if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL) - return (ENOMEM); - - nvlist_init(embedded, embedded->nvl_nvflag, priv); - - if (nvs->nvs_recursion >= nvpair_max_recursion) { - nvlist_free(embedded); - return 
(EINVAL); - } - nvs->nvs_recursion++; - if ((err = nvs_operation(nvs, embedded, NULL)) != 0) - nvlist_free(embedded); - nvs->nvs_recursion--; - return (err); - } - default: - break; - } - - return (EINVAL); -} - -static int -nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - size_t nelem = NVP_NELEM(nvp); - nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); - int i; - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - for (i = 0; i < nelem; i++) - if (nvs_embedded(nvs, nvlp[i]) != 0) - return (EFAULT); - break; - - case NVS_OP_DECODE: { - size_t len = nelem * sizeof (uint64_t); - nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len); - - bzero(nvlp, len); /* don't trust packed data */ - for (i = 0; i < nelem; i++) { - if (nvs_embedded(nvs, embedded) != 0) { - nvpair_free(nvp); - return (EFAULT); - } - - nvlp[i] = embedded++; - } - break; - } - case NVS_OP_GETSIZE: { - uint64_t nvsize = 0; - - for (i = 0; i < nelem; i++) { - size_t nvp_sz = 0; - - if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0) - return (EINVAL); - - if ((nvsize += nvp_sz) > INT32_MAX) - return (EINVAL); - } - - *size = nvsize; - break; - } - default: - return (EINVAL); - } - - return (0); -} - -static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *); -static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *); - -/* - * Common routine for nvlist operations: - * encode, decode, getsize (encoded size). - */ -static int -nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, - int nvs_op) -{ - int err = 0; - nvstream_t nvs; - int nvl_endian; -#if BYTE_ORDER == _LITTLE_ENDIAN - int host_endian = 1; -#else - int host_endian = 0; -#endif /* _LITTLE_ENDIAN */ - nvs_header_t *nvh = (void *)buf; - - if (buflen == NULL || nvl == NULL || - (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) - return (EINVAL); - - nvs.nvs_op = nvs_op; - nvs.nvs_recursion = 0; - - /* - * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and - * a buffer is allocated. The first 4 bytes in the buffer are - * used for encoding method and host endian. 
- */ - switch (nvs_op) { - case NVS_OP_ENCODE: - if (buf == NULL || *buflen < sizeof (nvs_header_t)) - return (EINVAL); - - nvh->nvh_encoding = encoding; - nvh->nvh_endian = nvl_endian = host_endian; - nvh->nvh_reserved1 = 0; - nvh->nvh_reserved2 = 0; - break; - - case NVS_OP_DECODE: - if (buf == NULL || *buflen < sizeof (nvs_header_t)) - return (EINVAL); - - /* get method of encoding from first byte */ - encoding = nvh->nvh_encoding; - nvl_endian = nvh->nvh_endian; - break; - - case NVS_OP_GETSIZE: - nvl_endian = host_endian; - - /* - * add the size for encoding - */ - *buflen = sizeof (nvs_header_t); - break; - - default: - return (ENOTSUP); - } - - /* - * Create an nvstream with proper encoding method - */ - switch (encoding) { - case NV_ENCODE_NATIVE: - /* - * check endianness, in case we are unpacking - * from a file - */ - if (nvl_endian != host_endian) - return (ENOTSUP); - err = nvs_native(&nvs, nvl, buf, buflen); - break; - case NV_ENCODE_XDR: - err = nvs_xdr(&nvs, nvl, buf, buflen); - break; - default: - err = ENOTSUP; - break; - } - - return (err); -} - -int -nvlist_size(nvlist_t *nvl, size_t *size, int encoding) -{ - return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE)); -} - -/* - * Pack nvlist into contiguous memory - */ -/*ARGSUSED1*/ -int -nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, - int kmflag) -{ -#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xpack(nvl, bufp, buflen, encoding, - (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); -#else - return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep)); -#endif -} - -int -nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, - nv_alloc_t *nva) -{ - nvpriv_t nvpriv; - size_t alloc_size; - char *buf; - int err; - - if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL) - return (EINVAL); - - if (*bufp != NULL) - return (nvlist_common(nvl, *bufp, buflen, encoding, - NVS_OP_ENCODE)); - - /* - * Here is a difficult situation: - * 1. The nvlist has fixed allocator properties. - * All other nvlist routines (like nvlist_add_*, ...) use - * these properties. - * 2. When using nvlist_pack() the user can specify their own - * allocator properties (e.g. by using KM_NOSLEEP). - * - * We use the user specified properties (2). A clearer solution - * will be to remove the kmflag from nvlist_pack(), but we will - * not change the interface. - */ - nv_priv_init(&nvpriv, nva, 0); - - if ((err = nvlist_size(nvl, &alloc_size, encoding))) - return (err); - - if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL) - return (ENOMEM); - - if ((err = nvlist_common(nvl, buf, &alloc_size, encoding, - NVS_OP_ENCODE)) != 0) { - nv_mem_free(&nvpriv, buf, alloc_size); - } else { - *buflen = alloc_size; - *bufp = buf; - } - - return (err); -} - -/* - * Unpack buf into an nvlist_t - */ -/*ARGSUSED1*/ -int -nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) -{ -#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xunpack(buf, buflen, nvlp, - (kmflag == KM_SLEEP ? 
nv_alloc_sleep : nv_alloc_nosleep))); -#else - return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep)); -#endif -} - -int -nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva) -{ - nvlist_t *nvl; - int err; - - if (nvlp == NULL) - return (EINVAL); - - if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0) - return (err); - - if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0) - nvlist_free(nvl); - else - *nvlp = nvl; - - return (err); -} - -/* - * Native encoding functions - */ -typedef struct { - /* - * This structure is used when decoding a packed nvpair in - * the native format. n_base points to a buffer containing the - * packed nvpair. n_end is a pointer to the end of the buffer. - * (n_end actually points to the first byte past the end of the - * buffer.) n_curr is a pointer that lies between n_base and n_end. - * It points to the current data that we are decoding. - * The amount of data left in the buffer is equal to n_end - n_curr. - * n_flag is used to recognize a packed embedded list. - */ - caddr_t n_base; - caddr_t n_end; - caddr_t n_curr; - uint_t n_flag; -} nvs_native_t; - -static int -nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf, - size_t buflen) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: - nvs->nvs_private = native; - native->n_curr = native->n_base = buf; - native->n_end = buf + buflen; - native->n_flag = 0; - return (0); - - case NVS_OP_GETSIZE: - nvs->nvs_private = native; - native->n_curr = native->n_base = native->n_end = NULL; - native->n_flag = 0; - return (0); - default: - return (EINVAL); - } -} - -/*ARGSUSED*/ -static void -nvs_native_destroy(nvstream_t *nvs) -{ -} - -static int -native_cp(nvstream_t *nvs, void *buf, size_t size) -{ - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - - if (native->n_curr + size > native->n_end) - return (EFAULT); - - /* - * The bcopy() below eliminates alignment requirement - * on the buffer (stream) and is preferred over direct access. - */ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - bcopy(buf, native->n_curr, size); - break; - case NVS_OP_DECODE: - bcopy(native->n_curr, buf, size); - break; - default: - return (EINVAL); - } - - native->n_curr += size; - return (0); -} - -/* - * operate on nvlist_t header - */ -static int -nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) -{ - nvs_native_t *native = nvs->nvs_private; - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: - if (native->n_flag) - return (0); /* packed embedded list */ - - native->n_flag = 1; - - /* copy version and nvflag of the nvlist_t */ - if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 || - native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0) - return (EFAULT); - - return (0); - - case NVS_OP_GETSIZE: - /* - * if calculate for packed embedded list - * 4 for end of the embedded list - * else - * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag - * and 4 for end of the entire list - */ - if (native->n_flag) { - *size += 4; - } else { - native->n_flag = 1; - *size += 2 * sizeof (int32_t) + 4; - } - - return (0); - - default: - return (EINVAL); - } -} - -static int -nvs_native_nvl_fini(nvstream_t *nvs) -{ - if (nvs->nvs_op == NVS_OP_ENCODE) { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - /* - * Add 4 zero bytes at end of nvlist. They are used - * for end detection by the decode routine. 
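For reference, nvlist_pack() and nvlist_unpack() are the public serialization entry points; when *bufp is NULL, nvlist_pack() allocates the buffer itself (via malloc() in userland). A round-trip sketch (not code from this file):

	#include <stdlib.h>
	#include <libnvpair.h>

	/* Pack an nvlist into an allocated buffer and unpack it again. */
	static int
	roundtrip_example(nvlist_t *nvl)
	{
		char *buf = NULL;	/* NULL asks nvlist_pack() to allocate */
		size_t len = 0;
		nvlist_t *copy;
		int err;

		if ((err = nvlist_pack(nvl, &buf, &len, NV_ENCODE_XDR, 0)) != 0)
			return (err);

		err = nvlist_unpack(buf, len, &copy, 0);
		free(buf);		/* userland buffers come from malloc() */
		if (err != 0)
			return (err);

		nvlist_free(copy);
		return (0);
	}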
- */ - if (native->n_curr + sizeof (int) > native->n_end) - return (EFAULT); - - bzero(native->n_curr, sizeof (int)); - native->n_curr += sizeof (int); - } - - return (0); -} - -static int -nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) -{ - if (nvs->nvs_op == NVS_OP_ENCODE) { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - char *packed = (void *) - (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); - /* - * Null out the pointer that is meaningless in the packed - * structure. The address may not be aligned, so we have - * to use bzero. - */ - bzero(packed + offsetof(nvlist_t, nvl_priv), - sizeof(((nvlist_t *)NULL)->nvl_priv)); - } - - return (nvs_embedded(nvs, EMBEDDED_NVL(nvp))); -} - -static int -nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) -{ - if (nvs->nvs_op == NVS_OP_ENCODE) { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp); - size_t len = NVP_NELEM(nvp) * sizeof (uint64_t); - int i; - /* - * Null out pointers that are meaningless in the packed - * structure. The addresses may not be aligned, so we have - * to use bzero. - */ - bzero(value, len); - - value += len; - for (i = 0; i < NVP_NELEM(nvp); i++) { - /* - * Null out the pointer that is meaningless in the - * packed structure. The address may not be aligned, - * so we have to use bzero. - */ - bzero(value + offsetof(nvlist_t, nvl_priv), - sizeof(((nvlist_t *)NULL)->nvl_priv)); - value += sizeof(nvlist_t); - } - } - - return (nvs_embedded_nvl_array(nvs, nvp, NULL)); -} - -static void -nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - uint64_t *strp = (void *) - (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); - /* - * Null out pointers that are meaningless in the packed - * structure. The addresses may not be aligned, so we have - * to use bzero. - */ - bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t)); - break; - } - case NVS_OP_DECODE: { - char **strp = (void *)NVP_VALUE(nvp); - char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t)); - int i; - - for (i = 0; i < NVP_NELEM(nvp); i++) { - strp[i] = buf; - buf += strlen(buf) + 1; - } - break; - } - } -} - -static int -nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp) -{ - data_type_t type; - int value_sz; - int ret = 0; - - /* - * We do the initial bcopy of the data before we look at - * the nvpair type, because when we're decoding, we won't - * have the correct values for the pair until we do the bcopy. - */ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: - if (native_cp(nvs, nvp, nvp->nvp_size) != 0) - return (EFAULT); - break; - default: - return (EINVAL); - } - - /* verify nvp_name_sz, check the name string length */ - if (i_validate_nvpair_name(nvp) != 0) - return (EFAULT); - - type = NVP_TYPE(nvp); - - /* - * Verify type and nelem and get the value size. - * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY - * is the size of the string(s) excluded. 
- */ - if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) - return (EFAULT); - - if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) - return (EFAULT); - - switch (type) { - case DATA_TYPE_NVLIST: - ret = nvpair_native_embedded(nvs, nvp); - break; - case DATA_TYPE_NVLIST_ARRAY: - ret = nvpair_native_embedded_array(nvs, nvp); - break; - case DATA_TYPE_STRING_ARRAY: - nvpair_native_string_array(nvs, nvp); - break; - default: - break; - } - - return (ret); -} - -static int -nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - uint64_t nvp_sz = nvp->nvp_size; - - switch (NVP_TYPE(nvp)) { - case DATA_TYPE_NVLIST: { - size_t nvsize = 0; - - if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0) - return (EINVAL); - - nvp_sz += nvsize; - break; - } - case DATA_TYPE_NVLIST_ARRAY: { - size_t nvsize; - - if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0) - return (EINVAL); - - nvp_sz += nvsize; - break; - } - default: - break; - } - - if (nvp_sz > INT32_MAX) - return (EINVAL); - - *size = nvp_sz; - - return (0); -} - -static int -nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - return (nvs_native_nvp_op(nvs, nvp)); - - case NVS_OP_DECODE: { - nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; - int32_t decode_len; - - /* try to read the size value from the stream */ - if (native->n_curr + sizeof (int32_t) > native->n_end) - return (EFAULT); - bcopy(native->n_curr, &decode_len, sizeof (int32_t)); - - /* sanity check the size value */ - if (decode_len < 0 || - decode_len > native->n_end - native->n_curr) - return (EFAULT); - - *size = decode_len; - - /* - * If at the end of the stream then move the cursor - * forward, otherwise nvpair_native_op() will read - * the entire nvpair at the same cursor position. 
- */ - if (*size == 0) - native->n_curr += sizeof (int32_t); - break; - } - - default: - return (EINVAL); - } - - return (0); -} - -static const nvs_ops_t nvs_native_ops = { - nvs_native_nvlist, - nvs_native_nvpair, - nvs_native_nvp_op, - nvs_native_nvp_size, - nvs_native_nvl_fini -}; - -static int -nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) -{ - nvs_native_t native; - int err; - - nvs->nvs_ops = &nvs_native_ops; - - if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t), - *buflen - sizeof (nvs_header_t))) != 0) - return (err); - - err = nvs_operation(nvs, nvl, buflen); - - nvs_native_destroy(nvs); - - return (err); -} - -/* - * XDR encoding functions - * - * An xdr packed nvlist is encoded as: - * - * - encoding methode and host endian (4 bytes) - * - nvl_version (4 bytes) - * - nvl_nvflag (4 bytes) - * - * - encoded nvpairs, the format of one xdr encoded nvpair is: - * - encoded size of the nvpair (4 bytes) - * - decoded size of the nvpair (4 bytes) - * - name string, (4 + sizeof(NV_ALIGN4(string)) - * a string is coded as size (4 bytes) and data - * - data type (4 bytes) - * - number of elements in the nvpair (4 bytes) - * - data - * - * - 2 zero's for end of the entire list (8 bytes) - */ -static int -nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen) -{ - /* xdr data must be 4 byte aligned */ - if ((ulong_t)buf % 4 != 0) - return (EFAULT); - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE); - nvs->nvs_private = xdr; - return (0); - case NVS_OP_DECODE: - xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE); - nvs->nvs_private = xdr; - return (0); - case NVS_OP_GETSIZE: - nvs->nvs_private = NULL; - return (0); - default: - return (EINVAL); - } -} - -static void -nvs_xdr_destroy(nvstream_t *nvs) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: - xdr_destroy((XDR *)nvs->nvs_private); - break; - default: - break; - } -} - -static int -nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) -{ - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - case NVS_OP_DECODE: { - XDR *xdr = nvs->nvs_private; - - if (!xdr_int(xdr, &nvl->nvl_version) || - !xdr_u_int(xdr, &nvl->nvl_nvflag)) - return (EFAULT); - break; - } - case NVS_OP_GETSIZE: { - /* - * 2 * 4 for nvl_version + nvl_nvflag - * and 8 for end of the entire list - */ - *size += 2 * 4 + 8; - break; - } - default: - return (EINVAL); - } - return (0); -} - -static int -nvs_xdr_nvl_fini(nvstream_t *nvs) -{ - if (nvs->nvs_op == NVS_OP_ENCODE) { - XDR *xdr = nvs->nvs_private; - int zero = 0; - - if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero)) - return (EFAULT); - } - - return (0); -} - -/* - * The format of xdr encoded nvpair is: - * encode_size, decode_size, name string, data type, nelem, data - */ -static int -nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) -{ - data_type_t type; - char *buf; - char *buf_end = (char *)nvp + nvp->nvp_size; - int value_sz; - uint_t nelem, buflen; - bool_t ret = FALSE; - XDR *xdr = nvs->nvs_private; - - ASSERT(xdr != NULL && nvp != NULL); - - /* name string */ - if ((buf = NVP_NAME(nvp)) >= buf_end) - return (EFAULT); - buflen = buf_end - buf; - - if (!xdr_string(xdr, &buf, buflen - 1)) - return (EFAULT); - nvp->nvp_name_sz = strlen(buf) + 1; - - /* type and nelem */ - if (!xdr_int(xdr, (int *)&nvp->nvp_type) || - !xdr_int(xdr, &nvp->nvp_value_elem)) - return (EFAULT); - - type = NVP_TYPE(nvp); - nelem = nvp->nvp_value_elem; - - /* - * Verify type and nelem and get the value 
size. - * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY - * is the size of the string(s) excluded. - */ - if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0) - return (EFAULT); - - /* if there is no data to extract then return */ - if (nelem == 0) - return (0); - - /* value */ - if ((buf = NVP_VALUE(nvp)) >= buf_end) - return (EFAULT); - buflen = buf_end - buf; - - if (buflen < value_sz) - return (EFAULT); - - switch (type) { - case DATA_TYPE_NVLIST: - if (nvs_embedded(nvs, (void *)buf) == 0) - return (0); - break; - - case DATA_TYPE_NVLIST_ARRAY: - if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0) - return (0); - break; - - case DATA_TYPE_BOOLEAN: - ret = TRUE; - break; - - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - ret = xdr_char(xdr, buf); - break; - - case DATA_TYPE_INT16: - ret = xdr_short(xdr, (void *)buf); - break; - - case DATA_TYPE_UINT16: - ret = xdr_u_short(xdr, (void *)buf); - break; - - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_INT32: - ret = xdr_int(xdr, (void *)buf); - break; - - case DATA_TYPE_UINT32: - ret = xdr_u_int(xdr, (void *)buf); - break; - - case DATA_TYPE_INT64: - ret = xdr_longlong_t(xdr, (void *)buf); - break; - - case DATA_TYPE_UINT64: - ret = xdr_u_longlong_t(xdr, (void *)buf); - break; - - case DATA_TYPE_HRTIME: - /* - * NOTE: must expose the definition of hrtime_t here - */ - ret = xdr_longlong_t(xdr, (void *)buf); - break; -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: - ret = xdr_double(xdr, (void *)buf); - break; -#endif - case DATA_TYPE_STRING: - ret = xdr_string(xdr, &buf, buflen - 1); - break; - - case DATA_TYPE_BYTE_ARRAY: - ret = xdr_opaque(xdr, buf, nelem); - break; - - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), - (xdrproc_t)xdr_char); - break; - - case DATA_TYPE_INT16_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), - sizeof (int16_t), (xdrproc_t)xdr_short); - break; - - case DATA_TYPE_UINT16_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), - sizeof (uint16_t), (xdrproc_t)xdr_u_short); - break; - - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_INT32_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), - sizeof (int32_t), (xdrproc_t)xdr_int); - break; - - case DATA_TYPE_UINT32_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), - sizeof (uint32_t), (xdrproc_t)xdr_u_int); - break; - - case DATA_TYPE_INT64_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), - sizeof (int64_t), (xdrproc_t)xdr_longlong_t); - break; - - case DATA_TYPE_UINT64_ARRAY: - ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), - sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); - break; - - case DATA_TYPE_STRING_ARRAY: { - size_t len = nelem * sizeof (uint64_t); - char **strp = (void *)buf; - int i; - - if (nvs->nvs_op == NVS_OP_DECODE) - bzero(buf, len); /* don't trust packed data */ - - for (i = 0; i < nelem; i++) { - if (buflen <= len) - return (EFAULT); - - buf += len; - buflen -= len; - - if (xdr_string(xdr, &buf, buflen - 1) != TRUE) - return (EFAULT); - - if (nvs->nvs_op == NVS_OP_DECODE) - strp[i] = buf; - len = strlen(buf) + 1; - } - ret = TRUE; - break; - } - default: - break; - } - - return (ret == TRUE ? 
0 : EFAULT); -} - -static int -nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - data_type_t type = NVP_TYPE(nvp); - /* - * encode_size + decode_size + name string size + data type + nelem - * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) - */ - uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4; - - switch (type) { - case DATA_TYPE_BOOLEAN: - break; - - case DATA_TYPE_BOOLEAN_VALUE: - case DATA_TYPE_BYTE: - case DATA_TYPE_INT8: - case DATA_TYPE_UINT8: - case DATA_TYPE_INT16: - case DATA_TYPE_UINT16: - case DATA_TYPE_INT32: - case DATA_TYPE_UINT32: - nvp_sz += 4; /* 4 is the minimum xdr unit */ - break; - - case DATA_TYPE_INT64: - case DATA_TYPE_UINT64: - case DATA_TYPE_HRTIME: -#if !defined(_KERNEL) - case DATA_TYPE_DOUBLE: -#endif - nvp_sz += 8; - break; - - case DATA_TYPE_STRING: - nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp))); - break; - - case DATA_TYPE_BYTE_ARRAY: - nvp_sz += NV_ALIGN4(NVP_NELEM(nvp)); - break; - - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp); - break; - - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp); - break; - - case DATA_TYPE_STRING_ARRAY: { - int i; - char **strs = (void *)NVP_VALUE(nvp); - - for (i = 0; i < NVP_NELEM(nvp); i++) - nvp_sz += 4 + NV_ALIGN4(strlen(strs[i])); - - break; - } - - case DATA_TYPE_NVLIST: - case DATA_TYPE_NVLIST_ARRAY: { - size_t nvsize = 0; - int old_nvs_op = nvs->nvs_op; - int err; - - nvs->nvs_op = NVS_OP_GETSIZE; - if (type == DATA_TYPE_NVLIST) - err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize); - else - err = nvs_embedded_nvl_array(nvs, nvp, &nvsize); - nvs->nvs_op = old_nvs_op; - - if (err != 0) - return (EINVAL); - - nvp_sz += nvsize; - break; - } - - default: - return (EINVAL); - } - - if (nvp_sz > INT32_MAX) - return (EINVAL); - - *size = nvp_sz; - - return (0); -} - - -/* - * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates - * the largest nvpair that could be encoded in the buffer. - * - * See comments above nvpair_xdr_op() for the format of xdr encoding. - * The size of a xdr packed nvpair without any data is 5 words. - * - * Using the size of the data directly as an estimate would be ok - * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY - * then the actual nvpair has space for an array of pointers to index - * the strings. These pointers are not encoded into the packed xdr buffer. - * - * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are - * of length 0, then each string is endcoded in xdr format as a single word. - * Therefore when expanded to an nvpair there will be 2.25 word used for - * each string. (a int64_t allocated for pointer usage, and a single char - * for the null termination.) - * - * This is the calculation performed by the NVS_XDR_MAX_LEN macro. - */ -#define NVS_XDR_HDR_LEN ((size_t)(5 * 4)) -#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? 
\ - 0 : ((size_t)(y) - NVS_XDR_HDR_LEN)) -#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \ - (NVS_XDR_DATA_LEN(x) * 2) + \ - NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4))) - -static int -nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) -{ - XDR *xdr = nvs->nvs_private; - int32_t encode_len, decode_len; - - switch (nvs->nvs_op) { - case NVS_OP_ENCODE: { - size_t nvsize; - - if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0) - return (EFAULT); - - decode_len = nvp->nvp_size; - encode_len = nvsize; - if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) - return (EFAULT); - - return (nvs_xdr_nvp_op(nvs, nvp)); - } - case NVS_OP_DECODE: { - struct xdr_bytesrec bytesrec; - - /* get the encode and decode size */ - if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) - return (EFAULT); - *size = decode_len; - - /* are we at the end of the stream? */ - if (*size == 0) - return (0); - - /* sanity check the size parameter */ - if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec)) - return (EFAULT); - - if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail)) - return (EFAULT); - break; - } - - default: - return (EINVAL); - } - return (0); -} - -static const struct nvs_ops nvs_xdr_ops = { - nvs_xdr_nvlist, - nvs_xdr_nvpair, - nvs_xdr_nvp_op, - nvs_xdr_nvp_size, - nvs_xdr_nvl_fini -}; - -static int -nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) -{ - XDR xdr; - int err; - - nvs->nvs_ops = &nvs_xdr_ops; - - if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t), - *buflen - sizeof (nvs_header_t))) != 0) - return (err); - - err = nvs_operation(nvs, nvl, buflen); - - nvs_xdr_destroy(nvs); - - return (err); -} Index: sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c =================================================================== --- sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include -#if defined(_KERNEL) && !defined(_BOOT) -#include -#else -#include -#include -#endif - -/* - * This allocator is very simple. - * - it uses a pre-allocated buffer for memory allocations. - * - it does _not_ free memory in the pre-allocated buffer. - * - * The reason for the selected implemention is simplicity. - * This allocator is designed for the usage in interrupt context when - * the caller may not wait for free memory. 
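The bump allocator described above is normally plugged into the nvlist code through an nv_alloc_t handle. A minimal usage sketch (illustration only, not part of the removed file), assuming the standard nv_alloc_init()/nvlist_xalloc() entry points and the nv_fixed_ops table exported at the end of this file:

#include <sys/nvpair.h>

static char scratch[1024];		/* caller-supplied backing store */

static int
fixed_nvlist_example(void)
{
	nv_alloc_t nva;
	nvlist_t *nvl;

	/* Point the allocator at the pre-allocated buffer. */
	if (nv_alloc_init(&nva, nv_fixed_ops, scratch, sizeof (scratch)) != 0)
		return (-1);

	/* Every allocation for this nvlist now comes out of 'scratch'. */
	if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &nva) != 0)
		return (-1);

	(void) nvlist_add_uint64(nvl, "example", 1234);

	nvlist_free(nvl);		/* frees are no-ops with the fixed ops */
	nv_alloc_fini(&nva);
	return (0);
}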
- */ - -/* pre-allocated buffer for memory allocations */ -typedef struct nvbuf { - uintptr_t nvb_buf; /* address of pre-allocated buffer */ - uintptr_t nvb_lim; /* limit address in the buffer */ - uintptr_t nvb_cur; /* current address in the buffer */ -} nvbuf_t; - -/* - * Initialize the pre-allocated buffer allocator. The caller needs to supply - * - * buf address of pre-allocated buffer - * bufsz size of pre-allocated buffer - * - * nv_fixed_init() calculates the remaining members of nvbuf_t. - */ -static int -nv_fixed_init(nv_alloc_t *nva, va_list valist) -{ - uintptr_t base = va_arg(valist, uintptr_t); - uintptr_t lim = base + va_arg(valist, size_t); - nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t)); - - if (base == 0 || (uintptr_t)&nvb[1] > lim) - return (EINVAL); - - nvb->nvb_buf = (uintptr_t)&nvb[0]; - nvb->nvb_cur = (uintptr_t)&nvb[1]; - nvb->nvb_lim = lim; - nva->nva_arg = nvb; - - return (0); -} - -static void * -nv_fixed_alloc(nv_alloc_t *nva, size_t size) -{ - nvbuf_t *nvb = nva->nva_arg; - uintptr_t new = nvb->nvb_cur; - - if (size == 0 || new + size > nvb->nvb_lim) - return (NULL); - - nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t)); - - return ((void *)new); -} - -/*ARGSUSED*/ -static void -nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size) -{ - /* don't free memory in the pre-allocated buffer */ -} - -static void -nv_fixed_reset(nv_alloc_t *nva) -{ - nvbuf_t *nvb = nva->nva_arg; - - nvb->nvb_cur = (uintptr_t)&nvb[1]; -} - -const nv_alloc_ops_t nv_fixed_ops_def = { - nv_fixed_init, /* nv_ao_init() */ - NULL, /* nv_ao_fini() */ - nv_fixed_alloc, /* nv_ao_alloc() */ - nv_fixed_free, /* nv_ao_free() */ - nv_fixed_reset /* nv_ao_reset() */ -}; - -const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def; Index: sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017, Intel Corporation. 
- */ - -#ifndef _ZFEATURE_COMMON_H -#define _ZFEATURE_COMMON_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct zfeature_info; - -typedef enum spa_feature { - SPA_FEATURE_NONE = -1, - SPA_FEATURE_ASYNC_DESTROY, - SPA_FEATURE_EMPTY_BPOBJ, - SPA_FEATURE_LZ4_COMPRESS, - SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, - SPA_FEATURE_SPACEMAP_HISTOGRAM, - SPA_FEATURE_ENABLED_TXG, - SPA_FEATURE_HOLE_BIRTH, - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_EMBEDDED_DATA, - SPA_FEATURE_BOOKMARKS, - SPA_FEATURE_FS_SS_LIMIT, - SPA_FEATURE_LARGE_BLOCKS, - SPA_FEATURE_LARGE_DNODE, - SPA_FEATURE_SHA512, - SPA_FEATURE_SKEIN, -#ifdef illumos - SPA_FEATURE_EDONR, -#endif - SPA_FEATURE_DEVICE_REMOVAL, - SPA_FEATURE_OBSOLETE_COUNTS, - SPA_FEATURE_POOL_CHECKPOINT, - SPA_FEATURE_SPACEMAP_V2, - SPA_FEATURE_ALLOCATION_CLASSES, - SPA_FEATURES -} spa_feature_t; - -#define SPA_FEATURE_DISABLED (-1ULL) - -typedef enum zfeature_flags { - /* Can open pool readonly even if this feature is not supported. */ - ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0), - /* Is this feature necessary to read the MOS? */ - ZFEATURE_FLAG_MOS = (1 << 1), - /* Activate this feature at the same time it is enabled. */ - ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), - /* Each dataset has a field set if it has ever used this feature. */ - ZFEATURE_FLAG_PER_DATASET = (1 << 3) -} zfeature_flags_t; - -typedef struct zfeature_info { - spa_feature_t fi_feature; - const char *fi_uname; /* User-facing feature name */ - const char *fi_guid; /* On-disk feature identifier */ - const char *fi_desc; /* Feature description */ - zfeature_flags_t fi_flags; - /* array of dependencies, terminated by SPA_FEATURE_NONE */ - const spa_feature_t *fi_depends; -} zfeature_info_t; - -typedef int (zfeature_func_t)(zfeature_info_t *, void *); - -#define ZFS_FEATURE_DEBUG - -extern zfeature_info_t spa_feature_table[SPA_FEATURES]; - -extern boolean_t zfeature_is_valid_guid(const char *); - -extern boolean_t zfeature_is_supported(const char *); -extern int zfeature_lookup_name(const char *, spa_feature_t *); -extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t); - -extern void zpool_feature_init(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFEATURE_COMMON_H */ Index: sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c +++ /dev/null @@ -1,310 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014, Nexenta Systems, Inc. 
All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017, Intel Corporation. - */ - -#ifdef _KERNEL -#include -#else -#include -#include -#endif -#include -#include -#include -#include "zfeature_common.h" - -/* - * Set to disable all feature checks while opening pools, allowing pools with - * unsupported features to be opened. Set for testing only. - */ -boolean_t zfeature_checks_disable = B_FALSE; - -zfeature_info_t spa_feature_table[SPA_FEATURES]; - -/* - * Valid characters for feature guids. This list is mainly for aesthetic - * purposes and could be expanded in the future. There are different allowed - * characters in the guids reverse dns portion (before the colon) and its - * short name (after the colon). - */ -static int -valid_char(char c, boolean_t after_colon) -{ - return ((c >= 'a' && c <= 'z') || - (c >= '0' && c <= '9') || - (after_colon && c == '_') || - (!after_colon && (c == '.' || c == '-'))); -} - -/* - * Every feature guid must contain exactly one colon which separates a reverse - * dns organization name from the feature's "short" name (e.g. - * "com.company:feature_name"). - */ -boolean_t -zfeature_is_valid_guid(const char *name) -{ - int i; - boolean_t has_colon = B_FALSE; - - i = 0; - while (name[i] != '\0') { - char c = name[i++]; - if (c == ':') { - if (has_colon) - return (B_FALSE); - has_colon = B_TRUE; - continue; - } - if (!valid_char(c, has_colon)) - return (B_FALSE); - } - - return (has_colon); -} - -boolean_t -zfeature_is_supported(const char *guid) -{ - if (zfeature_checks_disable) - return (B_TRUE); - - for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { - zfeature_info_t *feature = &spa_feature_table[i]; - if (strcmp(guid, feature->fi_guid) == 0) - return (B_TRUE); - } - return (B_FALSE); -} - -int -zfeature_lookup_name(const char *name, spa_feature_t *res) -{ - for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { - zfeature_info_t *feature = &spa_feature_table[i]; - if (strcmp(name, feature->fi_uname) == 0) { - if (res != NULL) - *res = i; - return (0); - } - } - - return (ENOENT); -} - -boolean_t -zfeature_depends_on(spa_feature_t fid, spa_feature_t check) -{ - zfeature_info_t *feature = &spa_feature_table[fid]; - - for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) { - if (feature->fi_depends[i] == check) - return (B_TRUE); - } - return (B_FALSE); -} - -static void -zfeature_register(spa_feature_t fid, const char *guid, const char *name, - const char *desc, zfeature_flags_t flags, const spa_feature_t *deps) -{ - zfeature_info_t *feature = &spa_feature_table[fid]; - static spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; - - ASSERT(name != NULL); - ASSERT(desc != NULL); - ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 || - (flags & ZFEATURE_FLAG_MOS) == 0); - ASSERT3U(fid, <, SPA_FEATURES); - ASSERT(zfeature_is_valid_guid(guid)); - - if (deps == NULL) - deps = nodeps; - - feature->fi_feature = fid; - feature->fi_guid = guid; - feature->fi_uname = name; - feature->fi_desc = desc; - feature->fi_flags = flags; - feature->fi_depends = deps; -} - -void -zpool_feature_init(void) -{ - zfeature_register(SPA_FEATURE_ASYNC_DESTROY, - "com.delphix:async_destroy", "async_destroy", - "Destroy filesystems asynchronously.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, - "com.delphix:empty_bpobj", "empty_bpobj", - "Snapshots use less space.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - zfeature_register(SPA_FEATURE_LZ4_COMPRESS, - "org.illumos:lz4_compress", "lz4_compress", - "LZ4 
compression algorithm support.", - ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL); - - zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, - "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", - "Crash dumps to multiple vdev pools.", - 0, NULL); - - zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, - "com.delphix:spacemap_histogram", "spacemap_histogram", - "Spacemaps maintain space histograms.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - zfeature_register(SPA_FEATURE_ENABLED_TXG, - "com.delphix:enabled_txg", "enabled_txg", - "Record txg at which a feature is enabled", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG, - SPA_FEATURE_NONE }; - zfeature_register(SPA_FEATURE_HOLE_BIRTH, - "com.delphix:hole_birth", "hole_birth", - "Retain hole birth txg for more precise zfs send", - ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, - hole_birth_deps); - - zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, - "com.delphix:extensible_dataset", "extensible_dataset", - "Enhanced dataset functionality, used by other features.", - 0, NULL); - - static const spa_feature_t bookmarks_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_BOOKMARKS, - "com.delphix:bookmarks", "bookmarks", - "\"zfs bookmark\" command", - ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps); - - static const spa_feature_t filesystem_limits_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_FS_SS_LIMIT, - "com.joyent:filesystem_limits", "filesystem_limits", - "Filesystem and snapshot limits.", - ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps); - - zfeature_register(SPA_FEATURE_EMBEDDED_DATA, - "com.delphix:embedded_data", "embedded_data", - "Blocks which compress very well use even less space.", - ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, - NULL); - - zfeature_register(SPA_FEATURE_POOL_CHECKPOINT, - "com.delphix:zpool_checkpoint", "zpool_checkpoint", - "Pool state can be checkpointed, allowing rewind later.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - - zfeature_register(SPA_FEATURE_SPACEMAP_V2, - "com.delphix:spacemap_v2", "spacemap_v2", - "Space maps representing large segments are more efficient.", - ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, - NULL); - - static const spa_feature_t large_blocks_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_LARGE_BLOCKS, - "org.open-zfs:large_blocks", "large_blocks", - "Support for blocks larger than 128KB.", - ZFEATURE_FLAG_PER_DATASET, large_blocks_deps); - - { - static const spa_feature_t large_dnode_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_LARGE_DNODE, - "org.zfsonlinux:large_dnode", "large_dnode", - "Variable on-disk size of dnodes.", - ZFEATURE_FLAG_PER_DATASET, large_dnode_deps); - } - - static const spa_feature_t sha512_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_SHA512, - "org.illumos:sha512", "sha512", - "SHA-512/256 hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, sha512_deps); - - static const spa_feature_t skein_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_SKEIN, - "org.illumos:skein", "skein", - "Skein hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, skein_deps); - -#ifdef illumos - static const spa_feature_t edonr_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, 
- SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_EDONR, - "org.illumos:edonr", "edonr", - "Edon-R hash algorithm.", - ZFEATURE_FLAG_PER_DATASET, edonr_deps); -#endif - - zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, - "com.delphix:device_removal", "device_removal", - "Top-level vdevs can be removed, reducing logical pool size.", - ZFEATURE_FLAG_MOS, NULL); - - static const spa_feature_t obsolete_counts_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_DEVICE_REMOVAL, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS, - "com.delphix:obsolete_counts", "obsolete_counts", - "Reduce memory used by removed devices when their blocks are " - "freed or remapped.", - ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps); - - { - zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES, - "org.zfsonlinux:allocation_classes", "allocation_classes", - "Support for separate allocation classes.", - ZFEATURE_FLAG_READONLY_COMPAT, NULL); - } -} Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright 2019 Joyent, Inc. - */ - -#ifndef _ZFS_COMUTIL_H -#define _ZFS_COMUTIL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* Needed for ZoL errno usage in MMP kernel and user code */ -#define EREMOTEIO EREMOTE - -extern boolean_t zfs_allocatable_devs(nvlist_t *); -extern void zpool_get_load_policy(nvlist_t *, zpool_load_policy_t *); - -extern int zfs_zpl_version_map(int spa_version); -extern int zfs_spa_version_map(int zpl_version); -#define ZFS_NUM_LEGACY_HISTORY_EVENTS 41 -extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS]; - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_COMUTIL_H */ Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_comutil.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -/* - * This file is intended for functions that ought to be common between user - * land (libzfs) and the kernel. When many common routines need to be shared - * then a separate file should to be created. - */ - -#if defined(_KERNEL) -#include -#else -#include -#endif - -#include -#include -#include -#include "zfs_comutil.h" - -/* - * Are there allocatable vdevs? - */ -boolean_t -zfs_allocatable_devs(nvlist_t *nv) -{ - uint64_t is_log; - uint_t c; - nvlist_t **child; - uint_t children; - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { - return (B_FALSE); - } - for (c = 0; c < children; c++) { - is_log = 0; - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - if (!is_log) - return (B_TRUE); - } - return (B_FALSE); -} - -void -zpool_get_load_policy(nvlist_t *nvl, zpool_load_policy_t *zlpp) -{ - nvlist_t *policy; - nvpair_t *elem; - char *nm; - - /* Defaults */ - zlpp->zlp_rewind = ZPOOL_NO_REWIND; - zlpp->zlp_maxmeta = 0; - zlpp->zlp_maxdata = UINT64_MAX; - zlpp->zlp_txg = UINT64_MAX; - - if (nvl == NULL) - return; - - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - nm = nvpair_name(elem); - if (strcmp(nm, ZPOOL_LOAD_POLICY) == 0) { - if (nvpair_value_nvlist(elem, &policy) == 0) - zpool_get_load_policy(policy, zlpp); - return; - } else if (strcmp(nm, ZPOOL_LOAD_REWIND_POLICY) == 0) { - if (nvpair_value_uint32(elem, &zlpp->zlp_rewind) == 0) - if (zlpp->zlp_rewind & ~ZPOOL_REWIND_POLICIES) - zlpp->zlp_rewind = ZPOOL_NO_REWIND; - } else if (strcmp(nm, ZPOOL_LOAD_REQUEST_TXG) == 0) { - (void) nvpair_value_uint64(elem, &zlpp->zlp_txg); - } else if (strcmp(nm, ZPOOL_LOAD_META_THRESH) == 0) { - (void) nvpair_value_uint64(elem, &zlpp->zlp_maxmeta); - } else if (strcmp(nm, ZPOOL_LOAD_DATA_THRESH) == 0) { - (void) nvpair_value_uint64(elem, &zlpp->zlp_maxdata); - } - } - if (zlpp->zlp_rewind == 0) - zlpp->zlp_rewind = ZPOOL_NO_REWIND; -} - -typedef struct zfs_version_spa_map { - int version_zpl; - int version_spa; -} zfs_version_spa_map_t; - -/* - * Keep this table in monotonically increasing version number order. - */ -static zfs_version_spa_map_t zfs_version_table[] = { - {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL}, - {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL}, - {ZPL_VERSION_FUID, SPA_VERSION_FUID}, - {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, - {ZPL_VERSION_SA, SPA_VERSION_SA}, - {0, 0} -}; - -/* - * Return the max zpl version for a corresponding spa version - * -1 is returned if no mapping exists. - */ -int -zfs_zpl_version_map(int spa_version) -{ - int i; - int version = -1; - - for (i = 0; zfs_version_table[i].version_spa; i++) { - if (spa_version >= zfs_version_table[i].version_spa) - version = zfs_version_table[i].version_zpl; - } - - return (version); -} - -/* - * Return the min spa version for a corresponding spa version - * -1 is returned if no mapping exists. 
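A short worked example of the two lookup directions (a sketch, not code from the removed file; the numeric versions are the conventional ones, e.g. SPA_VERSION_FUID = 9, ZPL_VERSION_FUID = 3, ZPL_VERSION_SA = 5, SPA_VERSION_SA = 24):

/* Prototypes as declared in zfs_comutil.h (removed in this change). */
extern int zfs_zpl_version_map(int spa_version);
extern int zfs_spa_version_map(int zpl_version);

static void
version_map_example(void)
{
	/* Newest ZPL version a FUID-capable (v9) pool can use: ZPL v3. */
	int newest_zpl = zfs_zpl_version_map(9);

	/* Oldest pool version that supports ZPL v5 (system attributes): v24. */
	int oldest_spa = zfs_spa_version_map(5);

	(void) newest_zpl;
	(void) oldest_spa;
}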
- */ -int -zfs_spa_version_map(int zpl_version) -{ - int i; - int version = -1; - - for (i = 0; zfs_version_table[i].version_zpl; i++) { - if (zfs_version_table[i].version_zpl >= zpl_version) - return (zfs_version_table[i].version_spa); - } - - return (version); -} - -/* - * This is the table of legacy internal event names; it should not be modified. - * The internal events are now stored in the history log as strings. - */ -const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = { - "invalid event", - "pool create", - "vdev add", - "pool remove", - "pool destroy", - "pool export", - "pool import", - "vdev attach", - "vdev replace", - "vdev detach", - "vdev online", - "vdev offline", - "vdev upgrade", - "pool clear", - "pool scrub", - "pool property set", - "create", - "clone", - "destroy", - "destroy_begin_sync", - "inherit", - "property set", - "quota set", - "permission update", - "permission remove", - "permission who remove", - "promote", - "receive", - "rename", - "reservation set", - "replay_inc_sync", - "replay_full_sync", - "rollback", - "snapshot", - "filesystem version upgrade", - "refquota set", - "refreservation set", - "pool scrub done", - "user hold", - "user release", - "pool split", -}; Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2010 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
- */ - -#ifndef _ZFS_DELEG_H -#define _ZFS_DELEG_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */ -#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */ - -/* - * Max name length for a delegation attribute - */ -#define ZFS_MAX_DELEG_NAME 128 - -#define ZFS_DELEG_LOCAL 'l' -#define ZFS_DELEG_DESCENDENT 'd' -#define ZFS_DELEG_NA '-' - -typedef enum { - ZFS_DELEG_NOTE_CREATE, - ZFS_DELEG_NOTE_DESTROY, - ZFS_DELEG_NOTE_SNAPSHOT, - ZFS_DELEG_NOTE_ROLLBACK, - ZFS_DELEG_NOTE_CLONE, - ZFS_DELEG_NOTE_PROMOTE, - ZFS_DELEG_NOTE_RENAME, - ZFS_DELEG_NOTE_SEND, - ZFS_DELEG_NOTE_RECEIVE, - ZFS_DELEG_NOTE_ALLOW, - ZFS_DELEG_NOTE_USERPROP, - ZFS_DELEG_NOTE_MOUNT, - ZFS_DELEG_NOTE_SHARE, - ZFS_DELEG_NOTE_USERQUOTA, - ZFS_DELEG_NOTE_GROUPQUOTA, - ZFS_DELEG_NOTE_USERUSED, - ZFS_DELEG_NOTE_GROUPUSED, - ZFS_DELEG_NOTE_HOLD, - ZFS_DELEG_NOTE_RELEASE, - ZFS_DELEG_NOTE_DIFF, - ZFS_DELEG_NOTE_BOOKMARK, - ZFS_DELEG_NOTE_REMAP, - ZFS_DELEG_NOTE_NONE -} zfs_deleg_note_t; - -typedef struct zfs_deleg_perm_tab { - char *z_perm; - zfs_deleg_note_t z_note; -} zfs_deleg_perm_tab_t; - -extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[]; - -int zfs_deleg_verify_nvlist(nvlist_t *nvlist); -void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, - char checkflag, void *data); -const char *zfs_deleg_canonicalize_perm(const char *perm); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_DELEG_H */ Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_deleg.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2010 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
- * Copyright 2016 Igor Kozhukhov - */ - -#include - -#if defined(_KERNEL) -#include -#include -#include -#else -#include -#include -#include -#include -#include -#endif -#include -#include "zfs_prop.h" -#include "zfs_deleg.h" -#include "zfs_namecheck.h" - -zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { - {ZFS_DELEG_PERM_ALLOW}, - {ZFS_DELEG_PERM_BOOKMARK}, - {ZFS_DELEG_PERM_CLONE}, - {ZFS_DELEG_PERM_CREATE}, - {ZFS_DELEG_PERM_DESTROY}, - {ZFS_DELEG_PERM_DIFF}, - {ZFS_DELEG_PERM_MOUNT}, - {ZFS_DELEG_PERM_PROMOTE}, - {ZFS_DELEG_PERM_RECEIVE}, - {ZFS_DELEG_PERM_REMAP}, - {ZFS_DELEG_PERM_RENAME}, - {ZFS_DELEG_PERM_ROLLBACK}, - {ZFS_DELEG_PERM_SNAPSHOT}, - {ZFS_DELEG_PERM_SHARE}, - {ZFS_DELEG_PERM_SEND}, - {ZFS_DELEG_PERM_USERPROP}, - {ZFS_DELEG_PERM_USERQUOTA}, - {ZFS_DELEG_PERM_GROUPQUOTA}, - {ZFS_DELEG_PERM_USERUSED}, - {ZFS_DELEG_PERM_GROUPUSED}, - {ZFS_DELEG_PERM_HOLD}, - {ZFS_DELEG_PERM_RELEASE}, - {NULL} -}; - -static int -zfs_valid_permission_name(const char *perm) -{ - if (zfs_deleg_canonicalize_perm(perm)) - return (0); - - return (permset_namecheck(perm, NULL, NULL)); -} - -const char * -zfs_deleg_canonicalize_perm(const char *perm) -{ - int i; - zfs_prop_t prop; - - for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) { - if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0) - return (perm); - } - - prop = zfs_name_to_prop(perm); - if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop)) - return (zfs_prop_to_name(prop)); - return (NULL); - -} - -static int -zfs_validate_who(char *who) -{ - char *p; - - if (who[2] != ZFS_DELEG_FIELD_SEP_CHR) - return (-1); - - switch (who[0]) { - case ZFS_DELEG_USER: - case ZFS_DELEG_GROUP: - case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_GROUP_SETS: - if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) - return (-1); - for (p = &who[3]; *p; p++) - if (!isdigit(*p)) - return (-1); - break; - - case ZFS_DELEG_NAMED_SET: - case ZFS_DELEG_NAMED_SET_SETS: - if (who[1] != ZFS_DELEG_NA) - return (-1); - return (permset_namecheck(&who[3], NULL, NULL)); - - case ZFS_DELEG_CREATE: - case ZFS_DELEG_CREATE_SETS: - if (who[1] != ZFS_DELEG_NA) - return (-1); - if (who[3] != '\0') - return (-1); - break; - - case ZFS_DELEG_EVERYONE: - case ZFS_DELEG_EVERYONE_SETS: - if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) - return (-1); - if (who[3] != '\0') - return (-1); - break; - - default: - return (-1); - } - - return (0); -} - -int -zfs_deleg_verify_nvlist(nvlist_t *nvp) -{ - nvpair_t *who, *perm_name; - nvlist_t *perms; - int error; - - if (nvp == NULL) - return (-1); - - who = nvlist_next_nvpair(nvp, NULL); - if (who == NULL) - return (-1); - - do { - if (zfs_validate_who(nvpair_name(who))) - return (-1); - - error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms); - - if (error && error != ENOENT) - return (-1); - if (error == ENOENT) - continue; - - perm_name = nvlist_next_nvpair(perms, NULL); - if (perm_name == NULL) { - return (-1); - } - do { - error = zfs_valid_permission_name( - nvpair_name(perm_name)); - if (error) - return (-1); - } while ((perm_name = nvlist_next_nvpair(perms, perm_name)) - != NULL); - } while ((who = nvlist_next_nvpair(nvp, who)) != NULL); - return (0); -} - -/* - * Construct the base attribute name. The base attribute names - * are the "key" to locate the jump objects which contain the actual - * permissions. The base attribute names are encoded based on - * type of entry and whether it is a local or descendent permission. 
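A hedged illustration of the resulting keys (not from the removed file; it assumes the usual single-character who-type values from zfs_deleg_who_type_t, e.g. 'u' for ZFS_DELEG_USER and 's' for ZFS_DELEG_NAMED_SET):

#include <sys/types.h>
#include <stdio.h>
#include "zfs_deleg.h"

int
main(void)
{
	char attr[ZFS_MAX_DELEG_NAME];
	uint64_t uid = 1001;

	/* Local user permission for uid 1001: something like "ul$1001". */
	zfs_deleg_whokey(attr, ZFS_DELEG_USER, ZFS_DELEG_LOCAL, &uid);
	(void) printf("%s\n", attr);

	/* Named permission set "@backup": something like "s-$@backup". */
	zfs_deleg_whokey(attr, ZFS_DELEG_NAMED_SET, ZFS_DELEG_NA, "@backup");
	(void) printf("%s\n", attr);

	return (0);
}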
- * - * Arguments: - * attr - attribute name return string, attribute is assumed to be - * ZFS_MAX_DELEG_NAME long. - * type - type of entry to construct - * inheritchr - inheritance type (local,descendent, or NA for create and - * permission set definitions - * data - is either a permission set name or a 64 bit uid/gid. - */ -void -zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, - char inheritchr, void *data) -{ - int len = ZFS_MAX_DELEG_NAME; - uint64_t *id = data; - - switch (type) { - case ZFS_DELEG_USER: - case ZFS_DELEG_GROUP: - case ZFS_DELEG_USER_SETS: - case ZFS_DELEG_GROUP_SETS: - (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr, - ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id); - break; - case ZFS_DELEG_NAMED_SET_SETS: - case ZFS_DELEG_NAMED_SET: - (void) snprintf(attr, len, "%c-%c%s", type, - ZFS_DELEG_FIELD_SEP_CHR, (char *)data); - break; - case ZFS_DELEG_CREATE: - case ZFS_DELEG_CREATE_SETS: - (void) snprintf(attr, len, "%c-%c", type, - ZFS_DELEG_FIELD_SEP_CHR); - break; - case ZFS_DELEG_EVERYONE: - case ZFS_DELEG_EVERYONE_SETS: - (void) snprintf(attr, len, "%c%c%c", type, inheritchr, - ZFS_DELEG_FIELD_SEP_CHR); - break; - default: - ASSERT(!"bad zfs_deleg_who_type_t"); - } -} Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. 
- */ - -#ifndef _ZFS_FLETCHER_H -#define _ZFS_FLETCHER_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * fletcher checksum functions - */ - -void fletcher_init(zio_cksum_t *); -void fletcher_2_native(const void *, size_t, const void *, zio_cksum_t *); -void fletcher_2_byteswap(const void *, size_t, const void *, zio_cksum_t *); -int fletcher_2_incremental_native(void *, size_t, void *); -int fletcher_2_incremental_byteswap(void *, size_t, void *); -void fletcher_4_native(const void *, size_t, const void *, zio_cksum_t *); -void fletcher_4_byteswap(const void *, size_t, const void *, zio_cksum_t *); -int fletcher_4_incremental_native(void *, size_t, void *); -int fletcher_4_incremental_byteswap(void *, size_t, void *); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_FLETCHER_H */ Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c +++ /dev/null @@ -1,279 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -/* - * Fletcher Checksums - * ------------------ - * - * ZFS's 2nd and 4th order Fletcher checksums are defined by the following - * recurrence relations: - * - * a = a + f - * i i-1 i-1 - * - * b = b + a - * i i-1 i - * - * c = c + b (fletcher-4 only) - * i i-1 i - * - * d = d + c (fletcher-4 only) - * i i-1 i - * - * Where - * a_0 = b_0 = c_0 = d_0 = 0 - * and - * f_0 .. f_(n-1) are the input data. - * - * Using standard techniques, these translate into the following series: - * - * __n_ __n_ - * \ | \ | - * a = > f b = > i * f - * n /___| n - i n /___| n - i - * i = 1 i = 1 - * - * - * __n_ __n_ - * \ | i*(i+1) \ | i*(i+1)*(i+2) - * c = > ------- f d = > ------------- f - * n /___| 2 n - i n /___| 6 n - i - * i = 1 i = 1 - * - * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. - * Since the additions are done mod (2^64), errors in the high bits may not - * be noticed. For this reason, fletcher-2 is deprecated. - * - * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. - * A conservative estimate of how big the buffer can get before we overflow - * can be estimated using f_i = 0xffffffff for all i: - * - * % bc - * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 - * 2264 - * quit - * % - * - * So blocks of up to 2k will not overflow. 
Our largest block size is - * 128k, which has 32k 4-byte words, so we can compute the largest possible - * accumulators, then divide by 2^64 to figure the max amount of overflow: - * - * % bc - * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } - * a/2^64;b/2^64;c/2^64;d/2^64 - * 0 - * 0 - * 1365 - * 11186858 - * quit - * % - * - * So a and b cannot overflow. To make sure each bit of input has some - * effect on the contents of c and d, we can look at what the factors of - * the coefficients in the equations for c_n and d_n are. The number of 2s - * in the factors determines the lowest set bit in the multiplier. Running - * through the cases for n*(n+1)/2 reveals that the highest power of 2 is - * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow - * the 64-bit accumulators, every bit of every f_i effects every accumulator, - * even for 128k blocks. - * - * If we wanted to make a stronger version of fletcher4 (fletcher4c?), - * we could do our calculations mod (2^32 - 1) by adding in the carries - * periodically, and store the number of carries in the top 32-bits. - * - * -------------------- - * Checksum Performance - * -------------------- - * - * There are two interesting components to checksum performance: cached and - * uncached performance. With cached data, fletcher-2 is about four times - * faster than fletcher-4. With uncached data, the performance difference is - * negligible, since the cost of a cache fill dominates the processing time. - * Even though fletcher-4 is slower than fletcher-2, it is still a pretty - * efficient pass over the data. - * - * In normal operation, the data which is being checksummed is in a buffer - * which has been filled either by: - * - * 1. a compression step, which will be mostly cached, or - * 2. a bcopy() or copyin(), which will be uncached (because the - * copy is cache-bypassing). - * - * For both cached and uncached data, both fletcher checksums are much faster - * than sha-256, and slower than 'off', which doesn't touch the data at all. 
- */ - -#include -#include -#include -#include -#include -#include - -void -fletcher_init(zio_cksum_t *zcp) -{ - ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); -} - -int -fletcher_2_incremental_native(void *buf, size_t size, void *data) -{ - zio_cksum_t *zcp = data; - - const uint64_t *ip = buf; - const uint64_t *ipend = ip + (size / sizeof (uint64_t)); - uint64_t a0, b0, a1, b1; - - a0 = zcp->zc_word[0]; - a1 = zcp->zc_word[1]; - b0 = zcp->zc_word[2]; - b1 = zcp->zc_word[3]; - - for (; ip < ipend; ip += 2) { - a0 += ip[0]; - a1 += ip[1]; - b0 += a0; - b1 += a1; - } - - ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); - return (0); -} - -/*ARGSUSED*/ -void -fletcher_2_native(const void *buf, size_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) fletcher_2_incremental_native((void *) buf, size, zcp); -} - -int -fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) -{ - zio_cksum_t *zcp = data; - - const uint64_t *ip = buf; - const uint64_t *ipend = ip + (size / sizeof (uint64_t)); - uint64_t a0, b0, a1, b1; - - a0 = zcp->zc_word[0]; - a1 = zcp->zc_word[1]; - b0 = zcp->zc_word[2]; - b1 = zcp->zc_word[3]; - - for (; ip < ipend; ip += 2) { - a0 += BSWAP_64(ip[0]); - a1 += BSWAP_64(ip[1]); - b0 += a0; - b1 += a1; - } - - ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); - return (0); -} - -/*ARGSUSED*/ -void -fletcher_2_byteswap(const void *buf, size_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); -} - -int -fletcher_4_incremental_native(void *buf, size_t size, void *data) -{ - zio_cksum_t *zcp = data; - - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; - - for (; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); - return (0); -} - -/*ARGSUSED*/ -void -fletcher_4_native(const void *buf, size_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) fletcher_4_incremental_native((void *) buf, size, zcp); -} - -int -fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) -{ - zio_cksum_t *zcp = data; - - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - a = zcp->zc_word[0]; - b = zcp->zc_word[1]; - c = zcp->zc_word[2]; - d = zcp->zc_word[3]; - - for (; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); - return (0); -} - -/*ARGSUSED*/ -void -fletcher_4_byteswap(const void *buf, size_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) fletcher_4_incremental_byteswap((void *) buf, size, zcp); -} Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h +++ /dev/null @@ -1,543 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2014 Xin Li . All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZFS_IOCTL_COMPAT_H -#define _SYS_ZFS_IOCTL_COMPAT_H - -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#endif /* _KERNEL */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Backwards ioctl compatibility - */ - -/* ioctl versions for vfs.zfs.version.ioctl */ -#define ZFS_IOCVER_UNDEF -1 -#define ZFS_IOCVER_NONE 0 -#define ZFS_IOCVER_DEADMAN 1 -#define ZFS_IOCVER_LZC 2 -#define ZFS_IOCVER_ZCMD 3 -#define ZFS_IOCVER_EDBP 4 -#define ZFS_IOCVER_RESUME 5 -#define ZFS_IOCVER_INLANES 6 -#define ZFS_IOCVER_PAD 7 -#define ZFS_IOCVER_CURRENT ZFS_IOCVER_PAD - -/* compatibility conversion flag */ -#define ZFS_CMD_COMPAT_NONE 0 -#define ZFS_CMD_COMPAT_V15 1 -#define ZFS_CMD_COMPAT_V28 2 -#define ZFS_CMD_COMPAT_DEADMAN 3 -#define ZFS_CMD_COMPAT_LZC 4 -#define ZFS_CMD_COMPAT_ZCMD 5 -#define ZFS_CMD_COMPAT_EDBP 6 -#define ZFS_CMD_COMPAT_RESUME 7 -#define ZFS_CMD_COMPAT_INLANES 8 - -#define ZFS_IOC_COMPAT_PASS 254 -#define ZFS_IOC_COMPAT_FAIL 255 - -#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) - -typedef struct zfs_iocparm { - uint32_t zfs_ioctl_version; - uint64_t zfs_cmd; - uint64_t zfs_cmd_size; -} zfs_iocparm_t; - -typedef struct zinject_record_v15 { - uint64_t zi_objset; - uint64_t zi_object; - uint64_t zi_start; - uint64_t zi_end; - uint64_t zi_guid; - uint32_t zi_level; - uint32_t zi_error; - uint64_t zi_type; - uint32_t zi_freq; - uint32_t zi_failfast; -} zinject_record_v15_t; - -typedef struct zfs_cmd_v15 { - char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history; /* really (char *) */ - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - zinject_record_v15_t zc_inject_record; -} zfs_cmd_v15_t; - -typedef struct zinject_record_v28 { - uint64_t zi_objset; - uint64_t zi_object; - uint64_t zi_start; - uint64_t zi_end; - uint64_t zi_guid; - uint32_t zi_level; - uint32_t zi_error; - uint64_t zi_type; - uint32_t zi_freq; - uint32_t zi_failfast; - char zi_func[MAXNAMELEN]; - uint32_t zi_iotype; - int32_t zi_duration; - uint64_t zi_timer; -} zinject_record_v28_t; - -typedef struct zfs_cmd_v28 { - char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - char zc_top_ds[MAXPATHLEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - uint64_t zc_cookie; - uint64_t 
zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history; /* really (char *) */ - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - zinject_record_v28_t zc_inject_record; - boolean_t zc_defer_destroy; - boolean_t zc_temphold; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_v28_t; - -typedef struct zinject_record_deadman { - uint64_t zi_objset; - uint64_t zi_object; - uint64_t zi_start; - uint64_t zi_end; - uint64_t zi_guid; - uint32_t zi_level; - uint32_t zi_error; - uint64_t zi_type; - uint32_t zi_freq; - uint32_t zi_failfast; - char zi_func[MAXNAMELEN]; - uint32_t zi_iotype; - int32_t zi_duration; - uint64_t zi_timer; - uint32_t zi_cmd; - uint32_t zi_pad; -} zinject_record_deadman_t; - -typedef struct zfs_cmd_deadman { - char zc_name[MAXPATHLEN]; - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - char zc_top_ds[MAXPATHLEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history; /* really (char *) */ - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - /* zc_inject_record doesn't change in libzfs_core */ - zinject_record_deadman_t zc_inject_record; - boolean_t zc_defer_destroy; - boolean_t zc_temphold; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_deadman_t; - -typedef struct zfs_cmd_zcmd { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. 
- */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - zinject_record_deadman_t zc_inject_record; - boolean_t zc_defer_destroy; - boolean_t zc_temphold; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_zcmd_t; - -typedef struct zfs_cmd_edbp { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. - */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; - zinject_record_deadman_t zc_inject_record; - uint32_t zc_defer_destroy; - uint32_t zc_flags; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_edbp_t; - -typedef struct zfs_cmd_resume { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. 
- */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - dmu_replay_record_t zc_begin_record; - zinject_record_deadman_t zc_inject_record; - uint32_t zc_defer_destroy; - uint32_t zc_flags; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - boolean_t zc_resumable; - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_resume_t; - -typedef struct zfs_cmd_inlanes { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. - */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - dmu_replay_record_t zc_begin_record; - zinject_record_t zc_inject_record; - uint32_t zc_defer_destroy; - uint32_t zc_flags; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - boolean_t zc_resumable; - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_inlanes_t; - -#ifdef _KERNEL -unsigned static long zfs_ioctl_v15_to_v28[] = { - 0, /* 0 ZFS_IOC_POOL_CREATE */ - 1, /* 1 ZFS_IOC_POOL_DESTROY */ - 2, /* 2 ZFS_IOC_POOL_IMPORT */ - 3, /* 3 ZFS_IOC_POOL_EXPORT */ - 4, /* 4 ZFS_IOC_POOL_CONFIGS */ - 5, /* 5 ZFS_IOC_POOL_STATS */ - 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ - 7, /* 7 ZFS_IOC_POOL_SCRUB */ - 8, /* 8 ZFS_IOC_POOL_FREEZE */ - 9, /* 9 ZFS_IOC_POOL_UPGRADE */ - 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ - 11, /* 11 ZFS_IOC_VDEV_ADD */ - 12, /* 12 ZFS_IOC_VDEV_REMOVE */ - 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ - 14, /* 14 ZFS_IOC_VDEV_ATTACH */ - 15, /* 15 ZFS_IOC_VDEV_DETACH */ - 16, /* 16 ZFS_IOC_VDEV_SETPATH */ - 18, /* 17 ZFS_IOC_OBJSET_STATS */ - 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */ - 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */ - 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */ - 22, /* 21 ZFS_IOC_SET_PROP */ - ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */ - ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */ - 23, /* 24 ZFS_IOC_CREATE */ - 24, /* 25 ZFS_IOC_DESTROY */ - 25, /* 26 ZFS_IOC_ROLLBACK */ - 26, /* 27 ZFS_IOC_RENAME */ - 27, /* 28 ZFS_IOC_RECV */ - 28, /* 29 ZFS_IOC_SEND */ - 29, /* 30 ZFS_IOC_INJECT_FAULT */ - 30, /* 31 ZFS_IOC_CLEAR_FAULT */ - 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */ - 32, /* 33 ZFS_IOC_ERROR_LOG */ - 33, /* 34 ZFS_IOC_CLEAR */ - 34, /* 35 ZFS_IOC_PROMOTE */ - 35, /* 36 ZFS_IOC_DESTROY_SNAPS */ - 36, /* 37 ZFS_IOC_SNAPSHOT */ - 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME 
*/ - 38, /* 39 ZFS_IOC_OBJ_TO_PATH */ - 39, /* 40 ZFS_IOC_POOL_SET_PROPS */ - 40, /* 41 ZFS_IOC_POOL_GET_PROPS */ - 41, /* 42 ZFS_IOC_SET_FSACL */ - 42, /* 43 ZFS_IOC_GET_FSACL */ - ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */ - 43, /* 45 ZFS_IOC_SHARE */ - 44, /* 46 ZFS_IOC_IHNERIT_PROP */ - 58, /* 47 ZFS_IOC_JAIL */ - 59, /* 48 ZFS_IOC_UNJAIL */ - 45, /* 49 ZFS_IOC_SMB_ACL */ - 46, /* 50 ZFS_IOC_USERSPACE_ONE */ - 47, /* 51 ZFS_IOC_USERSPACE_MANY */ - 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */ - 17, /* 53 ZFS_IOC_SETFRU */ -}; - -#else /* KERNEL */ -unsigned static long zfs_ioctl_v28_to_v15[] = { - 0, /* 0 ZFS_IOC_POOL_CREATE */ - 1, /* 1 ZFS_IOC_POOL_DESTROY */ - 2, /* 2 ZFS_IOC_POOL_IMPORT */ - 3, /* 3 ZFS_IOC_POOL_EXPORT */ - 4, /* 4 ZFS_IOC_POOL_CONFIGS */ - 5, /* 5 ZFS_IOC_POOL_STATS */ - 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ - 7, /* 7 ZFS_IOC_POOL_SCAN */ - 8, /* 8 ZFS_IOC_POOL_FREEZE */ - 9, /* 9 ZFS_IOC_POOL_UPGRADE */ - 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ - 11, /* 11 ZFS_IOC_VDEV_ADD */ - 12, /* 12 ZFS_IOC_VDEV_REMOVE */ - 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ - 14, /* 14 ZFS_IOC_VDEV_ATTACH */ - 15, /* 15 ZFS_IOC_VDEV_DETACH */ - 16, /* 16 ZFS_IOC_VDEV_SETPATH */ - 53, /* 17 ZFS_IOC_VDEV_SETFRU */ - 17, /* 18 ZFS_IOC_OBJSET_STATS */ - 18, /* 19 ZFS_IOC_OBJSET_ZPLPROPS */ - 19, /* 20 ZFS_IOC_DATASET_LIST_NEXT */ - 20, /* 21 ZFS_IOC_SNAPSHOT_LIST_NEXT */ - 21, /* 22 ZFS_IOC_SET_PROP */ - 24, /* 23 ZFS_IOC_CREATE */ - 25, /* 24 ZFS_IOC_DESTROY */ - 26, /* 25 ZFS_IOC_ROLLBACK */ - 27, /* 26 ZFS_IOC_RENAME */ - 28, /* 27 ZFS_IOC_RECV */ - 29, /* 28 ZFS_IOC_SEND */ - 30, /* 39 ZFS_IOC_INJECT_FAULT */ - 31, /* 30 ZFS_IOC_CLEAR_FAULT */ - 32, /* 31 ZFS_IOC_INJECT_LIST_NEXT */ - 33, /* 32 ZFS_IOC_ERROR_LOG */ - 34, /* 33 ZFS_IOC_CLEAR */ - 35, /* 34 ZFS_IOC_PROMOTE */ - 36, /* 35 ZFS_IOC_DESTROY_SNAPS */ - 37, /* 36 ZFS_IOC_SNAPSHOT */ - 38, /* 37 ZFS_IOC_DSOBJ_TO_DSNAME */ - 39, /* 38 ZFS_IOC_OBJ_TO_PATH */ - 40, /* 39 ZFS_IOC_POOL_SET_PROPS */ - 41, /* 40 ZFS_IOC_POOL_GET_PROPS */ - 42, /* 41 ZFS_IOC_SET_FSACL */ - 43, /* 42 ZFS_IOC_GET_FSACL */ - 45, /* 43 ZFS_IOC_SHARE */ - 46, /* 44 ZFS_IOC_IHNERIT_PROP */ - 49, /* 45 ZFS_IOC_SMB_ACL */ - 50, /* 46 ZFS_IOC_USERSPACE_ONE */ - 51, /* 47 ZFS_IOC_USERSPACE_MANY */ - 52, /* 48 ZFS_IOC_USERSPACE_UPGRADE */ - ZFS_IOC_COMPAT_FAIL, /* 49 ZFS_IOC_HOLD */ - ZFS_IOC_COMPAT_FAIL, /* 50 ZFS_IOC_RELEASE */ - ZFS_IOC_COMPAT_FAIL, /* 51 ZFS_IOC_GET_HOLDS */ - ZFS_IOC_COMPAT_FAIL, /* 52 ZFS_IOC_OBJSET_RECVD_PROPS */ - ZFS_IOC_COMPAT_FAIL, /* 53 ZFS_IOC_VDEV_SPLIT */ - ZFS_IOC_COMPAT_FAIL, /* 54 ZFS_IOC_NEXT_OBJ */ - ZFS_IOC_COMPAT_FAIL, /* 55 ZFS_IOC_DIFF */ - ZFS_IOC_COMPAT_FAIL, /* 56 ZFS_IOC_TMP_SNAPSHOT */ - ZFS_IOC_COMPAT_FAIL, /* 57 ZFS_IOC_OBJ_TO_STATS */ - 47, /* 58 ZFS_IOC_JAIL */ - 48, /* 59 ZFS_IOC_UNJAIL */ -}; -#endif /* ! 
_KERNEL */ - -#ifdef _KERNEL -int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); -void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); -nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, - const int); -nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, - const int); -#else -int zcmd_ioctl_compat(int, int, zfs_cmd_t *, const int); -#endif /* _KERNEL */ -void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); -void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_IOCTL_COMPAT_H */ Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c +++ /dev/null @@ -1,1380 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2013 Xin Li . All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Portions Copyright 2005, 2010, Oracle and/or its affiliates. - * All rights reserved. - * Use is subject to license terms. 
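[Editor's note] The two index arrays removed above map legacy v15 ioctl numbers to their v28 equivalents and back, with ZFS_IOC_COMPAT_PASS/ZFS_IOC_COMPAT_FAIL as sentinels. A minimal sketch of how such a table is consulted before issuing a legacy request, mirroring the ZFS_IOC_COMPAT_FAIL check in the deleted zcmd_ioctl_compat(); the helper name translate_v28_to_v15() and the trimmed table are illustrative only.

#include <errno.h>
#include <stddef.h>

#define ZFS_IOC_COMPAT_PASS     254
#define ZFS_IOC_COMPAT_FAIL     255

/* Trimmed-down stand-in for the deleted zfs_ioctl_v28_to_v15[] table. */
static const unsigned long zfs_ioctl_v28_to_v15[] = {
        0,      /* 0 ZFS_IOC_POOL_CREATE */
        1,      /* 1 ZFS_IOC_POOL_DESTROY */
        2,      /* 2 ZFS_IOC_POOL_IMPORT */
        /* ... */
};

/*
 * Translate a v28 ioctl index to its v15 counterpart.  Returns -1 and
 * sets errno to ENOTSUP when the legacy interface has no equivalent,
 * which is how the removed zcmd_ioctl_compat() treated COMPAT_FAIL.
 */
static int
translate_v28_to_v15(unsigned int request)
{
        unsigned long nc;

        if (request >= sizeof(zfs_ioctl_v28_to_v15) /
            sizeof(zfs_ioctl_v28_to_v15[0])) {
                errno = EINVAL;
                return (-1);
        }
        nc = zfs_ioctl_v28_to_v15[request];
        if (nc == ZFS_IOC_COMPAT_FAIL) {
                errno = ENOTSUP;
                return (-1);
        }
        return ((int)nc);
}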
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "zfs_namecheck.h" -#include "zfs_ioctl_compat.h" - -static int zfs_version_ioctl = ZFS_IOCVER_CURRENT; -SYSCTL_DECL(_vfs_zfs_version); -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, - 0, "ZFS_IOCTL_VERSION"); - -/* - * FreeBSD zfs_cmd compatibility with older binaries - * appropriately remap/extend the zfs_cmd_t structure - */ -void -zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) -{ - zfs_cmd_v15_t *zc_c; - zfs_cmd_v28_t *zc28_c; - zfs_cmd_deadman_t *zcdm_c; - zfs_cmd_zcmd_t *zcmd_c; - zfs_cmd_edbp_t *edbp_c; - zfs_cmd_resume_t *resume_c; - zfs_cmd_inlanes_t *inlanes_c; - - switch (cflag) { - case ZFS_CMD_COMPAT_INLANES: - inlanes_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, inlanes_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, inlanes_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, inlanes_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = inlanes_c->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - FIELD_COPY(zc_begin_record); - FIELD_COPY(zc_inject_record); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_resumable); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_RESUME: - resume_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, resume_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, resume_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, resume_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = resume_c->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - FIELD_COPY(zc_begin_record); - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(zc->zc_inject_record.zi_func, - resume_c->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - 
zc->zc_inject_record.zi_nlanes = 1; - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_resumable); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_EDBP: - edbp_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, edbp_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, edbp_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, edbp_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = edbp_c->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - zc->zc_begin_record.drr_u.drr_begin = edbp_c->zc_begin_record; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(zc->zc_inject_record.zi_func, - edbp_c->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - zc->zc_inject_record.zi_nlanes = 1; - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - zc->zc_resumable = B_FALSE; - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_ZCMD: - zcmd_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, zcmd_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, zcmd_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, zcmd_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = zcmd_c->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - zc->zc_begin_record.drr_u.drr_begin = zcmd_c->zc_begin_record; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - 
FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(zc->zc_inject_record.zi_func, - zcmd_c->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - zc->zc_inject_record.zi_nlanes = 1; - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - - /* boolean_t -> uint32_t */ - zc->zc_defer_destroy = (uint32_t)(zcmd_c->zc_defer_destroy); - zc->zc_flags = 0; - - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - zc->zc_resumable = B_FALSE; - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - - break; - - case ZFS_CMD_COMPAT_DEADMAN: - zcdm_c = (void *)addr; - /* zc */ - strlcpy(zc->zc_name, zcdm_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, zcdm_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, zcdm_c->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zc->field = zcdm_c->field - zc->zc_guid = zcdm_c->zc_guid; - zc->zc_nvlist_conf = zcdm_c->zc_nvlist_conf; - zc->zc_nvlist_conf_size = zcdm_c->zc_nvlist_conf_size; - zc->zc_nvlist_src = zcdm_c->zc_nvlist_src; - zc->zc_nvlist_src_size = zcdm_c->zc_nvlist_src_size; - zc->zc_nvlist_dst = zcdm_c->zc_nvlist_dst; - zc->zc_nvlist_dst_size = zcdm_c->zc_nvlist_dst_size; - zc->zc_cookie = zcdm_c->zc_cookie; - zc->zc_objset_type = zcdm_c->zc_objset_type; - zc->zc_perm_action = zcdm_c->zc_perm_action; - zc->zc_history = zcdm_c->zc_history; - zc->zc_history_len = zcdm_c->zc_history_len; - zc->zc_history_offset = zcdm_c->zc_history_offset; - zc->zc_obj = zcdm_c->zc_obj; - zc->zc_iflags = zcdm_c->zc_iflags; - zc->zc_share = zcdm_c->zc_share; - zc->zc_jailid = zcdm_c->zc_jailid; - zc->zc_objset_stats = zcdm_c->zc_objset_stats; - zc->zc_begin_record.drr_u.drr_begin = zcdm_c->zc_begin_record; - zc->zc_defer_destroy = zcdm_c->zc_defer_destroy; - (void)zcdm_c->zc_temphold; - zc->zc_action_handle = zcdm_c->zc_action_handle; - zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd; - zc->zc_simple = zcdm_c->zc_simple; - zc->zc_resumable = B_FALSE; - zc->zc_sendobj = zcdm_c->zc_sendobj; - zc->zc_fromobj = zcdm_c->zc_fromobj; - zc->zc_createtxg = zcdm_c->zc_createtxg; - zc->zc_stat = zcdm_c->zc_stat; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(zc->zc_inject_record.zi_func, - resume_c->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - zc->zc_inject_record.zi_nlanes = 1; - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - - /* we always assume zc_nvlist_dst_filled is true */ - zc->zc_nvlist_dst_filled = B_TRUE; -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_V28: - zc28_c = (void *)addr; - - /* zc */ - strlcpy(zc->zc_name, zc28_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, zc28_c->zc_value, MAXPATHLEN * 2); - strlcpy(zc->zc_string, zc28_c->zc_string, MAXPATHLEN); 
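[Editor's note] The conversion cases removed above copy dozens of identically named members between zfs_cmd_t revisions by locally defining a FIELD_COPY() macro, applying it per member, and #undef-ing it before the next case; members the older revision lacks are then given explicit defaults (zi_nlanes = 1, zc_resumable = B_FALSE). A self-contained sketch of that pattern with two hypothetical struct revisions:

#include <string.h>

struct cmd_v1 { int a; long b; char name[32]; };
struct cmd_v2 { int a; long b; char name[32]; int extra; };

static void
cmd_compat_get(struct cmd_v2 *dst, const struct cmd_v1 *src)
{
        /* Members present in both revisions are copied one by one... */
#define FIELD_COPY(field)       dst->field = src->field
        FIELD_COPY(a);
        FIELD_COPY(b);
#undef FIELD_COPY
        /* ...strings go through a bounded copy... */
        strlcpy(dst->name, src->name, sizeof(dst->name));
        /* ...and members the old revision lacks get an explicit default. */
        dst->extra = 0;
}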
- zc->zc_guid = zc28_c->zc_guid; - zc->zc_nvlist_conf = zc28_c->zc_nvlist_conf; - zc->zc_nvlist_conf_size = zc28_c->zc_nvlist_conf_size; - zc->zc_nvlist_src = zc28_c->zc_nvlist_src; - zc->zc_nvlist_src_size = zc28_c->zc_nvlist_src_size; - zc->zc_nvlist_dst = zc28_c->zc_nvlist_dst; - zc->zc_nvlist_dst_size = zc28_c->zc_nvlist_dst_size; - zc->zc_cookie = zc28_c->zc_cookie; - zc->zc_objset_type = zc28_c->zc_objset_type; - zc->zc_perm_action = zc28_c->zc_perm_action; - zc->zc_history = zc28_c->zc_history; - zc->zc_history_len = zc28_c->zc_history_len; - zc->zc_history_offset = zc28_c->zc_history_offset; - zc->zc_obj = zc28_c->zc_obj; - zc->zc_iflags = zc28_c->zc_iflags; - zc->zc_share = zc28_c->zc_share; - zc->zc_jailid = zc28_c->zc_jailid; - zc->zc_objset_stats = zc28_c->zc_objset_stats; - zc->zc_begin_record.drr_u.drr_begin = zc28_c->zc_begin_record; - zc->zc_defer_destroy = zc28_c->zc_defer_destroy; - (void)zc28_c->zc_temphold; - zc->zc_action_handle = zc28_c->zc_action_handle; - zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd; - zc->zc_simple = zc28_c->zc_simple; - zc->zc_resumable = B_FALSE; - zc->zc_sendobj = zc28_c->zc_sendobj; - zc->zc_fromobj = zc28_c->zc_fromobj; - zc->zc_createtxg = zc28_c->zc_createtxg; - zc->zc_stat = zc28_c->zc_stat; - - /* zc->zc_inject_record */ - zc->zc_inject_record.zi_objset = - zc28_c->zc_inject_record.zi_objset; - zc->zc_inject_record.zi_object = - zc28_c->zc_inject_record.zi_object; - zc->zc_inject_record.zi_start = - zc28_c->zc_inject_record.zi_start; - zc->zc_inject_record.zi_end = - zc28_c->zc_inject_record.zi_end; - zc->zc_inject_record.zi_guid = - zc28_c->zc_inject_record.zi_guid; - zc->zc_inject_record.zi_level = - zc28_c->zc_inject_record.zi_level; - zc->zc_inject_record.zi_error = - zc28_c->zc_inject_record.zi_error; - zc->zc_inject_record.zi_type = - zc28_c->zc_inject_record.zi_type; - zc->zc_inject_record.zi_freq = - zc28_c->zc_inject_record.zi_freq; - zc->zc_inject_record.zi_failfast = - zc28_c->zc_inject_record.zi_failfast; - strlcpy(zc->zc_inject_record.zi_func, - zc28_c->zc_inject_record.zi_func, MAXNAMELEN); - zc->zc_inject_record.zi_iotype = - zc28_c->zc_inject_record.zi_iotype; - zc->zc_inject_record.zi_duration = - zc28_c->zc_inject_record.zi_duration; - zc->zc_inject_record.zi_timer = - zc28_c->zc_inject_record.zi_timer; - zc->zc_inject_record.zi_nlanes = 1; - zc->zc_inject_record.zi_cmd = ZINJECT_UNINITIALIZED; - zc->zc_inject_record.zi_pad = 0; - break; - - case ZFS_CMD_COMPAT_V15: - zc_c = (void *)addr; - - /* zc */ - strlcpy(zc->zc_name, zc_c->zc_name, MAXPATHLEN); - strlcpy(zc->zc_value, zc_c->zc_value, MAXPATHLEN); - strlcpy(zc->zc_string, zc_c->zc_string, MAXPATHLEN); - zc->zc_guid = zc_c->zc_guid; - zc->zc_nvlist_conf = zc_c->zc_nvlist_conf; - zc->zc_nvlist_conf_size = zc_c->zc_nvlist_conf_size; - zc->zc_nvlist_src = zc_c->zc_nvlist_src; - zc->zc_nvlist_src_size = zc_c->zc_nvlist_src_size; - zc->zc_nvlist_dst = zc_c->zc_nvlist_dst; - zc->zc_nvlist_dst_size = zc_c->zc_nvlist_dst_size; - zc->zc_cookie = zc_c->zc_cookie; - zc->zc_objset_type = zc_c->zc_objset_type; - zc->zc_perm_action = zc_c->zc_perm_action; - zc->zc_history = zc_c->zc_history; - zc->zc_history_len = zc_c->zc_history_len; - zc->zc_history_offset = zc_c->zc_history_offset; - zc->zc_obj = zc_c->zc_obj; - zc->zc_share = zc_c->zc_share; - zc->zc_jailid = zc_c->zc_jailid; - zc->zc_objset_stats = zc_c->zc_objset_stats; - zc->zc_begin_record.drr_u.drr_begin = zc_c->zc_begin_record; - - /* zc->zc_inject_record */ - zc->zc_inject_record.zi_objset = - 
zc_c->zc_inject_record.zi_objset; - zc->zc_inject_record.zi_object = - zc_c->zc_inject_record.zi_object; - zc->zc_inject_record.zi_start = - zc_c->zc_inject_record.zi_start; - zc->zc_inject_record.zi_end = - zc_c->zc_inject_record.zi_end; - zc->zc_inject_record.zi_guid = - zc_c->zc_inject_record.zi_guid; - zc->zc_inject_record.zi_level = - zc_c->zc_inject_record.zi_level; - zc->zc_inject_record.zi_error = - zc_c->zc_inject_record.zi_error; - zc->zc_inject_record.zi_type = - zc_c->zc_inject_record.zi_type; - zc->zc_inject_record.zi_freq = - zc_c->zc_inject_record.zi_freq; - zc->zc_inject_record.zi_failfast = - zc_c->zc_inject_record.zi_failfast; - break; - } -} - -void -zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, - const int cflag) -{ - zfs_cmd_v15_t *zc_c; - zfs_cmd_v28_t *zc28_c; - zfs_cmd_deadman_t *zcdm_c; - zfs_cmd_zcmd_t *zcmd_c; - zfs_cmd_edbp_t *edbp_c; - zfs_cmd_resume_t *resume_c; - zfs_cmd_inlanes_t *inlanes_c; - - switch (cflag) { - case ZFS_CMD_COMPAT_INLANES: - inlanes_c = (void *)addr; - strlcpy(inlanes_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(inlanes_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(inlanes_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) inlanes_c->field = zc->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - FIELD_COPY(zc_begin_record); - FIELD_COPY(zc_inject_record); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_RESUME: - resume_c = (void *)addr; - strlcpy(resume_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(resume_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(resume_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) resume_c->field = zc->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - FIELD_COPY(zc_begin_record); - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(resume_c->zc_inject_record.zi_func, - 
zc->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_EDBP: - edbp_c = (void *)addr; - strlcpy(edbp_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(edbp_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(edbp_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) edbp_c->field = zc->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - edbp_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(resume_c->zc_inject_record.zi_func, - zc->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - FIELD_COPY(zc_defer_destroy); - FIELD_COPY(zc_flags); - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - break; - - case ZFS_CMD_COMPAT_ZCMD: - zcmd_c = (void *)addr; - /* zc */ - strlcpy(zcmd_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(zcmd_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(zcmd_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zcmd_c->field = zc->field - FIELD_COPY(zc_nvlist_src); - FIELD_COPY(zc_nvlist_src_size); - FIELD_COPY(zc_nvlist_dst); - FIELD_COPY(zc_nvlist_dst_size); - FIELD_COPY(zc_nvlist_dst_filled); - FIELD_COPY(zc_pad2); - FIELD_COPY(zc_history); - FIELD_COPY(zc_guid); - FIELD_COPY(zc_nvlist_conf); - FIELD_COPY(zc_nvlist_conf_size); - FIELD_COPY(zc_cookie); - FIELD_COPY(zc_objset_type); - FIELD_COPY(zc_perm_action); - FIELD_COPY(zc_history_len); - FIELD_COPY(zc_history_offset); - FIELD_COPY(zc_obj); - FIELD_COPY(zc_iflags); - FIELD_COPY(zc_share); - FIELD_COPY(zc_jailid); - FIELD_COPY(zc_objset_stats); - zcmd_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - 
FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(resume_c->zc_inject_record.zi_func, - zc->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); - - /* boolean_t -> uint32_t */ - zcmd_c->zc_defer_destroy = (uint32_t)(zc->zc_defer_destroy); - zcmd_c->zc_temphold = 0; - - FIELD_COPY(zc_action_handle); - FIELD_COPY(zc_cleanup_fd); - FIELD_COPY(zc_simple); - FIELD_COPY(zc_sendobj); - FIELD_COPY(zc_fromobj); - FIELD_COPY(zc_createtxg); - FIELD_COPY(zc_stat); -#undef FIELD_COPY - - break; - - case ZFS_CMD_COMPAT_DEADMAN: - zcdm_c = (void *)addr; - - strlcpy(zcdm_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(zcdm_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(zcdm_c->zc_string, zc->zc_string, MAXPATHLEN); - -#define FIELD_COPY(field) zcdm_c->field = zc->field - zcdm_c->zc_guid = zc->zc_guid; - zcdm_c->zc_nvlist_conf = zc->zc_nvlist_conf; - zcdm_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; - zcdm_c->zc_nvlist_src = zc->zc_nvlist_src; - zcdm_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; - zcdm_c->zc_nvlist_dst = zc->zc_nvlist_dst; - zcdm_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; - zcdm_c->zc_cookie = zc->zc_cookie; - zcdm_c->zc_objset_type = zc->zc_objset_type; - zcdm_c->zc_perm_action = zc->zc_perm_action; - zcdm_c->zc_history = zc->zc_history; - zcdm_c->zc_history_len = zc->zc_history_len; - zcdm_c->zc_history_offset = zc->zc_history_offset; - zcdm_c->zc_obj = zc->zc_obj; - zcdm_c->zc_iflags = zc->zc_iflags; - zcdm_c->zc_share = zc->zc_share; - zcdm_c->zc_jailid = zc->zc_jailid; - zcdm_c->zc_objset_stats = zc->zc_objset_stats; - zcdm_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - zcdm_c->zc_defer_destroy = zc->zc_defer_destroy; - zcdm_c->zc_temphold = 0; - zcdm_c->zc_action_handle = zc->zc_action_handle; - zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd; - zcdm_c->zc_simple = zc->zc_simple; - zcdm_c->zc_sendobj = zc->zc_sendobj; - zcdm_c->zc_fromobj = zc->zc_fromobj; - zcdm_c->zc_createtxg = zc->zc_createtxg; - zcdm_c->zc_stat = zc->zc_stat; - FIELD_COPY(zc_inject_record.zi_objset); - FIELD_COPY(zc_inject_record.zi_object); - FIELD_COPY(zc_inject_record.zi_start); - FIELD_COPY(zc_inject_record.zi_end); - FIELD_COPY(zc_inject_record.zi_guid); - FIELD_COPY(zc_inject_record.zi_level); - FIELD_COPY(zc_inject_record.zi_error); - FIELD_COPY(zc_inject_record.zi_type); - FIELD_COPY(zc_inject_record.zi_freq); - FIELD_COPY(zc_inject_record.zi_failfast); - strlcpy(resume_c->zc_inject_record.zi_func, - zc->zc_inject_record.zi_func, MAXNAMELEN); - FIELD_COPY(zc_inject_record.zi_iotype); - FIELD_COPY(zc_inject_record.zi_duration); - FIELD_COPY(zc_inject_record.zi_timer); - FIELD_COPY(zc_inject_record.zi_cmd); - FIELD_COPY(zc_inject_record.zi_pad); -#undef FIELD_COPY -#ifndef _KERNEL - if (request == ZFS_IOC_RECV) - strlcpy(zcdm_c->zc_top_ds, - zc->zc_value + strlen(zc->zc_value) + 1, - (MAXPATHLEN * 2) - strlen(zc->zc_value) - 1); -#endif - break; - - case ZFS_CMD_COMPAT_V28: - zc28_c = (void *)addr; - - strlcpy(zc28_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(zc28_c->zc_value, zc->zc_value, MAXPATHLEN * 2); - strlcpy(zc28_c->zc_string, zc->zc_string, MAXPATHLEN); - zc28_c->zc_guid = zc->zc_guid; - zc28_c->zc_nvlist_conf = 
zc->zc_nvlist_conf; - zc28_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; - zc28_c->zc_nvlist_src = zc->zc_nvlist_src; - zc28_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; - zc28_c->zc_nvlist_dst = zc->zc_nvlist_dst; - zc28_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; - zc28_c->zc_cookie = zc->zc_cookie; - zc28_c->zc_objset_type = zc->zc_objset_type; - zc28_c->zc_perm_action = zc->zc_perm_action; - zc28_c->zc_history = zc->zc_history; - zc28_c->zc_history_len = zc->zc_history_len; - zc28_c->zc_history_offset = zc->zc_history_offset; - zc28_c->zc_obj = zc->zc_obj; - zc28_c->zc_iflags = zc->zc_iflags; - zc28_c->zc_share = zc->zc_share; - zc28_c->zc_jailid = zc->zc_jailid; - zc28_c->zc_objset_stats = zc->zc_objset_stats; - zc28_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - zc28_c->zc_defer_destroy = zc->zc_defer_destroy; - zc28_c->zc_temphold = 0; - zc28_c->zc_action_handle = zc->zc_action_handle; - zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd; - zc28_c->zc_simple = zc->zc_simple; - zc28_c->zc_sendobj = zc->zc_sendobj; - zc28_c->zc_fromobj = zc->zc_fromobj; - zc28_c->zc_createtxg = zc->zc_createtxg; - zc28_c->zc_stat = zc->zc_stat; -#ifndef _KERNEL - if (request == ZFS_IOC_RECV) - strlcpy(zc28_c->zc_top_ds, - zc->zc_value + strlen(zc->zc_value) + 1, - MAXPATHLEN * 2 - strlen(zc->zc_value) - 1); -#endif - /* zc_inject_record */ - zc28_c->zc_inject_record.zi_objset = - zc->zc_inject_record.zi_objset; - zc28_c->zc_inject_record.zi_object = - zc->zc_inject_record.zi_object; - zc28_c->zc_inject_record.zi_start = - zc->zc_inject_record.zi_start; - zc28_c->zc_inject_record.zi_end = - zc->zc_inject_record.zi_end; - zc28_c->zc_inject_record.zi_guid = - zc->zc_inject_record.zi_guid; - zc28_c->zc_inject_record.zi_level = - zc->zc_inject_record.zi_level; - zc28_c->zc_inject_record.zi_error = - zc->zc_inject_record.zi_error; - zc28_c->zc_inject_record.zi_type = - zc->zc_inject_record.zi_type; - zc28_c->zc_inject_record.zi_freq = - zc->zc_inject_record.zi_freq; - zc28_c->zc_inject_record.zi_failfast = - zc->zc_inject_record.zi_failfast; - strlcpy(zc28_c->zc_inject_record.zi_func, - zc->zc_inject_record.zi_func, MAXNAMELEN); - zc28_c->zc_inject_record.zi_iotype = - zc->zc_inject_record.zi_iotype; - zc28_c->zc_inject_record.zi_duration = - zc->zc_inject_record.zi_duration; - zc28_c->zc_inject_record.zi_timer = - zc->zc_inject_record.zi_timer; - break; - - case ZFS_CMD_COMPAT_V15: - zc_c = (void *)addr; - - /* zc */ - strlcpy(zc_c->zc_name, zc->zc_name, MAXPATHLEN); - strlcpy(zc_c->zc_value, zc->zc_value, MAXPATHLEN); - strlcpy(zc_c->zc_string, zc->zc_string, MAXPATHLEN); - zc_c->zc_guid = zc->zc_guid; - zc_c->zc_nvlist_conf = zc->zc_nvlist_conf; - zc_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; - zc_c->zc_nvlist_src = zc->zc_nvlist_src; - zc_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; - zc_c->zc_nvlist_dst = zc->zc_nvlist_dst; - zc_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; - zc_c->zc_cookie = zc->zc_cookie; - zc_c->zc_objset_type = zc->zc_objset_type; - zc_c->zc_perm_action = zc->zc_perm_action; - zc_c->zc_history = zc->zc_history; - zc_c->zc_history_len = zc->zc_history_len; - zc_c->zc_history_offset = zc->zc_history_offset; - zc_c->zc_obj = zc->zc_obj; - zc_c->zc_share = zc->zc_share; - zc_c->zc_jailid = zc->zc_jailid; - zc_c->zc_objset_stats = zc->zc_objset_stats; - zc_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - - /* zc_inject_record */ - zc_c->zc_inject_record.zi_objset = - zc->zc_inject_record.zi_objset; - zc_c->zc_inject_record.zi_object = - 
zc->zc_inject_record.zi_object; - zc_c->zc_inject_record.zi_start = - zc->zc_inject_record.zi_start; - zc_c->zc_inject_record.zi_end = - zc->zc_inject_record.zi_end; - zc_c->zc_inject_record.zi_guid = - zc->zc_inject_record.zi_guid; - zc_c->zc_inject_record.zi_level = - zc->zc_inject_record.zi_level; - zc_c->zc_inject_record.zi_error = - zc->zc_inject_record.zi_error; - zc_c->zc_inject_record.zi_type = - zc->zc_inject_record.zi_type; - zc_c->zc_inject_record.zi_freq = - zc->zc_inject_record.zi_freq; - zc_c->zc_inject_record.zi_failfast = - zc->zc_inject_record.zi_failfast; - - break; - } -} - -static int -zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag, - nvlist_t **nvp) -{ - char *packed; - int error; - nvlist_t *list = NULL; - - /* - * Read in and unpack the user-supplied nvlist. - */ - if (size == 0) - return (EINVAL); - -#ifdef _KERNEL - packed = kmem_alloc(size, KM_SLEEP); - if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, - iflag)) != 0) { - kmem_free(packed, size); - return (error); - } -#else - packed = (void *)(uintptr_t)nvl; -#endif - - error = nvlist_unpack(packed, size, &list, 0); - -#ifdef _KERNEL - kmem_free(packed, size); -#endif - - if (error != 0) - return (error); - - *nvp = list; - return (0); -} - -static int -zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) -{ - char *packed = NULL; - int error = 0; - size_t size; - - VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); - -#ifdef _KERNEL - packed = kmem_alloc(size, KM_SLEEP); - VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, - KM_SLEEP) == 0); - - if (ddi_copyout(packed, - (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) - error = EFAULT; - kmem_free(packed, size); -#else - packed = (void *)(uintptr_t)zc->zc_nvlist_dst; - VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, - 0) == 0); -#endif - - zc->zc_nvlist_dst_size = size; - return (error); -} - -static void -zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl) -{ - nvlist_t **child; - nvlist_t *nvroot = NULL; - vdev_stat_t *vs; - uint_t c, children, nelem; - - if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) { - zfs_ioctl_compat_fix_stats_nvlist(child[c]); - } - } - - if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0) - zfs_ioctl_compat_fix_stats_nvlist(nvroot); -#ifdef _KERNEL - if ((nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, -#else - if ((nvlist_lookup_uint64_array(nvl, "stats", -#endif - - (uint64_t **)&vs, &nelem) == 0)) { - nvlist_add_uint64_array(nvl, -#ifdef _KERNEL - "stats", -#else - ZPOOL_CONFIG_VDEV_STATS, -#endif - (uint64_t *)vs, nelem); -#ifdef _KERNEL - nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, -#else - nvlist_remove(nvl, "stats", -#endif - DATA_TYPE_UINT64_ARRAY); - } -} - -static int -zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc) -{ - nvlist_t *nv, *nvp = NULL; - nvpair_t *elem; - int error; - - if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) - return (error); - - if (nc == 5) { /* ZFS_IOC_POOL_STATS */ - elem = NULL; - while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) { - if (nvpair_value_nvlist(elem, &nvp) == 0) - zfs_ioctl_compat_fix_stats_nvlist(nvp); - } - elem = NULL; - } else - zfs_ioctl_compat_fix_stats_nvlist(nv); - - error = zfs_ioctl_compat_put_nvlist(zc, nv); - - nvlist_free(nv); - - return (error); -} - -static int -zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc) -{ - nvlist_t *nv, 
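[Editor's note] zfs_ioctl_compat_get_nvlist()/zfs_ioctl_compat_put_nvlist() above show the convention every zfs_cmd revision relies on: nvlists cross the ioctl boundary as a packed native-encoding buffer whose address and length travel in the uint64_t zc_nvlist_* fields ("really (char *)"). A rough userland-side sketch of producing such a buffer with libnvpair; the helper name and the sample property are illustrative, and the caller would free the packed buffer after the ioctl returns.

#include <stdint.h>
#include <stdlib.h>
#include <libnvpair.h>

static int
pack_src_nvlist(uint64_t *addrp, uint64_t *sizep)
{
        nvlist_t *nvl;
        char *packed = NULL;
        size_t size = 0;

        if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
                return (-1);
        if (nvlist_add_string(nvl, "origin", "tank/fs@snap") != 0 ||
            nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, 0) != 0) {
                nvlist_free(nvl);
                return (-1);
        }
        nvlist_free(nvl);
        /* Address and size in the form zc_nvlist_src / _src_size expect. */
        *addrp = (uint64_t)(uintptr_t)packed;
        *sizep = size;
        return (0);
}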
*nva = NULL; - int error; - - if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) - return (error); - -#ifdef _KERNEL - if (nvlist_lookup_nvlist(nv, "allocated", &nva) == 0) { - nvlist_add_nvlist(nv, "used", nva); - nvlist_remove(nv, "allocated", DATA_TYPE_NVLIST); - } - - if (nvlist_lookup_nvlist(nv, "free", &nva) == 0) { - nvlist_add_nvlist(nv, "available", nva); - nvlist_remove(nv, "free", DATA_TYPE_NVLIST); - } -#else - if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) { - nvlist_add_nvlist(nv, "allocated", nva); - nvlist_remove(nv, "used", DATA_TYPE_NVLIST); - } - - if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) { - nvlist_add_nvlist(nv, "free", nva); - nvlist_remove(nv, "available", DATA_TYPE_NVLIST); - } -#endif - - error = zfs_ioctl_compat_put_nvlist(zc, nv); - - nvlist_free(nv); - - return (error); -} - -#ifndef _KERNEL -int -zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) -{ - int nc, ret; - void *zc_c; - unsigned long ncmd; - zfs_iocparm_t zp; - - switch (cflag) { - case ZFS_CMD_COMPAT_NONE: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = sizeof(zfs_cmd_t); - zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_INLANES: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = sizeof(zfs_cmd_inlanes_t); - zp.zfs_ioctl_version = ZFS_IOCVER_INLANES; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_RESUME: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = sizeof(zfs_cmd_resume_t); - zp.zfs_ioctl_version = ZFS_IOCVER_RESUME; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_EDBP: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = sizeof(zfs_cmd_edbp_t); - zp.zfs_ioctl_version = ZFS_IOCVER_EDBP; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_ZCMD: - ncmd = _IOWR('Z', request, struct zfs_iocparm); - zp.zfs_cmd = (uint64_t)zc; - zp.zfs_cmd_size = sizeof(zfs_cmd_zcmd_t); - zp.zfs_ioctl_version = ZFS_IOCVER_ZCMD; - return (ioctl(fd, ncmd, &zp)); - case ZFS_CMD_COMPAT_LZC: - ncmd = _IOWR('Z', request, struct zfs_cmd); - return (ioctl(fd, ncmd, zc)); - case ZFS_CMD_COMPAT_DEADMAN: - zc_c = malloc(sizeof(zfs_cmd_deadman_t)); - ncmd = _IOWR('Z', request, struct zfs_cmd_deadman); - break; - case ZFS_CMD_COMPAT_V28: - zc_c = malloc(sizeof(zfs_cmd_v28_t)); - ncmd = _IOWR('Z', request, struct zfs_cmd_v28); - break; - case ZFS_CMD_COMPAT_V15: - nc = zfs_ioctl_v28_to_v15[request]; - zc_c = malloc(sizeof(zfs_cmd_v15_t)); - ncmd = _IOWR('Z', nc, struct zfs_cmd_v15); - break; - default: - return (EINVAL); - } - - if (ZFS_IOCREQ(ncmd) == ZFS_IOC_COMPAT_FAIL) - return (ENOTSUP); - - zfs_cmd_compat_put(zc, (caddr_t)zc_c, request, cflag); - - ret = ioctl(fd, ncmd, zc_c); - if (cflag == ZFS_CMD_COMPAT_V15 && - nc == ZFS_IOC_POOL_IMPORT) - ret = ioctl(fd, _IOWR('Z', ZFS_IOC_POOL_CONFIGS, - struct zfs_cmd_v15), zc_c); - zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag); - free(zc_c); - - if (cflag == ZFS_CMD_COMPAT_V15) { - switch (nc) { - case ZFS_IOC_POOL_IMPORT: - case ZFS_IOC_POOL_CONFIGS: - case ZFS_IOC_POOL_STATS: - case ZFS_IOC_POOL_TRYIMPORT: - zfs_ioctl_compat_fix_stats(zc, nc); - break; - case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ - zfs_ioctl_compat_pool_get_props(zc); - break; - } - } - - return (ret); -} -#else /* _KERNEL */ -int -zfs_ioctl_compat_pre(zfs_cmd_t 
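[Editor's note] For current binaries (ZFS_CMD_COMPAT_NONE) the deleted zcmd_ioctl_compat() above performs no structure translation at all: it wraps the zfs_cmd_t pointer and size in the small zfs_iocparm_t envelope from the removed header and issues the 'Z' ioctl. A standalone sketch of that path; struct zfs_iocparm is re-declared locally here purely for illustration.

#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Local mirror of the zfs_iocparm_t removed above. */
struct zfs_iocparm {
        uint32_t        zfs_ioctl_version;
        uint64_t        zfs_cmd;
        uint64_t        zfs_cmd_size;
};

#define ZFS_IOCVER_CURRENT      7       /* ZFS_IOCVER_PAD in the deleted header */

static int
zcmd_ioctl_current(int fd, int request, void *zc, size_t zc_size)
{
        struct zfs_iocparm zp;

        zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT;
        zp.zfs_cmd = (uint64_t)(uintptr_t)zc;   /* pointer carried as uint64_t */
        zp.zfs_cmd_size = zc_size;
        return (ioctl(fd, _IOWR('Z', request, struct zfs_iocparm), &zp));
}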
*zc, int *vec, const int cflag) -{ - int error = 0; - - /* are we creating a clone? */ - if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0') - *vec = ZFS_IOC_CLONE; - - if (cflag == ZFS_CMD_COMPAT_V15) { - switch (*vec) { - - case 7: /* ZFS_IOC_POOL_SCRUB (v15) */ - zc->zc_cookie = POOL_SCAN_SCRUB; - break; - } - } - - return (error); -} - -void -zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag) -{ - if (cflag == ZFS_CMD_COMPAT_V15) { - switch (vec) { - case ZFS_IOC_POOL_CONFIGS: - case ZFS_IOC_POOL_STATS: - case ZFS_IOC_POOL_TRYIMPORT: - zfs_ioctl_compat_fix_stats(zc, vec); - break; - case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ - zfs_ioctl_compat_pool_get_props(zc); - break; - } - } -} - -nvlist_t * -zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t * innvl, const int vec, - const int cflag) -{ - nvlist_t *nvl, *tmpnvl, *hnvl; - nvpair_t *elem; - char *poolname, *snapname; - int err; - - if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || - cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) - goto out; - - switch (vec) { - case ZFS_IOC_CREATE: - nvl = fnvlist_alloc(); - fnvlist_add_int32(nvl, "type", zc->zc_objset_type); - if (innvl != NULL) { - fnvlist_add_nvlist(nvl, "props", innvl); - nvlist_free(innvl); - } - return (nvl); - break; - case ZFS_IOC_CLONE: - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, "origin", zc->zc_value); - if (innvl != NULL) { - fnvlist_add_nvlist(nvl, "props", innvl); - nvlist_free(innvl); - } - return (nvl); - break; - case ZFS_IOC_SNAPSHOT: - if (innvl == NULL) - goto out; - nvl = fnvlist_alloc(); - fnvlist_add_nvlist(nvl, "props", innvl); - tmpnvl = fnvlist_alloc(); - snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); - fnvlist_add_boolean(tmpnvl, snapname); - kmem_free(snapname, strlen(snapname + 1)); - /* check if we are doing a recursive snapshot */ - if (zc->zc_cookie) - dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, - tmpnvl); - fnvlist_add_nvlist(nvl, "snaps", tmpnvl); - fnvlist_free(tmpnvl); - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - case ZFS_IOC_SPACE_SNAPS: - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, "firstsnap", zc->zc_value); - if (innvl != NULL) - nvlist_free(innvl); - return (nvl); - break; - case ZFS_IOC_DESTROY_SNAPS: - if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN) - goto out; - nvl = fnvlist_alloc(); - if (innvl != NULL) { - fnvlist_add_nvlist(nvl, "snaps", innvl); - } else { - /* - * We are probably called by even older binaries, - * allocate and populate nvlist with recursive - * snapshots - */ - if (zfs_component_namecheck(zc->zc_value, NULL, - NULL) == 0) { - tmpnvl = fnvlist_alloc(); - if (dmu_get_recursive_snaps_nvl(zc->zc_name, - zc->zc_value, tmpnvl) == 0) - fnvlist_add_nvlist(nvl, "snaps", - tmpnvl); - nvlist_free(tmpnvl); - } - } - if (innvl != NULL) - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - case ZFS_IOC_HOLD: - nvl = fnvlist_alloc(); - tmpnvl = fnvlist_alloc(); - if (zc->zc_cleanup_fd != -1) - fnvlist_add_int32(nvl, "cleanup_fd", - (int32_t)zc->zc_cleanup_fd); - if (zc->zc_cookie) { - hnvl = fnvlist_alloc(); - if (dmu_get_recursive_snaps_nvl(zc->zc_name, - zc->zc_value, hnvl) == 0) { - elem = NULL; - while ((elem = nvlist_next_nvpair(hnvl, - elem)) != NULL) { - nvlist_add_string(tmpnvl, - 
nvpair_name(elem), zc->zc_string); - } - } - nvlist_free(hnvl); - } else { - snapname = kmem_asprintf("%s@%s", zc->zc_name, - zc->zc_value); - nvlist_add_string(tmpnvl, snapname, zc->zc_string); - kmem_free(snapname, strlen(snapname + 1)); - } - fnvlist_add_nvlist(nvl, "holds", tmpnvl); - nvlist_free(tmpnvl); - if (innvl != NULL) - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - case ZFS_IOC_RELEASE: - nvl = fnvlist_alloc(); - tmpnvl = fnvlist_alloc(); - if (zc->zc_cookie) { - hnvl = fnvlist_alloc(); - if (dmu_get_recursive_snaps_nvl(zc->zc_name, - zc->zc_value, hnvl) == 0) { - elem = NULL; - while ((elem = nvlist_next_nvpair(hnvl, - elem)) != NULL) { - fnvlist_add_boolean(tmpnvl, - zc->zc_string); - fnvlist_add_nvlist(nvl, - nvpair_name(elem), tmpnvl); - } - } - nvlist_free(hnvl); - } else { - snapname = kmem_asprintf("%s@%s", zc->zc_name, - zc->zc_value); - fnvlist_add_boolean(tmpnvl, zc->zc_string); - fnvlist_add_nvlist(nvl, snapname, tmpnvl); - kmem_free(snapname, strlen(snapname + 1)); - } - nvlist_free(tmpnvl); - if (innvl != NULL) - nvlist_free(innvl); - /* strip dataset part from zc->zc_name */ - zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; - return (nvl); - break; - } -out: - return (innvl); -} - -nvlist_t * -zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t * outnvl, const int vec, - const int cflag) -{ - nvlist_t *tmpnvl; - - if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || - cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) - return (outnvl); - - switch (vec) { - case ZFS_IOC_SPACE_SNAPS: - (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie); - (void) nvlist_lookup_uint64(outnvl, "compressed", - &zc->zc_objset_type); - (void) nvlist_lookup_uint64(outnvl, "uncompressed", - &zc->zc_perm_action); - nvlist_free(outnvl); - /* return empty outnvl */ - tmpnvl = fnvlist_alloc(); - return (tmpnvl); - break; - case ZFS_IOC_CREATE: - case ZFS_IOC_CLONE: - case ZFS_IOC_HOLD: - case ZFS_IOC_RELEASE: - nvlist_free(outnvl); - /* return empty outnvl */ - tmpnvl = fnvlist_alloc(); - return (tmpnvl); - break; - } - - return (outnvl); -} -#endif /* KERNEL */ Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 
- */ - -#ifndef _ZFS_NAMECHECK_H -#define _ZFS_NAMECHECK_H - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum { - NAME_ERR_LEADING_SLASH, /* name begins with leading slash */ - NAME_ERR_EMPTY_COMPONENT, /* name contains an empty component */ - NAME_ERR_TRAILING_SLASH, /* name ends with a slash */ - NAME_ERR_INVALCHAR, /* invalid character found */ - NAME_ERR_MULTIPLE_DELIMITERS, /* multiple '@'/'#' delimiters found */ - NAME_ERR_NOLETTER, /* pool doesn't begin with a letter */ - NAME_ERR_RESERVED, /* entire name is reserved */ - NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */ - NAME_ERR_TOOLONG, /* name is too long */ - NAME_ERR_NO_AT, /* permission set is missing '@' */ -} namecheck_err_t; - -#define ZFS_PERMSET_MAXLEN 64 - -extern int zfs_max_dataset_nesting; - -int get_dataset_depth(const char *); -int pool_namecheck(const char *, namecheck_err_t *, char *); -int entity_namecheck(const char *, namecheck_err_t *, char *); -int dataset_namecheck(const char *, namecheck_err_t *, char *); -int dataset_nestcheck(const char *); -int mountpoint_namecheck(const char *, namecheck_err_t *); -int zfs_component_namecheck(const char *, namecheck_err_t *, char *); -int permset_namecheck(const char *, namecheck_err_t *, char *); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_NAMECHECK_H */ Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c +++ /dev/null @@ -1,399 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -/* - * Common name validation routines for ZFS. These routines are shared by the - * userland code as well as the ioctl() layer to ensure that we don't - * inadvertently expose a hole through direct ioctl()s that never gets tested. - * In userland, however, we want significantly more information about _why_ the - * name is invalid. In the kernel, we only care whether it's valid or not. - * Each routine therefore takes a 'namecheck_err_t' which describes exactly why - * the name failed to validate. - */ - -#if defined(_KERNEL) -#include -#else -#include -#endif - -#include -#include -#include -#include "zfs_namecheck.h" -#include "zfs_deleg.h" - -/* - * Deeply nested datasets can overflow the stack, so we put a limit - * in the amount of nesting a path can have. zfs_max_dataset_nesting - * can be tuned temporarily to fix existing datasets that exceed our - * predefined limit. 
- */ -int zfs_max_dataset_nesting = 50; - -static int -valid_char(char c) -{ - return ((c >= 'a' && c <= 'z') || - (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || - c == '-' || c == '_' || c == '.' || c == ':' || c == ' '); -} - -/* - * Looks at a path and returns its level of nesting (depth). - */ -int -get_dataset_depth(const char *path) -{ - const char *loc = path; - int nesting = 0; - - /* - * Keep track of nesting until you hit the end of the - * path or found the snapshot/bookmark seperator. - */ - for (int i = 0; loc[i] != '\0' && - loc[i] != '@' && - loc[i] != '#'; i++) { - if (loc[i] == '/') - nesting++; - } - - return (nesting); -} - -/* - * Snapshot names must be made up of alphanumeric characters plus the following - * characters: - * - * [-_.: ] - * - * Returns 0 on success, -1 on error. - */ -int -zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what) -{ - const char *loc; - - if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - - if (path[0] == '\0') { - if (why) - *why = NAME_ERR_EMPTY_COMPONENT; - return (-1); - } - - for (loc = path; *loc; loc++) { - if (!valid_char(*loc)) { - if (why) { - *why = NAME_ERR_INVALCHAR; - *what = *loc; - } - return (-1); - } - } - return (0); -} - - -/* - * Permissions set name must start with the letter '@' followed by the - * same character restrictions as snapshot names, except that the name - * cannot exceed 64 characters. - * - * Returns 0 on success, -1 on error. - */ -int -permset_namecheck(const char *path, namecheck_err_t *why, char *what) -{ - if (strlen(path) >= ZFS_PERMSET_MAXLEN) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - - if (path[0] != '@') { - if (why) { - *why = NAME_ERR_NO_AT; - *what = path[0]; - } - return (-1); - } - - return (zfs_component_namecheck(&path[1], why, what)); -} - -/* - * Dataset paths should not be deeper than zfs_max_dataset_nesting - * in terms of nesting. - * - * Returns 0 on success, -1 on error. - */ -int -dataset_nestcheck(const char *path) -{ - return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1); -} - -/* - * Entity names must be of the following form: - * - * [component/]*[component][(@|#)component]? - * - * Where each component is made up of alphanumeric characters plus the following - * characters: - * - * [-_.:%] - * - * We allow '%' here as we use that character internally to create unique - * names for temporary clones (for online recv). - * - * Returns 0 on success, -1 on error. - */ -int -entity_namecheck(const char *path, namecheck_err_t *why, char *what) -{ - const char *end; - - /* - * Make sure the name is not too long. - */ - if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - - /* Explicitly check for a leading slash. 
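[Editor's note] The removed get_dataset_depth() counts '/' separators and stops at the first '@' or '#' delimiter, and dataset_nestcheck() compares the result against the zfs_max_dataset_nesting limit of 50. A small worked example; the depth logic is re-stated inline here so the snippet runs on its own.

#include <stdio.h>

/* Standalone restatement of the deleted get_dataset_depth() logic. */
static int
dataset_depth(const char *path)
{
        int nesting = 0;

        for (int i = 0; path[i] != '\0' && path[i] != '@' &&
            path[i] != '#'; i++) {
                if (path[i] == '/')
                        nesting++;
        }
        return (nesting);
}

int
main(void)
{
        /*
         * "tank" -> 0, "tank/a/b" -> 2, "tank/a@snap/oops" -> 1: the walk
         * stops at '@', so slashes after the delimiter are not counted
         * (entity_namecheck() rejects such names anyway).
         */
        printf("%d %d %d\n", dataset_depth("tank"),
            dataset_depth("tank/a/b"), dataset_depth("tank/a@snap/oops"));
        return (0);
}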
*/ - if (path[0] == '/') { - if (why) - *why = NAME_ERR_LEADING_SLASH; - return (-1); - } - - if (path[0] == '\0') { - if (why) - *why = NAME_ERR_EMPTY_COMPONENT; - return (-1); - } - - const char *start = path; - boolean_t found_delim = B_FALSE; - for (;;) { - /* Find the end of this component */ - end = start; - while (*end != '/' && *end != '@' && *end != '#' && - *end != '\0') - end++; - - if (*end == '\0' && end[-1] == '/') { - /* trailing slashes are not allowed */ - if (why) - *why = NAME_ERR_TRAILING_SLASH; - return (-1); - } - - /* Validate the contents of this component */ - for (const char *loc = start; loc != end; loc++) { - if (!valid_char(*loc) && *loc != '%') { - if (why) { - *why = NAME_ERR_INVALCHAR; - *what = *loc; - } - return (-1); - } - } - - /* Snapshot or bookmark delimiter found */ - if (*end == '@' || *end == '#') { - /* Multiple delimiters are not allowed */ - if (found_delim != 0) { - if (why) - *why = NAME_ERR_MULTIPLE_DELIMITERS; - return (-1); - } - - found_delim = B_TRUE; - } - - /* Zero-length components are not allowed */ - if (start == end) { - if (why) - *why = NAME_ERR_EMPTY_COMPONENT; - return (-1); - } - - /* If we've reached the end of the string, we're OK */ - if (*end == '\0') - return (0); - - /* - * If there is a '/' in a snapshot or bookmark name - * then report an error - */ - if (*end == '/' && found_delim != 0) { - if (why) - *why = NAME_ERR_TRAILING_SLASH; - return (-1); - } - - /* Update to the next component */ - start = end + 1; - } -} - -/* - * Dataset is any entity, except bookmark - */ -int -dataset_namecheck(const char *path, namecheck_err_t *why, char *what) -{ - int ret = entity_namecheck(path, why, what); - - if (ret == 0 && strchr(path, '#') != NULL) { - if (why != NULL) { - *why = NAME_ERR_INVALCHAR; - *what = '#'; - } - return (-1); - } - - return (ret); -} - -/* - * mountpoint names must be of the following form: - * - * /[component][/]*[component][/] - * - * Returns 0 on success, -1 on error. - */ -int -mountpoint_namecheck(const char *path, namecheck_err_t *why) -{ - const char *start, *end; - - /* - * Make sure none of the mountpoint component names are too long. - * If a component name is too long then the mkdir of the mountpoint - * will fail but then the mountpoint property will be set to a value - * that can never be mounted. Better to fail before setting the prop. - * Extra slashes are OK, they will be tossed by the mountpoint mkdir. - */ - - if (path == NULL || *path != '/') { - if (why) - *why = NAME_ERR_LEADING_SLASH; - return (-1); - } - - /* Skip leading slash */ - start = &path[1]; - do { - end = start; - while (*end != '/' && *end != '\0') - end++; - - if (end - start >= ZFS_MAX_DATASET_NAME_LEN) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - start = end + 1; - - } while (*end != '\0'); - - return (0); -} - -/* - * For pool names, we have the same set of valid characters as described in - * dataset names, with the additional restriction that the pool name must begin - * with a letter. The pool names 'raidz' and 'mirror' are also reserved names - * that cannot be used. - * - * Returns 0 on success, -1 on error. - */ -int -pool_namecheck(const char *pool, namecheck_err_t *why, char *what) -{ - const char *c; - - /* - * Make sure the name is not too long. - * If we're creating a pool with version >= SPA_VERSION_DSL_SCRUB (v11) - * we need to account for additional space needed by the origin ds which - * will also be snapshotted: "poolname"+"/"+"$ORIGIN"+"@"+"$ORIGIN". 
- * Play it safe and enforce this limit even if the pool version is < 11 - * so it can be upgraded without issues. - */ - if (strlen(pool) >= (ZFS_MAX_DATASET_NAME_LEN - 2 - - strlen(ORIGIN_DIR_NAME) * 2)) { - if (why) - *why = NAME_ERR_TOOLONG; - return (-1); - } - - c = pool; - while (*c != '\0') { - if (!valid_char(*c)) { - if (why) { - *why = NAME_ERR_INVALCHAR; - *what = *c; - } - return (-1); - } - c++; - } - - if (!(*pool >= 'a' && *pool <= 'z') && - !(*pool >= 'A' && *pool <= 'Z')) { - if (why) - *why = NAME_ERR_NOLETTER; - return (-1); - } - - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { - if (why) - *why = NAME_ERR_RESERVED; - return (-1); - } - - if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) { - if (why) - *why = NAME_ERR_DISKLIKE; - return (-1); - } - - return (0); -} Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _ZFS_PROP_H -#define _ZFS_PROP_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * For index types (e.g. compression and checksum), we want the numeric value - * in the kernel, but the string value in userland. - */ -typedef enum { - PROP_TYPE_NUMBER, /* numeric value */ - PROP_TYPE_STRING, /* string value */ - PROP_TYPE_INDEX /* numeric value indexed by string */ -} zprop_type_t; - -typedef enum { - PROP_DEFAULT, - PROP_READONLY, - PROP_INHERIT, - /* - * ONETIME properties are a sort of conglomeration of READONLY - * and INHERIT. They can be set only during object creation, - * after that they are READONLY. If not explicitly set during - * creation, they can be inherited. 
- */ - PROP_ONETIME -} zprop_attr_t; - -typedef struct zfs_index { - const char *pi_name; - uint64_t pi_value; -} zprop_index_t; - -typedef struct { - const char *pd_name; /* human-readable property name */ - int pd_propnum; /* property number */ - zprop_type_t pd_proptype; /* string, boolean, index, number */ - const char *pd_strdefault; /* default for strings */ - uint64_t pd_numdefault; /* for boolean / index / number */ - zprop_attr_t pd_attr; /* default, readonly, inherit */ - int pd_types; /* bitfield of valid dataset types */ - /* fs | vol | snap; or pool */ - const char *pd_values; /* string telling acceptable values */ - const char *pd_colname; /* column header for "zfs list" */ - boolean_t pd_rightalign; /* column alignment for "zfs list" */ - boolean_t pd_visible; /* do we list this property with the */ - /* "zfs get" help message */ - const zprop_index_t *pd_table; /* for index properties, a table */ - /* defining the possible values */ - size_t pd_table_size; /* number of entries in pd_table[] */ -} zprop_desc_t; - -/* - * zfs dataset property functions - */ -void zfs_prop_init(void); -zprop_type_t zfs_prop_get_type(zfs_prop_t); -boolean_t zfs_prop_delegatable(zfs_prop_t prop); -zprop_desc_t *zfs_prop_get_table(void); - -/* - * zpool property functions - */ -void zpool_prop_init(void); -zprop_type_t zpool_prop_get_type(zpool_prop_t); -zprop_desc_t *zpool_prop_get_table(void); - -/* - * Common routines to initialize property tables - */ -void zprop_register_impl(int, const char *, zprop_type_t, uint64_t, - const char *, zprop_attr_t, int, const char *, const char *, - boolean_t, boolean_t, const zprop_index_t *); -void zprop_register_string(int, const char *, const char *, - zprop_attr_t attr, int, const char *, const char *); -void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int, - const char *, const char *); -void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int, - const char *, const char *, const zprop_index_t *); -void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t, - int, const char *); - -/* - * Common routines for zfs and zpool property management - */ -int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t); -int zprop_name_to_prop(const char *, zfs_type_t); -int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t); -int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t); -uint64_t zprop_random_value(int, uint64_t, zfs_type_t); -const char *zprop_values(int, zfs_type_t); -size_t zprop_width(int, boolean_t *, zfs_type_t); -boolean_t zprop_valid_for_type(int, zfs_type_t); -boolean_t zfs_prop_written(const char *name); - - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_PROP_H */ Index: sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ /dev/null @@ -1,718 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include - -#include "zfs_prop.h" -#include "zfs_deleg.h" - -#if defined(_KERNEL) -#include -#else -#include -#include -#include -#endif - -static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; - -/* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ -const char *zfs_userquota_prop_prefixes[] = { - "userused@", - "userquota@", - "groupused@", - "groupquota@" -}; - -zprop_desc_t * -zfs_prop_get_table(void) -{ - return (zfs_prop_table); -} - -void -zfs_prop_init(void) -{ - static zprop_index_t checksum_table[] = { - { "on", ZIO_CHECKSUM_ON }, - { "off", ZIO_CHECKSUM_OFF }, - { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, - { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, - { "sha256", ZIO_CHECKSUM_SHA256 }, - { "noparity", ZIO_CHECKSUM_NOPARITY }, - { "sha512", ZIO_CHECKSUM_SHA512 }, - { "skein", ZIO_CHECKSUM_SKEIN }, -#ifdef illumos - { "edonr", ZIO_CHECKSUM_EDONR }, -#endif - { NULL } - }; - - static zprop_index_t dedup_table[] = { - { "on", ZIO_CHECKSUM_ON }, - { "off", ZIO_CHECKSUM_OFF }, - { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, - { "sha256", ZIO_CHECKSUM_SHA256 }, - { "sha256,verify", - ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, - { "sha512", ZIO_CHECKSUM_SHA512 }, - { "sha512,verify", - ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY }, - { "skein", ZIO_CHECKSUM_SKEIN }, - { "skein,verify", - ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, -#ifdef illumos - { "edonr,verify", - ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, -#endif - { NULL } - }; - - static zprop_index_t compress_table[] = { - { "on", ZIO_COMPRESS_ON }, - { "off", ZIO_COMPRESS_OFF }, - { "lzjb", ZIO_COMPRESS_LZJB }, - { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */ - { "gzip-1", ZIO_COMPRESS_GZIP_1 }, - { "gzip-2", ZIO_COMPRESS_GZIP_2 }, - { "gzip-3", ZIO_COMPRESS_GZIP_3 }, - { "gzip-4", ZIO_COMPRESS_GZIP_4 }, - { "gzip-5", ZIO_COMPRESS_GZIP_5 }, - { "gzip-6", ZIO_COMPRESS_GZIP_6 }, - { "gzip-7", ZIO_COMPRESS_GZIP_7 }, - { "gzip-8", ZIO_COMPRESS_GZIP_8 }, - { "gzip-9", ZIO_COMPRESS_GZIP_9 }, - { "zle", ZIO_COMPRESS_ZLE }, - { "lz4", ZIO_COMPRESS_LZ4 }, - { NULL } - }; - - static zprop_index_t snapdir_table[] = { - { "hidden", ZFS_SNAPDIR_HIDDEN }, - { "visible", ZFS_SNAPDIR_VISIBLE }, - { NULL } - }; - - static zprop_index_t acl_mode_table[] = { - { "discard", ZFS_ACL_DISCARD }, - { "groupmask", ZFS_ACL_GROUPMASK }, - { "passthrough", ZFS_ACL_PASSTHROUGH }, - { "restricted", ZFS_ACL_RESTRICTED }, - { NULL } - }; - - static zprop_index_t acl_inherit_table[] = { - { "discard", ZFS_ACL_DISCARD }, - { "noallow", ZFS_ACL_NOALLOW }, - { "restricted", ZFS_ACL_RESTRICTED }, - { "passthrough", ZFS_ACL_PASSTHROUGH }, - { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */ - { "passthrough-x", ZFS_ACL_PASSTHROUGH_X }, - 
{ NULL } - }; - - static zprop_index_t case_table[] = { - { "sensitive", ZFS_CASE_SENSITIVE }, - { "insensitive", ZFS_CASE_INSENSITIVE }, - { "mixed", ZFS_CASE_MIXED }, - { NULL } - }; - - static zprop_index_t copies_table[] = { - { "1", 1 }, - { "2", 2 }, - { "3", 3 }, - { NULL } - }; - - /* - * Use the unique flags we have to send to u8_strcmp() and/or - * u8_textprep() to represent the various normalization property - * values. - */ - static zprop_index_t normalize_table[] = { - { "none", 0 }, - { "formD", U8_TEXTPREP_NFD }, - { "formKC", U8_TEXTPREP_NFKC }, - { "formC", U8_TEXTPREP_NFC }, - { "formKD", U8_TEXTPREP_NFKD }, - { NULL } - }; - - static zprop_index_t version_table[] = { - { "1", 1 }, - { "2", 2 }, - { "3", 3 }, - { "4", 4 }, - { "5", 5 }, - { "current", ZPL_VERSION }, - { NULL } - }; - - static zprop_index_t boolean_table[] = { - { "off", 0 }, - { "on", 1 }, - { NULL } - }; - - static zprop_index_t logbias_table[] = { - { "latency", ZFS_LOGBIAS_LATENCY }, - { "throughput", ZFS_LOGBIAS_THROUGHPUT }, - { NULL } - }; - - static zprop_index_t canmount_table[] = { - { "off", ZFS_CANMOUNT_OFF }, - { "on", ZFS_CANMOUNT_ON }, - { "noauto", ZFS_CANMOUNT_NOAUTO }, - { NULL } - }; - - static zprop_index_t cache_table[] = { - { "none", ZFS_CACHE_NONE }, - { "metadata", ZFS_CACHE_METADATA }, - { "all", ZFS_CACHE_ALL }, - { NULL } - }; - - static zprop_index_t sync_table[] = { - { "standard", ZFS_SYNC_STANDARD }, - { "always", ZFS_SYNC_ALWAYS }, - { "disabled", ZFS_SYNC_DISABLED }, - { NULL } - }; - - static zprop_index_t volmode_table[] = { - { "default", ZFS_VOLMODE_DEFAULT }, - { "geom", ZFS_VOLMODE_GEOM }, - { "dev", ZFS_VOLMODE_DEV }, - { "none", ZFS_VOLMODE_NONE }, - { NULL } - }; - - static zprop_index_t dnsize_table[] = { - { "legacy", ZFS_DNSIZE_LEGACY }, - { "auto", ZFS_DNSIZE_AUTO }, - { "1k", ZFS_DNSIZE_1K }, - { "2k", ZFS_DNSIZE_2K }, - { "4k", ZFS_DNSIZE_4K }, - { "8k", ZFS_DNSIZE_8K }, - { "16k", ZFS_DNSIZE_16K }, - { NULL } - }; - - static zprop_index_t redundant_metadata_table[] = { - { "all", ZFS_REDUNDANT_METADATA_ALL }, - { "most", ZFS_REDUNDANT_METADATA_MOST }, - { NULL } - }; - - /* inherit index properties */ - zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata", - ZFS_REDUNDANT_METADATA_ALL, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "all | most", "REDUND_MD", - redundant_metadata_table); - zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "standard | always | disabled", "SYNC", - sync_table); - zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", - ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME, - "on | off | fletcher2 | fletcher4 | sha256 | sha512 | " - "skein", "CHECKSUM", checksum_table); - zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | verify | sha256[,verify], sha512[,verify], " - "skein[,verify]", "DEDUP", dedup_table); - zprop_register_index(ZFS_PROP_COMPRESSION, "compression", - ZIO_COMPRESS_DEFAULT, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4", - "COMPRESS", compress_table); - zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "hidden | visible", "SNAPDIR", snapdir_table); - zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "discard | groupmask | passthrough | 
restricted", "ACLMODE", - acl_mode_table); - zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", - ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "discard | noallow | restricted | passthrough | passthrough-x", - "ACLINHERIT", acl_inherit_table); - zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "1 | 2 | 3", "COPIES", copies_table); - zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", - ZFS_CACHE_ALL, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, - "all | none | metadata", "PRIMARYCACHE", cache_table); - zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", - ZFS_CACHE_ALL, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, - "all | none | metadata", "SECONDARYCACHE", cache_table); - zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "latency | throughput", "LOGBIAS", logbias_table); - zprop_register_index(ZFS_PROP_VOLMODE, "volmode", - ZFS_VOLMODE_DEFAULT, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, - "default | geom | dev | none", "VOLMODE", volmode_table); - - zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize", - ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table); - - /* inherit index (boolean) properties */ - zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); - zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", - boolean_table); - zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", - boolean_table); - zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", - boolean_table); - zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", - boolean_table); - zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table); - zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR", - boolean_table); - zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", - boolean_table); - zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", - boolean_table); - - /* default index properties */ - zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); - zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, - PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", - "CANMOUNT", canmount_table); - - /* readonly index (boolean) properties */ - zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, - ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); - zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, - PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", - boolean_table); - - /* set once index properties */ - zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0, - PROP_ONETIME, 
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "none | formC | formD | formKC | formKD", "NORMALIZATION", - normalize_table); - zprop_register_index(ZFS_PROP_CASE, "casesensitivity", - ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_SNAPSHOT, - "sensitive | insensitive | mixed", "CASE", case_table); - - /* set once index (boolean) properties */ - zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "on | off", "UTF8ONLY", boolean_table); - - /* string properties */ - zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "ORIGIN"); - zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, - ZFS_TYPE_SNAPSHOT, "[,...]", "CLONES"); - zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", - "MOUNTPOINT"); - zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", - "SHARENFS"); - zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, - ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, - "filesystem | volume | snapshot | bookmark", "TYPE"); - zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "on | off | sharemgr(1M) options", "SHARESMB"); - zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", - ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, - "", "MLSLABEL"); - zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, - "receive_resume_token", - NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "", "RESUMETOK"); - - /* readonly number properties */ - zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, - ZFS_TYPE_DATASET, "", "USED"); - zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); - zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, - PROP_READONLY, ZFS_TYPE_DATASET, "", "REFER"); - zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, - PROP_READONLY, ZFS_TYPE_DATASET, - "<1.00x or higher if compressed>", "RATIO"); - zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, - PROP_READONLY, ZFS_TYPE_DATASET, - "<1.00x or higher if compressed>", "REFRATIO"); - zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", - ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, - ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); - zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", - "USEDSNAP"); - zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", - "USEDDS"); - zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", - "USEDCHILD"); - zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, - PROP_READONLY, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); - zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, - ZFS_TYPE_SNAPSHOT, "", "USERREFS"); - zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, - ZFS_TYPE_DATASET, "", "WRITTEN"); - zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", - "LUSED"); - zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", - 0, PROP_READONLY, ZFS_TYPE_DATASET, 
"", "LREFER"); - - /* default number properties */ - zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, - ZFS_TYPE_FILESYSTEM, " | none", "QUOTA"); - zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0, - PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - " | none", "RESERV"); - zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, - ZFS_TYPE_VOLUME, "", "VOLSIZE"); - zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, - ZFS_TYPE_FILESYSTEM, " | none", "REFQUOTA"); - zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, - PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - " | none", "REFRESERV"); - zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit", - UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, - " | none", "FSLIMIT"); - zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit", - UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - " | none", "SSLIMIT"); - zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", - UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, - "", "FSCOUNT"); - zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", - UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "", "SSCOUNT"); - zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY, - ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "GUID"); - zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY, - ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATETXG"); - - /* inherit number properties */ - zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", - SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); - zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS, - "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS"); - - /* hidden properties */ - zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); - zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); - zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, - PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME"); - zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", - PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); - zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", - PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, - "STMF_SBD_LU"); - zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, - "USERACCOUNTING"); - zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); - zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); - zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); - zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING, - PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP"); - - /* oddball properties */ - zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, - NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, - "", "CREATION", B_FALSE, B_TRUE, NULL); -} - -boolean_t -zfs_prop_delegatable(zfs_prop_t prop) -{ - zprop_desc_t *pd = &zfs_prop_table[prop]; - - /* The mlslabel 
property is never delegatable. */ - if (prop == ZFS_PROP_MLSLABEL) - return (B_FALSE); - - return (pd->pd_attr != PROP_READONLY); -} - -/* - * Given a zfs dataset property name, returns the corresponding property ID. - */ -zfs_prop_t -zfs_name_to_prop(const char *propname) -{ - return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); -} - -/* - * For user property names, we allow all lowercase alphanumeric characters, plus - * a few useful punctuation characters. - */ -static int -valid_char(char c) -{ - return ((c >= 'a' && c <= 'z') || - (c >= '0' && c <= '9') || - c == '-' || c == '_' || c == '.' || c == ':'); -} - -/* - * Returns true if this is a valid user-defined property (one with a ':'). - */ -boolean_t -zfs_prop_user(const char *name) -{ - int i; - char c; - boolean_t foundsep = B_FALSE; - - for (i = 0; i < strlen(name); i++) { - c = name[i]; - if (!valid_char(c)) - return (B_FALSE); - if (c == ':') - foundsep = B_TRUE; - } - - if (!foundsep) - return (B_FALSE); - - return (B_TRUE); -} - -/* - * Returns true if this is a valid userspace-type property (one with a '@'). - * Note that after the @, any character is valid (eg, another @, for SID - * user@domain). - */ -boolean_t -zfs_prop_userquota(const char *name) -{ - zfs_userquota_prop_t prop; - - for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { - if (strncmp(name, zfs_userquota_prop_prefixes[prop], - strlen(zfs_userquota_prop_prefixes[prop])) == 0) { - return (B_TRUE); - } - } - - return (B_FALSE); -} - -/* - * Returns true if this is a valid written@ property. - * Note that after the @, any character is valid (eg, another @, for - * written@pool/fs@origin). - */ -boolean_t -zfs_prop_written(const char *name) -{ - static const char *prefix = "written@"; - return (strncmp(name, prefix, strlen(prefix)) == 0); -} - -/* - * Tables of index types, plus functions to convert between the user view - * (strings) and internal representation (uint64_t). - */ -int -zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) -{ - return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET)); -} - -int -zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) -{ - return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); -} - -uint64_t -zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) -{ - return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); -} - -/* - * Returns TRUE if the property applies to any of the given dataset types. - */ -boolean_t -zfs_prop_valid_for_type(int prop, zfs_type_t types) -{ - return (zprop_valid_for_type(prop, types)); -} - -zprop_type_t -zfs_prop_get_type(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_proptype); -} - -/* - * Returns TRUE if the property is readonly. - */ -boolean_t -zfs_prop_readonly(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_attr == PROP_READONLY || - zfs_prop_table[prop].pd_attr == PROP_ONETIME); -} - -/* - * Returns TRUE if the property is visible (not hidden). - */ -boolean_t -zfs_prop_visible(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_visible); -} - -/* - * Returns TRUE if the property is only allowed to be set once. 
- */ -boolean_t -zfs_prop_setonce(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_attr == PROP_ONETIME); -} - -const char * -zfs_prop_default_string(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_strdefault); -} - -uint64_t -zfs_prop_default_numeric(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_numdefault); -} - -/* - * Given a dataset property ID, returns the corresponding name. - * Assuming the zfs dataset property ID is valid. - */ -const char * -zfs_prop_to_name(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_name); -} - -/* - * Returns TRUE if the property is inheritable. - */ -boolean_t -zfs_prop_inheritable(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || - zfs_prop_table[prop].pd_attr == PROP_ONETIME); -} - -#ifndef _KERNEL - -/* - * Returns a string describing the set of acceptable values for the given - * zfs property, or NULL if it cannot be set. - */ -const char * -zfs_prop_values(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_values); -} - -/* - * Returns TRUE if this property is a string type. Note that index types - * (compression, checksum) are treated as strings in userland, even though they - * are stored numerically on disk. - */ -int -zfs_prop_is_string(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || - zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); -} - -/* - * Returns the column header for the given property. Used only in - * 'zfs list -o', but centralized here with the other property information. - */ -const char * -zfs_prop_column_name(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_colname); -} - -/* - * Returns whether the given property should be displayed right-justified for - * 'zfs list'. - */ -boolean_t -zfs_prop_align_right(zfs_prop_t prop) -{ - return (zfs_prop_table[prop].pd_rightalign); -} - -#endif Index: sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 
- * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include - -#include "zfs_prop.h" - -#if defined(_KERNEL) -#include -#else -#include -#include -#include -#endif - -static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS]; - -zprop_desc_t * -zpool_prop_get_table(void) -{ - return (zpool_prop_table); -} - -void -zpool_prop_init(void) -{ - static zprop_index_t boolean_table[] = { - { "off", 0}, - { "on", 1}, - { NULL } - }; - - static zprop_index_t failuremode_table[] = { - { "wait", ZIO_FAILURE_MODE_WAIT }, - { "continue", ZIO_FAILURE_MODE_CONTINUE }, - { "panic", ZIO_FAILURE_MODE_PANIC }, - { NULL } - }; - - /* string properties */ - zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, - ZFS_TYPE_POOL, "", "ALTROOT"); - zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, - ZFS_TYPE_POOL, "", "BOOTFS"); - zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, - PROP_DEFAULT, ZFS_TYPE_POOL, " | none", "CACHEFILE"); - zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, - PROP_DEFAULT, ZFS_TYPE_POOL, "", "COMMENT"); - - /* readonly number properties */ - zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "SIZE"); - zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "FREE"); - zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "FREEING"); - zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0, - PROP_READONLY, ZFS_TYPE_POOL, "", "CKPOINT"); - zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "LEAKED"); - zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, - PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); - zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, - PROP_READONLY, ZFS_TYPE_POOL, "", "EXPANDSZ"); - zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0, - PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG"); - zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "CAP"); - zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "GUID"); - zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "HEALTH"); - zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, - PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>", - "DEDUP"); - - /* system partition size */ - zprop_register_number(ZPOOL_PROP_BOOTSIZE, "bootsize", 0, PROP_ONETIME, - ZFS_TYPE_POOL, "", "BOOTSIZE"); - - /* default number properties */ - zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, - PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION"); - zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "", "DEDUPDITTO"); - - /* default index (boolean) properties */ - zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION", - boolean_table); - zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); - zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", - boolean_table); - zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); - zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", 
"RDONLY", boolean_table); - zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0, - PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST", - boolean_table); - - /* default index properties */ - zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", - ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, - "wait | continue | panic", "FAILMODE", failuremode_table); - - /* hidden properties */ - zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, - PROP_READONLY, ZFS_TYPE_POOL, "NAME"); - zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE"); - zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING, - PROP_ONETIME, ZFS_TYPE_POOL, "TNAME"); - zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize", - PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE"); -} - -/* - * Given a property name and its type, returns the corresponding property ID. - */ -zpool_prop_t -zpool_name_to_prop(const char *propname) -{ - return (zprop_name_to_prop(propname, ZFS_TYPE_POOL)); -} - -/* - * Given a pool property ID, returns the corresponding name. - * Assuming the pool propety ID is valid. - */ -const char * -zpool_prop_to_name(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_name); -} - -zprop_type_t -zpool_prop_get_type(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_proptype); -} - -boolean_t -zpool_prop_readonly(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_attr == PROP_READONLY); -} - -const char * -zpool_prop_default_string(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_strdefault); -} - -uint64_t -zpool_prop_default_numeric(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_numdefault); -} - -/* - * Returns true if this is a valid feature@ property. - */ -boolean_t -zpool_prop_feature(const char *name) -{ - static const char *prefix = "feature@"; - return (strncmp(name, prefix, strlen(prefix)) == 0); -} - -/* - * Returns true if this is a valid unsupported@ property. - */ -boolean_t -zpool_prop_unsupported(const char *name) -{ - static const char *prefix = "unsupported@"; - return (strncmp(name, prefix, strlen(prefix)) == 0); -} - -int -zpool_prop_string_to_index(zpool_prop_t prop, const char *string, - uint64_t *index) -{ - return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL)); -} - -int -zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, - const char **string) -{ - return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); -} - -uint64_t -zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) -{ - return (zprop_random_value(prop, seed, ZFS_TYPE_POOL)); -} - -#ifndef _KERNEL - -const char * -zpool_prop_values(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_values); -} - -const char * -zpool_prop_column_name(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_colname); -} - -boolean_t -zpool_prop_align_right(zpool_prop_t prop) -{ - return (zpool_prop_table[prop].pd_rightalign); -} -#endif Index: sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c =================================================================== --- sys/cddl/contrib/opensolaris/common/zfs/zprop_common.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -/* - * Common routines used by zfs and zpool property management. - */ - -#include -#include -#include -#include -#include -#include - -#include "zfs_prop.h" -#include "zfs_deleg.h" - -#if defined(_KERNEL) -#include -#include -#else -#include -#include -#include -#endif - -static zprop_desc_t * -zprop_get_proptable(zfs_type_t type) -{ - if (type == ZFS_TYPE_POOL) - return (zpool_prop_get_table()); - else - return (zfs_prop_get_table()); -} - -static int -zprop_get_numprops(zfs_type_t type) -{ - if (type == ZFS_TYPE_POOL) - return (ZPOOL_NUM_PROPS); - else - return (ZFS_NUM_PROPS); -} - -void -zprop_register_impl(int prop, const char *name, zprop_type_t type, - uint64_t numdefault, const char *strdefault, zprop_attr_t attr, - int objset_types, const char *values, const char *colname, - boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl) -{ - zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types); - zprop_desc_t *pd; - - pd = &prop_tbl[prop]; - - ASSERT(pd->pd_name == NULL || pd->pd_name == name); - ASSERT(name != NULL); - ASSERT(colname != NULL); - - pd->pd_name = name; - pd->pd_propnum = prop; - pd->pd_proptype = type; - pd->pd_numdefault = numdefault; - pd->pd_strdefault = strdefault; - pd->pd_attr = attr; - pd->pd_types = objset_types; - pd->pd_values = values; - pd->pd_colname = colname; - pd->pd_rightalign = rightalign; - pd->pd_visible = visible; - pd->pd_table = idx_tbl; - pd->pd_table_size = 0; - while (idx_tbl && (idx_tbl++)->pi_name != NULL) - pd->pd_table_size++; -} - -void -zprop_register_string(int prop, const char *name, const char *def, - zprop_attr_t attr, int objset_types, const char *values, - const char *colname) -{ - zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr, - objset_types, values, colname, B_FALSE, B_TRUE, NULL); - -} - -void -zprop_register_number(int prop, const char *name, uint64_t def, - zprop_attr_t attr, int objset_types, const char *values, - const char *colname) -{ - zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr, - objset_types, values, colname, B_TRUE, B_TRUE, NULL); -} - -void -zprop_register_index(int prop, const char *name, uint64_t def, - zprop_attr_t attr, int objset_types, const char *values, - const char *colname, const zprop_index_t *idx_tbl) -{ - zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr, - objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl); -} - -void -zprop_register_hidden(int prop, const char *name, zprop_type_t type, - zprop_attr_t attr, int objset_types, const char *colname) -{ - zprop_register_impl(prop, name, type, 0, NULL, attr, - objset_types, NULL, colname, - type == PROP_TYPE_NUMBER, B_FALSE, NULL); -} - - -/* - * A comparison function we can use to order indexes into 
property tables. - */ -static int -zprop_compare(const void *arg1, const void *arg2) -{ - const zprop_desc_t *p1 = *((zprop_desc_t **)arg1); - const zprop_desc_t *p2 = *((zprop_desc_t **)arg2); - boolean_t p1ro, p2ro; - - p1ro = (p1->pd_attr == PROP_READONLY); - p2ro = (p2->pd_attr == PROP_READONLY); - - if (p1ro == p2ro) - return (strcmp(p1->pd_name, p2->pd_name)); - - return (p1ro ? -1 : 1); -} - -/* - * Iterate over all properties in the given property table, calling back - * into the specified function for each property. We will continue to - * iterate until we either reach the end or the callback function returns - * something other than ZPROP_CONT. - */ -int -zprop_iter_common(zprop_func func, void *cb, boolean_t show_all, - boolean_t ordered, zfs_type_t type) -{ - int i, j, num_props, size, prop; - zprop_desc_t *prop_tbl; - zprop_desc_t **order; - - prop_tbl = zprop_get_proptable(type); - num_props = zprop_get_numprops(type); - size = num_props * sizeof (zprop_desc_t *); - -#if defined(_KERNEL) - order = kmem_alloc(size, KM_SLEEP); -#else - if ((order = malloc(size)) == NULL) - return (ZPROP_CONT); -#endif - - for (j = 0; j < num_props; j++) - order[j] = &prop_tbl[j]; - - if (ordered) { - qsort((void *)order, num_props, sizeof (zprop_desc_t *), - zprop_compare); - } - - prop = ZPROP_CONT; - for (i = 0; i < num_props; i++) { - if ((order[i]->pd_visible || show_all) && - (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) { - prop = order[i]->pd_propnum; - break; - } - } - -#if defined(_KERNEL) - kmem_free(order, size); -#else - free(order); -#endif - return (prop); -} - -static boolean_t -propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) -{ - const char *propname = prop_entry->pd_name; -#ifndef _KERNEL - const char *colname = prop_entry->pd_colname; - int c; -#endif - - if (len == strlen(propname) && - strncmp(p, propname, len) == 0) - return (B_TRUE); - -#ifndef _KERNEL - if (colname == NULL || len != strlen(colname)) - return (B_FALSE); - - for (c = 0; c < len; c++) - if (p[c] != tolower(colname[c])) - break; - - return (colname[c] == '\0'); -#else - return (B_FALSE); -#endif -} - -typedef struct name_to_prop_cb { - const char *propname; - zprop_desc_t *prop_tbl; -} name_to_prop_cb_t; - -static int -zprop_name_to_prop_cb(int prop, void *cb_data) -{ - name_to_prop_cb_t *data = cb_data; - - if (propname_match(data->propname, strlen(data->propname), - &data->prop_tbl[prop])) - return (prop); - - return (ZPROP_CONT); -} - -int -zprop_name_to_prop(const char *propname, zfs_type_t type) -{ - int prop; - name_to_prop_cb_t cb_data; - - cb_data.propname = propname; - cb_data.prop_tbl = zprop_get_proptable(type); - - prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data, - B_TRUE, B_FALSE, type); - - return (prop == ZPROP_CONT ? 
ZPROP_INVAL : prop); -} - -int -zprop_string_to_index(int prop, const char *string, uint64_t *index, - zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - const zprop_index_t *idx_tbl; - int i; - - if (prop == ZPROP_INVAL || prop == ZPROP_CONT) - return (-1); - - ASSERT(prop < zprop_get_numprops(type)); - prop_tbl = zprop_get_proptable(type); - if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) - return (-1); - - for (i = 0; idx_tbl[i].pi_name != NULL; i++) { - if (strcmp(string, idx_tbl[i].pi_name) == 0) { - *index = idx_tbl[i].pi_value; - return (0); - } - } - - return (-1); -} - -int -zprop_index_to_string(int prop, uint64_t index, const char **string, - zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - const zprop_index_t *idx_tbl; - int i; - - if (prop == ZPROP_INVAL || prop == ZPROP_CONT) - return (-1); - - ASSERT(prop < zprop_get_numprops(type)); - prop_tbl = zprop_get_proptable(type); - if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) - return (-1); - - for (i = 0; idx_tbl[i].pi_name != NULL; i++) { - if (idx_tbl[i].pi_value == index) { - *string = idx_tbl[i].pi_name; - return (0); - } - } - - return (-1); -} - -/* - * Return a random valid property value. Used by ztest. - */ -uint64_t -zprop_random_value(int prop, uint64_t seed, zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - const zprop_index_t *idx_tbl; - - ASSERT((uint_t)prop < zprop_get_numprops(type)); - prop_tbl = zprop_get_proptable(type); - idx_tbl = prop_tbl[prop].pd_table; - - if (idx_tbl == NULL) - return (seed); - - return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value); -} - -const char * -zprop_values(int prop, zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - - ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); - ASSERT(prop < zprop_get_numprops(type)); - - prop_tbl = zprop_get_proptable(type); - - return (prop_tbl[prop].pd_values); -} - -/* - * Returns TRUE if the property applies to any of the given dataset types. - */ -boolean_t -zprop_valid_for_type(int prop, zfs_type_t type) -{ - zprop_desc_t *prop_tbl; - - if (prop == ZPROP_INVAL || prop == ZPROP_CONT) - return (B_FALSE); - - ASSERT(prop < zprop_get_numprops(type)); - prop_tbl = zprop_get_proptable(type); - return ((prop_tbl[prop].pd_types & type) != 0); -} - -#ifndef _KERNEL - -/* - * Determines the minimum width for the column, and indicates whether it's fixed - * or not. Only string columns are non-fixed. - */ -size_t -zprop_width(int prop, boolean_t *fixed, zfs_type_t type) -{ - zprop_desc_t *prop_tbl, *pd; - const zprop_index_t *idx; - size_t ret; - int i; - - ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); - ASSERT(prop < zprop_get_numprops(type)); - - prop_tbl = zprop_get_proptable(type); - pd = &prop_tbl[prop]; - - *fixed = B_TRUE; - - /* - * Start with the width of the column name. - */ - ret = strlen(pd->pd_colname); - - /* - * For fixed-width values, make sure the width is large enough to hold - * any possible value. - */ - switch (pd->pd_proptype) { - case PROP_TYPE_NUMBER: - /* - * The maximum length of a human-readable number is 5 characters - * ("20.4M", for example). - */ - if (ret < 5) - ret = 5; - /* - * 'creation' is handled specially because it's a number - * internally, but displayed as a date string. 
- */ - if (prop == ZFS_PROP_CREATION) - *fixed = B_FALSE; - break; - case PROP_TYPE_INDEX: - idx = prop_tbl[prop].pd_table; - for (i = 0; idx[i].pi_name != NULL; i++) { - if (strlen(idx[i].pi_name) > ret) - ret = strlen(idx[i].pi_name); - } - break; - - case PROP_TYPE_STRING: - *fixed = B_FALSE; - break; - } - - return (ret); -} - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c +++ sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c @@ -67,13 +67,15 @@ * on capital-f functions. */ #include +#include +#include #ifndef illumos #include #endif #include -#include #include #include +#include #ifdef illumos #include #include @@ -96,7 +98,6 @@ #include #include #endif -#include #ifdef illumos #include #include @@ -119,6 +120,7 @@ #include #include #include +#include #include #include #include @@ -129,6 +131,13 @@ #include #include + +#include +#undef AT_UID +#undef AT_GID +#include +#include + #include #include @@ -299,8 +308,10 @@ #define ipaddr_t in_addr_t #define mod_modname pathname #define vuprintf vprintf +#ifndef crgetzoneid +#define crgetzoneid(_a) 0 +#endif #define ttoproc(_a) ((_a)->td_proc) -#define crgetzoneid(_a) 0 #define SNOCD 0 #define CPU_ON_INTR(_a) 0 @@ -491,7 +502,7 @@ if ((remp) != NULL) { \ *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \ } \ -_NOTE(CONSTCOND) } while (0) +} while (0) /* Index: sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c +++ sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,8 @@ #include #include #include +#undef AT_UID +#undef AT_GID #include #ifdef illumos #include Index: sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ - -/* - * University Copyright- Copyright (c) 1982, 1986, 1988 - * The Regents of the University of California - * All Rights Reserved - * - * University Acknowledgment- Portions of this document are derived from - * software developed by the University of California, Berkeley, and its - * contributors. 
- */ - -#include -#include -#include -#include -#include - -/* Extensible attribute (xva) routines. */ - -/* - * Zero out the structure, set the size of the requested/returned bitmaps, - * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer - * to the returned attributes array. - */ -void -xva_init(xvattr_t *xvap) -{ - bzero(xvap, sizeof (xvattr_t)); - xvap->xva_mapsize = XVA_MAPSIZE; - xvap->xva_magic = XVA_MAGIC; - xvap->xva_vattr.va_mask = AT_XVATTR; - xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; -} - -/* - * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t - * structure. Otherwise, returns NULL. - */ -xoptattr_t * -xva_getxoptattr(xvattr_t *xvap) -{ - xoptattr_t *xoap = NULL; - if (xvap->xva_vattr.va_mask & AT_XVATTR) - xoap = &xvap->xva_xoptattrs; - return (xoap); -} - -/* - * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it - * asynchronously using a taskq. This can avoid deadlocks caused by re-entering - * the file system as a result of releasing the vnode. Note, file systems - * already have to handle the race where the vnode is incremented before the - * inactive routine is called and does its locking. - * - * Warning: Excessive use of this routine can lead to performance problems. - * This is because taskqs throttle back allocation if too many are created. - */ -void -vn_rele_async(vnode_t *vp, taskq_t *taskq) -{ - VERIFY(vp->v_count > 0); - if (refcount_release_if_not_last(&vp->v_usecount)) { - return; - } - VERIFY(taskq_dispatch((taskq_t *)taskq, - (task_func_t *)vrele, vp, TQ_SLEEP) != 0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2011 Google, Inc. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/THIRDPARTYLICENSE.cityhash.descrip +++ /dev/null @@ -1 +0,0 @@ -CITYHASH CHECKSUM FUNCTIONALITY IN ZFS Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c +++ /dev/null @@ -1,960 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -/* - * ARC buffer data (ABD). - * - * ABDs are an abstract data structure for the ARC which can use two - * different ways of storing the underlying data: - * - * (a) Linear buffer. In this case, all the data in the ABD is stored in one - * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). - * - * +-------------------+ - * | ABD (linear) | - * | abd_flags = ... | - * | abd_size = ... | +--------------------------------+ - * | abd_buf ------------->| raw buffer of size abd_size | - * +-------------------+ +--------------------------------+ - * no abd_chunks - * - * (b) Scattered buffer. In this case, the data in the ABD is split into - * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers - * to the chunks recorded in an array at the end of the ABD structure. - * - * +-------------------+ - * | ABD (scattered) | - * | abd_flags = ... | - * | abd_size = ... | - * | abd_offset = 0 | +-----------+ - * | abd_chunks[0] ----------------------------->| chunk 0 | - * | abd_chunks[1] ---------------------+ +-----------+ - * | ... | | +-----------+ - * | abd_chunks[N-1] ---------+ +------->| chunk 1 | - * +-------------------+ | +-----------+ - * | ... - * | +-----------+ - * +----------------->| chunk N-1 | - * +-----------+ - * - * Using a large proportion of scattered ABDs decreases ARC fragmentation since - * when we are at the limit of allocatable space, using equal-size chunks will - * allow us to quickly reclaim enough space for a new large allocation (assuming - * it is also scattered). - * - * In addition to directly allocating a linear or scattered ABD, it is also - * possible to create an ABD by requesting the "sub-ABD" starting at an offset - * within an existing ABD. In linear buffers this is simple (set abd_buf of - * the new ABD to the starting point within the original raw buffer), but - * scattered ABDs are a little more complex. The new ABD makes a copy of the - * relevant abd_chunks pointers (but not the underlying data). However, to - * provide arbitrary rather than only chunk-aligned starting offsets, it also - * tracks an abd_offset field which represents the starting point of the data - * within the first chunk in abd_chunks. For both linear and scattered ABDs, - * creating an offset ABD marks the original ABD as the offset's parent, and the - * original ABD's abd_children refcount is incremented. This data allows us to - * ensure the root ABD isn't deleted before its children. 
- * - * Most consumers should never need to know what type of ABD they're using -- - * the ABD public API ensures that it's possible to transparently switch from - * using a linear ABD to a scattered one when doing so would be beneficial. - * - * If you need to use the data within an ABD directly, if you know it's linear - * (because you allocated it) you can use abd_to_buf() to access the underlying - * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions - * which will allocate a raw buffer if necessary. Use the abd_return_buf* - * functions to return any raw buffers that are no longer necessary when you're - * done using them. - * - * There are a variety of ABD APIs that implement basic buffer operations: - * compare, copy, read, write, and fill with zeroes. If you need a custom - * function which progressively accesses the whole ABD, use the abd_iterate_* - * functions. - */ - -#include -#include -#include -#include -#include - -typedef struct abd_stats { - kstat_named_t abdstat_struct_size; - kstat_named_t abdstat_scatter_cnt; - kstat_named_t abdstat_scatter_data_size; - kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_linear_cnt; - kstat_named_t abdstat_linear_data_size; -} abd_stats_t; - -static abd_stats_t abd_stats = { - /* Amount of memory occupied by all of the abd_t struct allocations */ - { "struct_size", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset()). - */ - { "scatter_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ - { "scatter_data_size", KSTAT_DATA_UINT64 }, - /* - * The amount of space wasted at the end of the last chunk across all - * scatter ABDs tracked by scatter_cnt. - */ - { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, - /* - * The number of linear ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset() and abd_get_from_buf()). If an - * ABD takes ownership of its buf then it will become tracked. - */ - { "linear_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all linear ABDs tracked by linear_cnt */ - { "linear_data_size", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -/* - * It is possible to make all future ABDs be linear by setting this to B_FALSE. - * Otherwise, ABDs are allocated scattered by default unless the caller uses - * abd_alloc_linear(). - */ -boolean_t zfs_abd_scatter_enabled = B_TRUE; - -/* - * The size of the chunks ABD allocates. Because the sizes allocated from the - * kmem_cache can't change, this tunable can only be modified at boot. Changing - * it at runtime would cause ABD iteration to work incorrectly for ABDs which - * were allocated with the old size, so a safeguard has been put in place which - * will cause the machine to panic if you change it and try to access the data - * within a scattered ABD. 
- */ -size_t zfs_abd_chunk_size = 4096; - -#if defined(__FreeBSD__) && defined(_KERNEL) -SYSCTL_DECL(_vfs_zfs); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, - &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, - &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); -#endif - -#ifdef _KERNEL -extern vmem_t *zio_alloc_arena; -#endif - -kmem_cache_t *abd_chunk_cache; -static kstat_t *abd_ksp; - -extern inline boolean_t abd_is_linear(abd_t *abd); -extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size); -extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size); -extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_zero(abd_t *abd, size_t size); - -static void * -abd_alloc_chunk() -{ - void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); - ASSERT3P(c, !=, NULL); - return (c); -} - -static void -abd_free_chunk(void *c) -{ - kmem_cache_free(abd_chunk_cache, c); -} - -void -abd_init(void) -{ -#ifdef illumos - vmem_t *data_alloc_arena = NULL; - -#ifdef _KERNEL - data_alloc_arena = zio_alloc_arena; -#endif - - /* - * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH - * so that no allocator metadata is stored with the buffers. - */ - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH); -#else - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); -#endif - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - } -} - -void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - kmem_cache_destroy(abd_chunk_cache); - abd_chunk_cache = NULL; -} - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); -} - -static inline size_t -abd_scatter_chunkcnt(abd_t *abd) -{ - ASSERT(!abd_is_linear(abd)); - return (abd_chunkcnt_for_bytes( - abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); -} - -static inline void -abd_verify(abd_t *abd) -{ - ASSERT3U(abd->abd_size, >, 0); - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META)); - IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); - IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); - } else { - ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, - zfs_abd_chunk_size); - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - ASSERT3P( - abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); - } - } -} - -static inline abd_t * -abd_alloc_struct(size_t chunkcnt) -{ - size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, size); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd) -{ - size_t chunkcnt = abd_is_linear(abd) ? 
0 : abd_scatter_chunkcnt(abd); - int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - kmem_free(abd, size); - ABDSTAT_INCR(abdstat_struct_size, -size); -} - -/* - * Allocate an ABD, along with its own underlying data buffers. Use this if you - * don't care whether the ABD is linear or not. - */ -abd_t * -abd_alloc(size_t size, boolean_t is_metadata) -{ - if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size) - return (abd_alloc_linear(size, is_metadata)); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - size_t n = abd_chunkcnt_for_bytes(size); - abd_t *abd = abd_alloc_struct(n); - - abd->abd_flags = ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_scatter.abd_offset = 0; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - for (int i = 0; i < n; i++) { - void *c = abd_alloc_chunk(); - ASSERT3P(c, !=, NULL); - abd->abd_u.abd_scatter.abd_chunks[i] = c; - } - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - n * zfs_abd_chunk_size - size); - - return (abd); -} - -static void -abd_free_scatter(abd_t *abd) -{ - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - abd->abd_size - n * zfs_abd_chunk_size); - - abd_free_struct(abd); -} - -/* - * Allocate an ABD that must be linear, along with its own underlying data - * buffer. Only use this when it would be very annoying to write your ABD - * consumer with a scattered ABD. - */ -abd_t * -abd_alloc_linear(size_t size, boolean_t is_metadata) -{ - abd_t *abd = abd_alloc_struct(0); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); - } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); - - return (abd); -} - -static void -abd_free_linear(abd_t *abd) -{ - if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); - - abd_free_struct(abd); -} - -/* - * Free an ABD. Only use this on ABDs allocated with abd_alloc() or - * abd_alloc_linear(). - */ -void -abd_free(abd_t *abd) -{ - abd_verify(abd); - ASSERT3P(abd->abd_parent, ==, NULL); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) - abd_free_linear(abd); - else - abd_free_scatter(abd); -} - -/* - * Allocate an ABD of the same format (same metadata flag, same scatterize - * setting) as another ABD. 
- */ -abd_t * -abd_alloc_sametype(abd_t *sabd, size_t size) -{ - boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd)) { - return (abd_alloc_linear(size, is_metadata)); - } else { - return (abd_alloc(size, is_metadata)); - } -} - -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). - */ -abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) -{ - return (abd_alloc_linear(size, is_metadata)); -} - -/* - * Allocate a new ABD to point to offset off of sabd. It shares the underlying - * buffer data with sabd. Use abd_put() to free. sabd must not be freed while - * any derived ABDs exist. - */ -abd_t * -abd_get_offset(abd_t *sabd, size_t off) -{ - abd_t *abd; - - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); - - if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(0); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; - } else { - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - size_t chunkcnt = abd_scatter_chunkcnt(sabd) - - (new_offset / zfs_abd_chunk_size); - - abd = abd_alloc_struct(chunkcnt); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = 0; - - abd->abd_u.abd_scatter.abd_offset = - new_offset % zfs_abd_chunk_size; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - /* Copy the scatterlist starting at the correct offset */ - (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, - &sabd->abd_u.abd_scatter.abd_chunks[new_offset / - zfs_abd_chunk_size], - chunkcnt * sizeof (void *)); - } - - abd->abd_size = sabd->abd_size - off; - abd->abd_parent = sabd; - zfs_refcount_create(&abd->abd_children); - (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - - return (abd); -} - -/* - * Allocate a linear ABD structure for buf. You must free this with abd_put() - * since the resulting ABD doesn't own its own buffer. - */ -abd_t * -abd_get_from_buf(void *buf, size_t size) -{ - abd_t *abd = abd_alloc_struct(0); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - /* - * Even if this buf is filesystem metadata, we only track that if we - * own the underlying data buffer, which is not true in this case. - * Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_linear.abd_buf = buf; - - return (abd); -} - -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. 
- */ -void -abd_put(abd_t *abd) -{ - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - -/* - * Get the raw buffer associated with a linear ABD. - */ -void * -abd_to_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); -} - -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. - */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. - */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - -/* - * Give this ABD ownership of the buffer that it's storing. Can only be used on - * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated - * with abd_alloc_linear() which subsequently released ownership of their buf - * with abd_release_ownership_of_buf(). 
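A sketch of the borrow/return discipline a caller of the routines above is expected to follow, assuming an ABD allocated elsewhere and the declarations from sys/abd.h; consume_abd and its body are illustrative, not part of the removed code.

	/*
	 * Typical consumer pattern: obtain a flat view of a possibly scattered
	 * ABD, work on it, then hand it back so the ABD's children refcount
	 * stays balanced.
	 */
	static void
	consume_abd(abd_t *abd, size_t size)
	{
		/* Copies into a temporary buffer only if the ABD is scattered. */
		void *buf = abd_borrow_buf_copy(abd, size);

		/* ... read or modify the 'size' bytes at 'buf' ... */

		/* Copy changes back (scattered case) and drop the borrowed buffer. */
		abd_return_buf_copy(abd, buf, size);
	}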
- */ -void -abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - abd_verify(abd); - - abd->abd_flags |= ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - -void -abd_release_ownership_of_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - abd_verify(abd); - - abd->abd_flags &= ~ABD_FLAG_OWNER; - /* Disable this flag since we no longer own the data buffer */ - abd->abd_flags &= ~ABD_FLAG_META; - - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); -} - -struct abd_iter { - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; /* position (relative to abd_offset) */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ -}; - -static inline size_t -abd_iter_scatter_chunk_offset(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); -} - -static inline size_t -abd_iter_scatter_chunk_index(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); -} - -/* - * Initialize the abd_iter. - */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd) -{ - abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) -{ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - aiter->iter_pos += amount; -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_map(struct abd_iter *aiter) -{ - void *paddr; - size_t offset = 0; - - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* Panic if someone has changed zfs_abd_chunk_size */ - IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == - aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - offset = aiter->iter_pos; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; - } else { - size_t index = abd_iter_scatter_chunk_index(aiter); - offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = zfs_abd_chunk_size - offset; - paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; - } - aiter->iter_mapaddr = (char *)paddr + offset; -} - -/* - * Unmap the current chunk from aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. 
- */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -int -abd_iterate_func(abd_t *abd, size_t off, size_t size, - abd_iter_func_t *func, void *private) -{ - int ret = 0; - struct abd_iter aiter; - - abd_verify(abd); - ASSERT3U(off + size, <=, abd->abd_size); - - abd_iter_init(&aiter, abd); - abd_iter_advance(&aiter, off); - - while (size > 0) { - abd_iter_map(&aiter); - - size_t len = MIN(aiter.iter_mapsize, size); - ASSERT3U(len, >, 0); - - ret = func(aiter.iter_mapaddr, len, private); - - abd_iter_unmap(&aiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&aiter, len); - } - - return (ret); -} - -struct buf_arg { - void *arg_buf; -}; - -static int -abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(ba_ptr->arg_buf, buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy abd to buf. (off is the offset in abd.) - */ -void -abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, - &ba_ptr); -} - -static int -abd_cmp_buf_off_cb(void *buf, size_t size, void *private) -{ - int ret; - struct buf_arg *ba_ptr = private; - - ret = memcmp(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (ret); -} - -/* - * Compare the contents of abd to buf. (off is the offset in abd.) - */ -int -abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); -} - -static int -abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy from buf to abd. (off is the offset in abd.) - */ -void -abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, - &ba_ptr); -} - -/*ARGSUSED*/ -static int -abd_zero_off_cb(void *buf, size_t size, void *private) -{ - (void) memset(buf, 0, size); - return (0); -} - -/* - * Zero out the abd from a particular offset to the end. - */ -void -abd_zero_off(abd_t *abd, size_t off, size_t size) -{ - (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); -} - -/* - * Iterate over two ABDs and call func incrementally on the two ABDs' data in - * equal-sized chunks (passed to func as raw buffers). func could be called many - * times during this iteration. 
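A sketch of a custom callback for abd_iterate_func(), shaped like the copy and compare callbacks above (it takes (buf, size, private) and returns non-zero to stop the walk early); abd_sum_cb and abd_byte_sum are illustrative names, not functions from the removed file.

	/* Fold every byte of the visited range into a running 64-bit sum. */
	static int
	abd_sum_cb(void *buf, size_t size, void *private)
	{
		uint64_t *sum = private;
		const uint8_t *p = buf;

		for (size_t i = 0; i < size; i++)
			*sum += p[i];
		return (0);	/* returning non-zero would abort the walk */
	}

	static uint64_t
	abd_byte_sum(abd_t *abd, size_t size)
	{
		uint64_t sum = 0;

		/* The callback runs once per mapped piece, linear or per-chunk. */
		(void) abd_iterate_func(abd, 0, size, abd_sum_cb, &sum);
		return (sum);
	}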
- */ -int -abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, - size_t size, abd_iter_func2_t *func, void *private) -{ - int ret = 0; - struct abd_iter daiter, saiter; - - abd_verify(dabd); - abd_verify(sabd); - - ASSERT3U(doff + size, <=, dabd->abd_size); - ASSERT3U(soff + size, <=, sabd->abd_size); - - abd_iter_init(&daiter, dabd); - abd_iter_init(&saiter, sabd); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); - - while (size > 0) { - abd_iter_map(&daiter); - abd_iter_map(&saiter); - - size_t dlen = MIN(daiter.iter_mapsize, size); - size_t slen = MIN(saiter.iter_mapsize, size); - size_t len = MIN(dlen, slen); - ASSERT(dlen > 0 || slen > 0); - - ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, - private); - - abd_iter_unmap(&saiter); - abd_iter_unmap(&daiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); - } - - return (ret); -} - -/*ARGSUSED*/ -static int -abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) -{ - (void) memcpy(dbuf, sbuf, size); - return (0); -} - -/* - * Copy from sabd to dabd starting from soff and doff. - */ -void -abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) -{ - (void) abd_iterate_func2(dabd, sabd, doff, soff, size, - abd_copy_off_cb, NULL); -} - -/*ARGSUSED*/ -static int -abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) -{ - return (memcmp(bufa, bufb, size)); -} - -/* - * Compares the first size bytes of two ABDs. - */ -int -abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) -{ - return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c +++ /dev/null @@ -1,234 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2017, 2018 by Delphix. All rights reserved. - */ - -#include -#include - -/* - * Aggregate-sum counters are a form of fanned-out counter, used when atomic - * instructions on a single field cause enough CPU cache line contention to - * slow system performance. Due to their increased overhead and the expense - * involved with precisely reading from them, they should only be used in cases - * where the write rate (increment/decrement) is much higher than the read rate - * (get value). - * - * Aggregate sum counters are comprised of two basic parts, the core and the - * buckets. The core counter contains a lock for the entire counter, as well - * as the current upper and lower bounds on the value of the counter. The - * aggsum_bucket structure contains a per-bucket lock to protect the contents of - * the bucket, the current amount that this bucket has changed from the global - * counter (called the delta), and the amount of increment and decrement we have - * "borrowed" from the core counter. - * - * The basic operation of an aggsum is simple. 
Threads that wish to modify the - * counter will modify one bucket's counter (determined by their current CPU, to - * help minimize lock and cache contention). If the bucket already has - * sufficient capacity borrowed from the core structure to handle their request, - * they simply modify the delta and return. If the bucket does not, we clear - * the bucket's current state (to prevent the borrowed amounts from getting too - * large), and borrow more from the core counter. Borrowing is done by adding to - * the upper bound (or subtracting from the lower bound) of the core counter, - * and setting the borrow value for the bucket to the amount added (or - * subtracted). Clearing the bucket is the opposite; we add the current delta - * to both the lower and upper bounds of the core counter, subtract the borrowed - * incremental from the upper bound, and add the borrowed decrement from the - * lower bound. Note that only borrowing and clearing require access to the - * core counter; since all other operations access CPU-local resources, - * performance can be much higher than a traditional counter. - * - * Threads that wish to read from the counter have a slightly more challenging - * task. It is fast to determine the upper and lower bounds of the aggum; this - * does not require grabbing any locks. This suffices for cases where an - * approximation of the aggsum's value is acceptable. However, if one needs to - * know whether some specific value is above or below the current value in the - * aggsum, they invoke aggsum_compare(). This function operates by repeatedly - * comparing the target value to the upper and lower bounds of the aggsum, and - * then clearing a bucket. This proceeds until the target is outside of the - * upper and lower bounds and we return a response, or the last bucket has been - * cleared and we know that the target is equal to the aggsum's value. Finally, - * the most expensive operation is determining the precise value of the aggsum. - * To do this, we clear every bucket and then return the upper bound (which must - * be equal to the lower bound). What makes aggsum_compare() and aggsum_value() - * expensive is clearing buckets. This involves grabbing the global lock - * (serializing against themselves and borrow operations), grabbing a bucket's - * lock (preventing threads on those CPUs from modifying their delta), and - * zeroing out the borrowed value (forcing that thread to borrow on its next - * request, which will also be expensive). This is what makes aggsums well - * suited for write-many read-rarely operations. - */ - -/* - * We will borrow aggsum_borrow_multiplier times the current request, so we will - * have to get the as_lock approximately every aggsum_borrow_multiplier calls to - * aggsum_delta(). 
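A sketch of the write-often/read-rarely pattern this comment describes, using the aggsum_* routines defined below; the example_* names and the byte-count use case are illustrative assumptions.

	static aggsum_t example_bytes;	/* illustrative counter */

	static void
	example_setup(void)
	{
		aggsum_init(&example_bytes, 0);
	}

	/* Hot path: called concurrently; usually touches only the caller's CPU bucket. */
	static void
	example_account(int64_t delta)
	{
		aggsum_add(&example_bytes, delta);
	}

	/* Cold path: flushes every bucket, so only call when the exact value matters. */
	static uint64_t
	example_exact(void)
	{
		return (aggsum_value(&example_bytes));
	}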
- */ -static uint_t aggsum_borrow_multiplier = 10; - -void -aggsum_init(aggsum_t *as, uint64_t value) -{ - bzero(as, sizeof (*as)); - as->as_lower_bound = as->as_upper_bound = value; - mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); - as->as_numbuckets = boot_ncpus; - as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t), - KM_SLEEP); - for (int i = 0; i < as->as_numbuckets; i++) { - mutex_init(&as->as_buckets[i].asc_lock, - NULL, MUTEX_DEFAULT, NULL); - } -} - -void -aggsum_fini(aggsum_t *as) -{ - for (int i = 0; i < as->as_numbuckets; i++) - mutex_destroy(&as->as_buckets[i].asc_lock); - kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t)); - mutex_destroy(&as->as_lock); -} - -int64_t -aggsum_lower_bound(aggsum_t *as) -{ - return (as->as_lower_bound); -} - -int64_t -aggsum_upper_bound(aggsum_t *as) -{ - return (as->as_upper_bound); -} - -static void -aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb) -{ - ASSERT(MUTEX_HELD(&as->as_lock)); - ASSERT(MUTEX_HELD(&asb->asc_lock)); - - /* - * We use atomic instructions for this because we read the upper and - * lower bounds without the lock, so we need stores to be atomic. - */ - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - asb->asc_delta + asb->asc_borrowed); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - asb->asc_delta - asb->asc_borrowed); - asb->asc_delta = 0; - asb->asc_borrowed = 0; -} - -uint64_t -aggsum_value(aggsum_t *as) -{ - int64_t rv; - - mutex_enter(&as->as_lock); - if (as->as_lower_bound == as->as_upper_bound) { - rv = as->as_lower_bound; - for (int i = 0; i < as->as_numbuckets; i++) { - ASSERT0(as->as_buckets[i].asc_delta); - ASSERT0(as->as_buckets[i].asc_borrowed); - } - mutex_exit(&as->as_lock); - return (rv); - } - for (int i = 0; i < as->as_numbuckets; i++) { - struct aggsum_bucket *asb = &as->as_buckets[i]; - mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); - mutex_exit(&asb->asc_lock); - } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - rv = as->as_lower_bound; - mutex_exit(&as->as_lock); - - return (rv); -} - -void -aggsum_add(aggsum_t *as, int64_t delta) -{ - struct aggsum_bucket *asb = - &as->as_buckets[CPU_SEQID % as->as_numbuckets]; - int64_t borrow; - - /* Try fast path if we already borrowed enough before. */ - mutex_enter(&asb->asc_lock); - if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed && - asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) { - asb->asc_delta += delta; - mutex_exit(&asb->asc_lock); - return; - } - mutex_exit(&asb->asc_lock); - - /* - * We haven't borrowed enough. Take the global lock and borrow - * considering what is requested now and what we borrowed before. - */ - borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier; - mutex_enter(&as->as_lock); - mutex_enter(&asb->asc_lock); - delta += asb->asc_delta; - asb->asc_delta = 0; - if (borrow >= asb->asc_borrowed) - borrow -= asb->asc_borrowed; - else - borrow = (borrow - (int64_t)asb->asc_borrowed) / 4; - asb->asc_borrowed += borrow; - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - delta - borrow); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - delta + borrow); - mutex_exit(&asb->asc_lock); - mutex_exit(&as->as_lock); -} - -/* - * Compare the aggsum value to target efficiently. Returns -1 if the value - * represented by the aggsum is less than target, 1 if it's greater, and 0 if - * they are equal. 
- */ -int -aggsum_compare(aggsum_t *as, uint64_t target) -{ - if (as->as_upper_bound < target) - return (-1); - if (as->as_lower_bound > target) - return (1); - mutex_enter(&as->as_lock); - for (int i = 0; i < as->as_numbuckets; i++) { - struct aggsum_bucket *asb = &as->as_buckets[i]; - mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); - mutex_exit(&asb->asc_lock); - if (as->as_upper_bound < target) { - mutex_exit(&as->as_lock); - return (-1); - } - if (as->as_lower_bound > target) { - mutex_exit(&as->as_lock); - return (1); - } - } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - ASSERT3U(as->as_lower_bound, ==, target); - mutex_exit(&as->as_lock); - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ /dev/null @@ -1,8569 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. - */ - -/* - * DVA-based Adjustable Replacement Cache - * - * While much of the theory of operation used here is - * based on the self-tuning, low overhead replacement cache - * presented by Megiddo and Modha at FAST 2003, there are some - * significant differences: - * - * 1. The Megiddo and Modha model assumes any page is evictable. - * Pages in its cache cannot be "locked" into memory. This makes - * the eviction algorithm simple: evict the last page in the list. - * This also make the performance characteristics easy to reason - * about. Our cache is not so simple. At any given moment, some - * subset of the blocks in the cache are un-evictable because we - * have handed out a reference to them. Blocks are only evictable - * when there are no external references active. This makes - * eviction far more problematic: we choose to evict the evictable - * blocks that are the "lowest" in the list. - * - * There are times when it is not possible to evict the requested - * space. In these circumstances we are unable to adjust the cache - * size. To prevent the cache growing unbounded at these times we - * implement a "cache throttle" that slows the flow of new data - * into the cache until we can make space available. - * - * 2. The Megiddo and Modha model assumes a fixed cache size. - * Pages are evicted when the cache is full and there is a cache - * miss. Our model has a variable sized cache. 
It grows with - * high use, but also tries to react to memory pressure from the - * operating system: decreasing its size when system memory is - * tight. - * - * 3. The Megiddo and Modha model assumes a fixed page size. All - * elements of the cache are therefore exactly the same size. So - * when adjusting the cache size following a cache miss, its simply - * a matter of choosing a single page to evict. In our model, we - * have variable sized cache blocks (rangeing from 512 bytes to - * 128K bytes). We therefore choose a set of blocks to evict to make - * space for a cache miss that approximates as closely as possible - * the space used by the new block. - * - * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" - * by N. Megiddo & D. Modha, FAST 2003 - */ - -/* - * The locking model: - * - * A new reference to a cache buffer can be obtained in two - * ways: 1) via a hash table lookup using the DVA as a key, - * or 2) via one of the ARC lists. The arc_read() interface - * uses method 1, while the internal ARC algorithms for - * adjusting the cache use method 2. We therefore provide two - * types of locks: 1) the hash table lock array, and 2) the - * ARC list locks. - * - * Buffers do not have their own mutexes, rather they rely on the - * hash table mutexes for the bulk of their protection (i.e. most - * fields in the arc_buf_hdr_t are protected by these mutexes). - * - * buf_hash_find() returns the appropriate mutex (held) when it - * locates the requested buffer in the hash table. It returns - * NULL for the mutex if the buffer was not in the table. - * - * buf_hash_remove() expects the appropriate hash mutex to be - * already held before it is invoked. - * - * Each ARC state also has a mutex which is used to protect the - * buffer list associated with the state. When attempting to - * obtain a hash table lock while holding an ARC list lock you - * must use: mutex_tryenter() to avoid deadlock. Also note that - * the active state mutex must be held before the ghost state mutex. - * - * It as also possible to register a callback which is run when the - * arc_meta_limit is reached and no buffers can be safely evicted. In - * this case the arc user should drop a reference on some arc buffers so - * they can be reclaimed and the arc_meta_limit honored. For example, - * when using the ZPL each dentry holds a references on a znode. These - * dentries must be pruned before the arc buffer holding the znode can - * be safely evicted. - * - * Note that the majority of the performance stats are manipulated - * with atomic operations. - * - * The L2ARC uses the l2ad_mtx on each vdev for the following: - * - * - L2ARC buflist creation - * - L2ARC buflist eviction - * - L2ARC write completion, which walks L2ARC buflists - * - ARC header destruction, as it removes from L2ARC buflists - * - ARC header release, as it removes from L2ARC buflists - */ - -/* - * ARC operation: - * - * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. - * This structure can point either to a block that is still in the cache or to - * one that is only accessible in an L2 ARC device, or it can provide - * information about a block that was recently evicted. If a block is - * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough - * information to retrieve it from the L2ARC device. This information is - * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block - * that is in this state cannot access the data directly. 
- * - * Blocks that are actively being referenced or have not been evicted - * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within - * the arc_buf_hdr_t that will point to the data block in memory. A block can - * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC - * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and - * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). - * - * The L1ARC's data pointer may or may not be uncompressed. The ARC has the - * ability to store the physical data (b_pabd) associated with the DVA of the - * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, - * it will match its on-disk compression characteristics. This behavior can be - * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the - * compressed ARC functionality is disabled, the b_pabd will point to an - * uncompressed version of the on-disk data. - * - * Data in the L1ARC is not accessed by consumers of the ARC directly. Each - * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. - * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC - * consumer. The ARC will provide references to this data and will keep it - * cached until it is no longer in use. The ARC caches only the L1ARC's physical - * data block and will evict any arc_buf_t that is no longer referenced. The - * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the - * "overhead_size" kstat. - * - * Depending on the consumer, an arc_buf_t can be requested in uncompressed or - * compressed form. The typical case is that consumers will want uncompressed - * data, and when that happens a new data buffer is allocated where the data is - * decompressed for them to use. Currently the only consumer who wants - * compressed arc_buf_t's is "zfs send", when it streams data exactly as it - * exists on disk. When this happens, the arc_buf_t's data buffer is shared - * with the arc_buf_hdr_t. - * - * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The - * first one is owned by a compressed send consumer (and therefore references - * the same compressed data buffer as the arc_buf_hdr_t) and the second could be - * used by any other consumer (and has its own uncompressed copy of the data - * buffer). - * - * arc_buf_hdr_t - * +-----------+ - * | fields | - * | common to | - * | L1- and | - * | L2ARC | - * +-----------+ - * | l2arc_buf_hdr_t - * | | - * +-----------+ - * | l1arc_buf_hdr_t - * | | arc_buf_t - * | b_buf +------------>+-----------+ arc_buf_t - * | b_pabd +-+ |b_next +---->+-----------+ - * +-----------+ | |-----------| |b_next +-->NULL - * | |b_comp = T | +-----------+ - * | |b_data +-+ |b_comp = F | - * | +-----------+ | |b_data +-+ - * +->+------+ | +-----------+ | - * compressed | | | | - * data | |<--------------+ | uncompressed - * +------+ compressed, | data - * shared +-->+------+ - * data | | - * | | - * +------+ - * - * When a consumer reads a block, the ARC must first look to see if the - * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new - * arc_buf_t and either copies uncompressed data into a new data buffer from an - * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a - * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the - * hdr is compressed and the desired compression characteristics of the - * arc_buf_t consumer. 
If the arc_buf_t ends up sharing data with the - * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be - * the last buffer in the hdr's b_buf list, however a shared compressed buf can - * be anywhere in the hdr's list. - * - * The diagram below shows an example of an uncompressed ARC hdr that is - * sharing its data with an arc_buf_t (note that the shared uncompressed buf is - * the last element in the buf list): - * - * arc_buf_hdr_t - * +-----------+ - * | | - * | | - * | | - * +-----------+ - * l2arc_buf_hdr_t| | - * | | - * +-----------+ - * l1arc_buf_hdr_t| | - * | | arc_buf_t (shared) - * | b_buf +------------>+---------+ arc_buf_t - * | | |b_next +---->+---------+ - * | b_pabd +-+ |---------| |b_next +-->NULL - * +-----------+ | | | +---------+ - * | |b_data +-+ | | - * | +---------+ | |b_data +-+ - * +->+------+ | +---------+ | - * | | | | - * uncompressed | | | | - * data +------+ | | - * ^ +->+------+ | - * | uncompressed | | | - * | data | | | - * | +------+ | - * +---------------------------------+ - * - * Writing to the ARC requires that the ARC first discard the hdr's b_pabd - * since the physical block is about to be rewritten. The new data contents - * will be contained in the arc_buf_t. As the I/O pipeline performs the write, - * it may compress the data before writing it to disk. The ARC will be called - * with the transformed data and will bcopy the transformed on-disk block into - * a newly allocated b_pabd. Writes are always done into buffers which have - * either been loaned (and hence are new and don't have other readers) or - * buffers which have been released (and hence have their own hdr, if there - * were originally other readers of the buf's original hdr). This ensures that - * the ARC only needs to update a single buf and its hdr after a write occurs. - * - * When the L2ARC is in use, it will also take advantage of the b_pabd. The - * L2ARC will always write the contents of b_pabd to the L2ARC. This means - * that when compressed ARC is enabled that the L2ARC blocks are identical - * to the on-disk block in the main data pool. This provides a significant - * advantage since the ARC can leverage the bp's checksum when reading from the - * L2ARC to determine if the contents are valid. However, if the compressed - * ARC is disabled, then the L2ARC's block must be transformed to look - * like the physical block in the main data pool before comparing the - * checksum and determining its validity. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef illumos -#ifndef _KERNEL -/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ -boolean_t arc_watch = B_FALSE; -int arc_procfd; -#endif -#endif /* illumos */ - -/* - * This thread's job is to keep enough free memory in the system, by - * calling arc_kmem_reap_now() plus arc_shrink(), which improves - * arc_available_memory(). - */ -static zthr_t *arc_reap_zthr; - -/* - * This thread's job is to keep arc_size under arc_c, by calling - * arc_adjust(), which improves arc_is_overflowing(). 
- */ -static zthr_t *arc_adjust_zthr; - -static kmutex_t arc_adjust_lock; -static kcondvar_t arc_adjust_waiters_cv; -static boolean_t arc_adjust_needed = B_FALSE; - -static kmutex_t arc_dnlc_evicts_lock; -static kcondvar_t arc_dnlc_evicts_cv; -static boolean_t arc_dnlc_evicts_thread_exit; - -uint_t arc_reduce_dnlc_percent = 3; - -/* - * The number of headers to evict in arc_evict_state_impl() before - * dropping the sublist lock and evicting from another sublist. A lower - * value means we're more likely to evict the "correct" header (i.e. the - * oldest header in the arc state), but comes with higher overhead - * (i.e. more invocations of arc_evict_state_impl()). - */ -int zfs_arc_evict_batch_limit = 10; - -/* number of seconds before growing cache again */ -int arc_grow_retry = 60; - -/* - * Minimum time between calls to arc_kmem_reap_soon(). Note that this will - * be converted to ticks, so with the default hz=100, a setting of 15 ms - * will actually wait 2 ticks, or 20ms. - */ -int arc_kmem_cache_reap_retry_ms = 1000; - -/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 8; - -/* shift of arc_c for calculating both min and max arc_p */ -int arc_p_min_shift = 4; - -/* log2(fraction of arc to reclaim) */ -int arc_shrink_shift = 7; - -/* - * log2(fraction of ARC which must be free to allow growing). - * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, - * when reading a new block into the ARC, we will evict an equal-sized block - * from the ARC. - * - * This must be less than arc_shrink_shift, so that when we shrink the ARC, - * we will still not allow it to grow. - */ -int arc_no_grow_shift = 5; - - -/* - * minimum lifespan of a prefetch block in clock ticks - * (initialized in arc_init()) - */ -static int zfs_arc_min_prefetch_ms = 1; -static int zfs_arc_min_prescient_prefetch_ms = 6; - -/* - * If this percent of memory is free, don't throttle. - */ -int arc_lotsfree_percent = 10; - -static boolean_t arc_initialized; -extern boolean_t zfs_prefetch_disable; - -/* - * The arc has filled available memory and has now warmed up. - */ -static boolean_t arc_warm; - -/* - * log2 fraction of the zio arena to keep free. - */ -int arc_zio_arena_free_shift = 2; - -/* - * These tunables are for performance analysis. - */ -uint64_t zfs_arc_max; -uint64_t zfs_arc_min; -uint64_t zfs_arc_meta_limit = 0; -uint64_t zfs_arc_meta_min = 0; -uint64_t zfs_arc_dnode_limit = 0; -uint64_t zfs_arc_dnode_reduce_percent = 10; -int zfs_arc_grow_retry = 0; -int zfs_arc_shrink_shift = 0; -int zfs_arc_no_grow_shift = 0; -int zfs_arc_p_min_shift = 0; -uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ -u_int zfs_arc_free_target = 0; - -/* Absolute min for arc min / max is 16MB. 
*/ -static uint64_t arc_abs_min = 16 << 20; - -/* - * ARC dirty data constraints for arc_tempreserve_space() throttle - */ -uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ -uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ -uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ - -boolean_t zfs_compressed_arc_enabled = B_TRUE; - -static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS); -static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); - -#if defined(__FreeBSD__) && defined(_KERNEL) -static void -arc_free_target_init(void *unused __unused) -{ - - zfs_arc_free_target = vm_cnt.v_free_target; -} -SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, - arc_free_target_init, NULL); - -TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); -TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); -TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); -TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry); -TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift); -SYSCTL_DECL(_vfs_zfs); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, - 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size"); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, - 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size"); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, - CTLTYPE_U32 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, - 0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", - "log2(fraction of ARC which must be free to allow growing)"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, - &zfs_arc_average_blocksize, 0, - "ARC average blocksize"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, - &arc_shrink_shift, 0, - "log2(fraction of arc to reclaim)"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW, - &arc_grow_retry, 0, - "Wait in seconds before considering growing ARC"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, - &zfs_compressed_arc_enabled, 0, - "Enable compressed ARC"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN, - &arc_kmem_cache_reap_retry_ms, 0, - "Interval between ARC kmem_cache reapings"); - -/* - * We don't have a tunable for arc_free_target due to the dependency on - * pagedaemon initialisation. - */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), - sysctl_vfs_zfs_arc_free_target, "IU", - "Desired number of free pages below which ARC triggers reclaim"); - -static int -sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) -{ - u_int val; - int err; - - val = zfs_arc_free_target; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < minfree) - return (EINVAL); - if (val > vm_cnt.v_page_count) - return (EINVAL); - - zfs_arc_free_target = val; - - return (0); -} - -/* - * Must be declared here, before the definition of corresponding kstat - * macro which uses the same names will confuse the compiler. 
- */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), - sysctl_vfs_zfs_arc_meta_limit, "QU", - "ARC metadata limit"); -#endif - -/* - * Note that buffers can be in one of 6 states: - * ARC_anon - anonymous (discussed below) - * ARC_mru - recently used, currently cached - * ARC_mru_ghost - recentely used, no longer in cache - * ARC_mfu - frequently used, currently cached - * ARC_mfu_ghost - frequently used, no longer in cache - * ARC_l2c_only - exists in L2ARC but not other states - * When there are no active references to the buffer, they are - * are linked onto a list in one of these arc states. These are - * the only buffers that can be evicted or deleted. Within each - * state there are multiple lists, one for meta-data and one for - * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, - * etc.) is tracked separately so that it can be managed more - * explicitly: favored over data, limited explicitly. - * - * Anonymous buffers are buffers that are not associated with - * a DVA. These are buffers that hold dirty block copies - * before they are written to stable storage. By definition, - * they are "ref'd" and are considered part of arc_mru - * that cannot be freed. Generally, they will aquire a DVA - * as they are written and migrate onto the arc_mru list. - * - * The ARC_l2c_only state is for buffers that are in the second - * level ARC but no longer in any of the ARC_m* lists. The second - * level ARC itself may also contain buffers that are in any of - * the ARC_m* states - meaning that a buffer can exist in two - * places. The reason for the ARC_l2c_only state is to keep the - * buffer header in the hash table, so that reads that hit the - * second level ARC benefit from these fast lookups. - */ - -typedef struct arc_state { - /* - * list of evictable buffers - */ - multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; - /* - * total amount of evictable data in this state - */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; - /* - * total amount of data in this state; this includes: evictable, - * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. - */ - zfs_refcount_t arcs_size; - /* - * supports the "dbufs" kstat - */ - arc_state_type_t arcs_state; -} arc_state_t; - -/* - * Percentage that can be consumed by dnodes of ARC meta buffers. 
- */ -int zfs_arc_meta_prune = 10000; -unsigned long zfs_arc_dnode_limit_percent = 10; -int zfs_arc_meta_strategy = ARC_STRATEGY_META_ONLY; -int zfs_arc_meta_adjust_restarts = 4096; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_meta_strategy, CTLFLAG_RWTUN, - &zfs_arc_meta_strategy, 0, - "ARC metadata reclamation strategy " - "(0 = metadata only, 1 = balance data and metadata)"); - -/* The 6 states: */ -static arc_state_t ARC_anon; -static arc_state_t ARC_mru; -static arc_state_t ARC_mru_ghost; -static arc_state_t ARC_mfu; -static arc_state_t ARC_mfu_ghost; -static arc_state_t ARC_l2c_only; - -typedef struct arc_stats { - kstat_named_t arcstat_hits; - kstat_named_t arcstat_misses; - kstat_named_t arcstat_demand_data_hits; - kstat_named_t arcstat_demand_data_misses; - kstat_named_t arcstat_demand_metadata_hits; - kstat_named_t arcstat_demand_metadata_misses; - kstat_named_t arcstat_prefetch_data_hits; - kstat_named_t arcstat_prefetch_data_misses; - kstat_named_t arcstat_prefetch_metadata_hits; - kstat_named_t arcstat_prefetch_metadata_misses; - kstat_named_t arcstat_mru_hits; - kstat_named_t arcstat_mru_ghost_hits; - kstat_named_t arcstat_mfu_hits; - kstat_named_t arcstat_mfu_ghost_hits; - kstat_named_t arcstat_allocated; - kstat_named_t arcstat_deleted; - /* - * Number of buffers that could not be evicted because the hash lock - * was held by another thread. The lock may not necessarily be held - * by something using the same buffer, since hash locks are shared - * by multiple buffers. - */ - kstat_named_t arcstat_mutex_miss; - /* - * Number of buffers skipped when updating the access state due to the - * header having already been released after acquiring the hash lock. - */ - kstat_named_t arcstat_access_skip; - /* - * Number of buffers skipped because they have I/O in progress, are - * indirect prefetch buffers that have not lived long enough, or are - * not from the spa we're trying to evict from. - */ - kstat_named_t arcstat_evict_skip; - /* - * Number of times arc_evict_state() was unable to evict enough - * buffers to reach it's target amount. - */ - kstat_named_t arcstat_evict_not_enough; - kstat_named_t arcstat_evict_l2_cached; - kstat_named_t arcstat_evict_l2_eligible; - kstat_named_t arcstat_evict_l2_ineligible; - kstat_named_t arcstat_evict_l2_skip; - kstat_named_t arcstat_hash_elements; - kstat_named_t arcstat_hash_elements_max; - kstat_named_t arcstat_hash_collisions; - kstat_named_t arcstat_hash_chains; - kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; - kstat_named_t arcstat_c; - kstat_named_t arcstat_c_min; - kstat_named_t arcstat_c_max; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_size; - /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. - * Note that the compressed bytes may match the uncompressed bytes - * if the block is either not compressed or compressed arc is disabled. - */ - kstat_named_t arcstat_compressed_size; - /* - * Uncompressed size of the data stored in b_pabd. If compressed - * arc is disabled then this value will be identical to the stat - * above. - */ - kstat_named_t arcstat_uncompressed_size; - /* - * Number of bytes stored in all the arc_buf_t's. This is classified - * as "overhead" since this data is typically short-lived and will - * be evicted from the arc when it becomes unreferenced unless the - * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level - * values have been set (see comment in dbuf.c for more information). 
- */ - kstat_named_t arcstat_overhead_size; - /* - * Number of bytes consumed by internal ARC structures necessary - * for tracking purposes; these structures are not actually - * backed by ARC buffers. This includes arc_buf_hdr_t structures - * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only - * caches), and arc_buf_t structures (allocated via arc_buf_t - * cache). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_hdr_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_DATA. This is generally consumed by buffers backing - * on disk user data (e.g. plain file contents). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_data_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_METADATA. This is generally consumed by buffers - * backing on disk data that is used for internal ZFS - * structures (e.g. ZAP, dnode, indirect blocks, etc). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_metadata_size; - /* - * Number of bytes consumed by dmu_buf_impl_t objects. - */ - kstat_named_t arcstat_dbuf_size; - /* - * Number of bytes consumed by dnode_t objects. - */ - kstat_named_t arcstat_dnode_size; - /* - * Number of bytes consumed by bonus buffers. - */ - kstat_named_t arcstat_bonus_size; -#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) - /* - * Sum of the previous three counters, provided for compatibility. - */ - kstat_named_t arcstat_other_size; -#endif - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_anon state. This includes *all* buffers in the arc_anon - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mru state. This includes *all* buffers in the arc_mru - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). 
- * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mru_ghost state. The key thing to note - * here, is the fact that this size doesn't actually indicate - * RAM consumption. The ghost lists only consist of headers and - * don't actually have ARC buffers linked off of these headers. - * Thus, *if* the headers had associated ARC buffers, these - * buffers *would have* consumed this number of bytes. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mfu state. This includes *all* buffers in the arc_mfu - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_size; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu - * state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_data; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_METADATA, and reside in the - * arc_mfu state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mfu_ghost state. See the comment above - * arcstat_mru_ghost_size for more details. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. 
- */ - kstat_named_t arcstat_mfu_ghost_evictable_metadata; - kstat_named_t arcstat_l2_hits; - kstat_named_t arcstat_l2_misses; - kstat_named_t arcstat_l2_feeds; - kstat_named_t arcstat_l2_rw_clash; - kstat_named_t arcstat_l2_read_bytes; - kstat_named_t arcstat_l2_write_bytes; - kstat_named_t arcstat_l2_writes_sent; - kstat_named_t arcstat_l2_writes_done; - kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_lock_retry; - kstat_named_t arcstat_l2_evict_lock_retry; - kstat_named_t arcstat_l2_evict_reading; - kstat_named_t arcstat_l2_evict_l1cached; - kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_abort_lowmem; - kstat_named_t arcstat_l2_cksum_bad; - kstat_named_t arcstat_l2_io_error; - kstat_named_t arcstat_l2_lsize; - kstat_named_t arcstat_l2_psize; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_l2_write_trylock_fail; - kstat_named_t arcstat_l2_write_passed_headroom; - kstat_named_t arcstat_l2_write_spa_mismatch; - kstat_named_t arcstat_l2_write_in_l2; - kstat_named_t arcstat_l2_write_hdr_io_in_progress; - kstat_named_t arcstat_l2_write_not_cacheable; - kstat_named_t arcstat_l2_write_full; - kstat_named_t arcstat_l2_write_buffer_iter; - kstat_named_t arcstat_l2_write_pios; - kstat_named_t arcstat_l2_write_buffer_bytes_scanned; - kstat_named_t arcstat_l2_write_buffer_list_iter; - kstat_named_t arcstat_l2_write_buffer_list_null_iter; - kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_memory_direct_count; - kstat_named_t arcstat_memory_indirect_count; - kstat_named_t arcstat_memory_all_bytes; - kstat_named_t arcstat_memory_free_bytes; - kstat_named_t arcstat_memory_available_bytes; - kstat_named_t arcstat_no_grow; - kstat_named_t arcstat_tempreserve; - kstat_named_t arcstat_loaned_bytes; - kstat_named_t arcstat_prune; - /* Not updated directly; only synced in arc_kstat_update. 
*/ - kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; - kstat_named_t arcstat_dnode_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_async_upgrade_sync; - kstat_named_t arcstat_demand_hit_predictive_prefetch; - kstat_named_t arcstat_demand_hit_prescient_prefetch; -} arc_stats_t; - -static arc_stats_t arc_stats = { - { "hits", KSTAT_DATA_UINT64 }, - { "misses", KSTAT_DATA_UINT64 }, - { "demand_data_hits", KSTAT_DATA_UINT64 }, - { "demand_data_misses", KSTAT_DATA_UINT64 }, - { "demand_metadata_hits", KSTAT_DATA_UINT64 }, - { "demand_metadata_misses", KSTAT_DATA_UINT64 }, - { "prefetch_data_hits", KSTAT_DATA_UINT64 }, - { "prefetch_data_misses", KSTAT_DATA_UINT64 }, - { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, - { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, - { "mru_hits", KSTAT_DATA_UINT64 }, - { "mru_ghost_hits", KSTAT_DATA_UINT64 }, - { "mfu_hits", KSTAT_DATA_UINT64 }, - { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, - { "allocated", KSTAT_DATA_UINT64 }, - { "deleted", KSTAT_DATA_UINT64 }, - { "mutex_miss", KSTAT_DATA_UINT64 }, - { "access_skip", KSTAT_DATA_UINT64 }, - { "evict_skip", KSTAT_DATA_UINT64 }, - { "evict_not_enough", KSTAT_DATA_UINT64 }, - { "evict_l2_cached", KSTAT_DATA_UINT64 }, - { "evict_l2_eligible", KSTAT_DATA_UINT64 }, - { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, - { "evict_l2_skip", KSTAT_DATA_UINT64 }, - { "hash_elements", KSTAT_DATA_UINT64 }, - { "hash_elements_max", KSTAT_DATA_UINT64 }, - { "hash_collisions", KSTAT_DATA_UINT64 }, - { "hash_chains", KSTAT_DATA_UINT64 }, - { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "p", KSTAT_DATA_UINT64 }, - { "c", KSTAT_DATA_UINT64 }, - { "c_min", KSTAT_DATA_UINT64 }, - { "c_max", KSTAT_DATA_UINT64 }, - { "size", KSTAT_DATA_UINT64 }, - { "compressed_size", KSTAT_DATA_UINT64 }, - { "uncompressed_size", KSTAT_DATA_UINT64 }, - { "overhead_size", KSTAT_DATA_UINT64 }, - { "hdr_size", KSTAT_DATA_UINT64 }, - { "data_size", KSTAT_DATA_UINT64 }, - { "metadata_size", KSTAT_DATA_UINT64 }, - { "dbuf_size", KSTAT_DATA_UINT64 }, - { "dnode_size", KSTAT_DATA_UINT64 }, - { "bonus_size", KSTAT_DATA_UINT64 }, -#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) - { "other_size", KSTAT_DATA_UINT64 }, -#endif - { "anon_size", KSTAT_DATA_UINT64 }, - { "anon_evictable_data", KSTAT_DATA_UINT64 }, - { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mru_size", KSTAT_DATA_UINT64 }, - { "mru_evictable_data", KSTAT_DATA_UINT64 }, - { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mru_ghost_size", KSTAT_DATA_UINT64 }, - { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, - { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mfu_size", KSTAT_DATA_UINT64 }, - { "mfu_evictable_data", KSTAT_DATA_UINT64 }, - { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, - { "mfu_ghost_size", KSTAT_DATA_UINT64 }, - { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, - { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, - { "l2_hits", KSTAT_DATA_UINT64 }, - { "l2_misses", KSTAT_DATA_UINT64 }, - { "l2_feeds", KSTAT_DATA_UINT64 }, - { "l2_rw_clash", KSTAT_DATA_UINT64 }, - { "l2_read_bytes", KSTAT_DATA_UINT64 }, - { "l2_write_bytes", KSTAT_DATA_UINT64 }, - { "l2_writes_sent", KSTAT_DATA_UINT64 }, - { "l2_writes_done", KSTAT_DATA_UINT64 }, - { "l2_writes_error", KSTAT_DATA_UINT64 }, - { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, - { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, - { "l2_evict_reading", KSTAT_DATA_UINT64 }, - { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, - { 
"l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, - { "l2_cksum_bad", KSTAT_DATA_UINT64 }, - { "l2_io_error", KSTAT_DATA_UINT64 }, - { "l2_size", KSTAT_DATA_UINT64 }, - { "l2_asize", KSTAT_DATA_UINT64 }, - { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, - { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, - { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, - { "l2_write_in_l2", KSTAT_DATA_UINT64 }, - { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, - { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, - { "l2_write_full", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, - { "l2_write_pios", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, - { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 }, - { "memory_direct_count", KSTAT_DATA_UINT64 }, - { "memory_indirect_count", KSTAT_DATA_UINT64 }, - { "memory_all_bytes", KSTAT_DATA_UINT64 }, - { "memory_free_bytes", KSTAT_DATA_UINT64 }, - { "memory_available_bytes", KSTAT_DATA_UINT64 }, - { "arc_no_grow", KSTAT_DATA_UINT64 }, - { "arc_tempreserve", KSTAT_DATA_UINT64 }, - { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, - { "arc_prune", KSTAT_DATA_UINT64 }, - { "arc_meta_used", KSTAT_DATA_UINT64 }, - { "arc_meta_limit", KSTAT_DATA_UINT64 }, - { "arc_dnode_limit", KSTAT_DATA_UINT64 }, - { "arc_meta_max", KSTAT_DATA_UINT64 }, - { "arc_meta_min", KSTAT_DATA_UINT64 }, - { "async_upgrade_sync", KSTAT_DATA_UINT64 }, - { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, - { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, -}; - -#define ARCSTAT(stat) (arc_stats.stat.value.ui64) - -#define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)) - -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) -#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) - -#define ARCSTAT_MAX(stat, val) { \ - uint64_t m; \ - while ((val) > (m = arc_stats.stat.value.ui64) && \ - (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ - continue; \ -} - -#define ARCSTAT_MAXSTAT(stat) \ - ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) - -/* - * We define a macro to allow ARC hits/misses to be easily broken down by - * two separate conditions, giving a total of four different subtypes for - * each of hits and misses (so eight statistics total). - */ -#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ - if (cond1) { \ - if (cond2) { \ - ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ - } else { \ - ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ - } \ - } else { \ - if (cond2) { \ - ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ - } else { \ - ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ - } \ - } - -kstat_t *arc_ksp; -static arc_state_t *arc_anon; -static arc_state_t *arc_mru; -static arc_state_t *arc_mru_ghost; -static arc_state_t *arc_mfu; -static arc_state_t *arc_mfu_ghost; -static arc_state_t *arc_l2c_only; - -/* - * There are several ARC variables that are critical to export as kstats -- - * but we don't want to have to grovel around in the kstat whenever we wish to - * manipulate them. For these variables, we therefore define them to be in - * terms of the statistic variable. This assures that we are not introducing - * the possibility of inconsistency by having shadow copies of the variables, - * while still allowing the code to be readable. 
- */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ -#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ -#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ -#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ -#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ -#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */ -#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */ -#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */ - -/* compressed size of entire arc */ -#define arc_compressed_size ARCSTAT(arcstat_compressed_size) -/* uncompressed size of entire arc */ -#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) -/* number of bytes in the arc from arc_buf_t's */ -#define arc_overhead_size ARCSTAT(arcstat_overhead_size) - -/* - * There are also some ARC variables that we want to export, but that are - * updated so often that having the canonical representation be the statistic - * variable causes a performance bottleneck. We want to use aggsum_t's for these - * instead, but still be able to export the kstat in the same way as before. - * The solution is to always use the aggsum version, except in the kstat update - * callback. - */ -aggsum_t arc_size; -aggsum_t arc_meta_used; -aggsum_t astat_data_size; -aggsum_t astat_metadata_size; -aggsum_t astat_hdr_size; -aggsum_t astat_bonus_size; -aggsum_t astat_dnode_size; -aggsum_t astat_dbuf_size; -aggsum_t astat_l2_hdr_size; - -static list_t arc_prune_list; -static kmutex_t arc_prune_mtx; -static taskq_t *arc_prune_taskq; - -static int arc_no_grow; /* Don't try to grow cache size */ -static hrtime_t arc_growtime; -static uint64_t arc_tempreserve; -static uint64_t arc_loaned_bytes; - -typedef struct arc_callback arc_callback_t; - -struct arc_callback { - void *acb_private; - arc_read_done_func_t *acb_done; - arc_buf_t *acb_buf; - boolean_t acb_compressed; - zio_t *acb_zio_dummy; - zio_t *acb_zio_head; - arc_callback_t *acb_next; -}; - -typedef struct arc_write_callback arc_write_callback_t; - -struct arc_write_callback { - void *awcb_private; - arc_write_done_func_t *awcb_ready; - arc_write_done_func_t *awcb_children_ready; - arc_write_done_func_t *awcb_physdone; - arc_write_done_func_t *awcb_done; - arc_buf_t *awcb_buf; -}; - -/* - * ARC buffers are separated into multiple structs as a memory saving measure: - * - Common fields struct, always defined, and embedded within it: - * - L2-only fields, always allocated but undefined when not in L2ARC - * - L1-only fields, only allocated when in L1ARC - * - * Buffer in L1 Buffer only in L2 - * +------------------------+ +------------------------+ - * | arc_buf_hdr_t | | arc_buf_hdr_t | - * | | | | - * | | | | - * | | | | - * +------------------------+ +------------------------+ - * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | - * | (undefined if L1-only) | | | - * +------------------------+ +------------------------+ - * | l1arc_buf_hdr_t | - * | | - * | | - * | | - * | | - * +------------------------+ - * - * Because it's possible for the L2ARC to become extremely large, we can wind - * up eating a lot of memory in L2ARC buffer headers, so the size of a header - * is minimized by only allocating the fields necessary for an 
L1-cached buffer - * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and - * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple - * words in pointers. arc_hdr_realloc() is used to switch a header between - * these two allocation states. - */ -typedef struct l1arc_buf_hdr { - kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; -#ifdef ZFS_DEBUG - /* - * Used for debugging with kmem_flags - by allocating and freeing - * b_thawed when the buffer is thawed, we get a record of the stack - * trace that thawed it. - */ - void *b_thawed; -#endif - - arc_buf_t *b_buf; - uint32_t b_bufcnt; - /* for waiting on writes to complete */ - kcondvar_t b_cv; - uint8_t b_byteswap; - - /* protected by arc state mutex */ - arc_state_t *b_state; - multilist_node_t b_arc_node; - - /* updated atomically */ - clock_t b_arc_access; - uint32_t b_mru_hits; - uint32_t b_mru_ghost_hits; - uint32_t b_mfu_hits; - uint32_t b_mfu_ghost_hits; - uint32_t b_l2_hits; - - /* self protecting */ - zfs_refcount_t b_refcnt; - - arc_callback_t *b_acb; - abd_t *b_pabd; -} l1arc_buf_hdr_t; - -typedef struct l2arc_dev l2arc_dev_t; - -typedef struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - uint32_t b_hits; - - list_node_t b_l2node; -} l2arc_buf_hdr_t; - -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - - arc_buf_contents_t b_type; - arc_buf_hdr_t *b_hash_next; - arc_flags_t b_flags; - - /* - * This field stores the size of the data buffer after - * compression, and is set in the arc's zio completion handlers. - * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). - * - * While the block pointers can store up to 32MB in their psize - * field, we can only store up to 32MB minus 512B. This is due - * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. - * a field of zeros represents 512B in the bp). We can't use a - * bias of 1 since we need to reserve a psize of zero, here, to - * represent holes and embedded blocks. - * - * This isn't a problem in practice, since the maximum size of a - * buffer is limited to 16MB, so we never need to store 32MB in - * this field. Even in the upstream illumos code base, the - * maximum size of a buffer is limited to 16MB. - */ - uint16_t b_psize; - - /* - * This field stores the size of the data buffer before - * compression, and cannot change once set. It is in units - * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) - */ - uint16_t b_lsize; /* immutable */ - uint64_t b_spa; /* immutable */ - - /* L2ARC fields. Undefined when not in L2ARC. */ - l2arc_buf_hdr_t b_l2hdr; - /* L1ARC fields. 
Undefined when in l2arc_only state */ - l1arc_buf_hdr_t b_l1hdr; -}; - -#if defined(__FreeBSD__) && defined(_KERNEL) -static int -sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = arc_meta_limit; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val <= 0 || val > arc_c_max) - return (EINVAL); - - arc_meta_limit = val; - - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); - - return (0); -} - -static int -sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) -{ - uint32_t val; - int err; - - val = arc_no_grow_shift; - err = sysctl_handle_32(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val >= arc_shrink_shift) - return (EINVAL); - - arc_no_grow_shift = val; - return (0); -} - -static int -sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_arc_max; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (zfs_arc_max == 0) { - /* Loader tunable so blindly set */ - zfs_arc_max = val; - return (0); - } - - if (val < arc_abs_min || val > kmem_size()) - return (EINVAL); - if (val < arc_c_min) - return (EINVAL); - if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) - return (EINVAL); - - arc_c_max = val; - - arc_c = arc_c_max; - arc_p = (arc_c >> 1); - - if (zfs_arc_meta_limit == 0) { - /* limit meta-data to 1/4 of the arc capacity */ - arc_meta_limit = arc_c_max / 4; - } - - /* if kmem_flags are set, lets try to use less memory */ - if (kmem_debugging()) - arc_c = arc_c / 2; - - zfs_arc_max = arc_c; - - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); - - return (0); -} - -static int -sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_arc_min; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (zfs_arc_min == 0) { - /* Loader tunable so blindly set */ - zfs_arc_min = val; - return (0); - } - - if (val < arc_abs_min || val > arc_c_max) - return (EINVAL); - - arc_c_min = val; - - if (zfs_arc_meta_min == 0) - arc_meta_min = arc_c_min / 2; - - if (arc_c < arc_c_min) - arc_c = arc_c_min; - - zfs_arc_min = arc_c_min; - - return (0); -} -#endif - -#define GHOST_STATE(state) \ - ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ - (state) == arc_l2c_only) - -#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) -#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) -#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) -#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) -#define HDR_PRESCIENT_PREFETCH(hdr) \ - ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) -#define HDR_COMPRESSION_ENABLED(hdr) \ - ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) - -#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) -#define HDR_L2_READING(hdr) \ - (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ - ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) -#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) -#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) -#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) -#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) - -#define HDR_ISTYPE_METADATA(hdr) \ - ((hdr)->b_flags & 
ARC_FLAG_BUFC_METADATA) -#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) - -#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) -#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) - -/* For storing compression mode in b_flags */ -#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) - -#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ - HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) -#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ - HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); - -#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) -#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) -#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) - -/* - * Other sizes - */ - -#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) - -/* - * Hash table routines - */ - -#define HT_LOCK_PAD CACHE_LINE_SIZE - -struct ht_lock { - kmutex_t ht_lock; -#ifdef _KERNEL - unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; -#endif -}; - -#define BUF_LOCKS 256 -typedef struct buf_hash_table { - uint64_t ht_mask; - arc_buf_hdr_t **ht_table; - struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); -} buf_hash_table_t; - -static buf_hash_table_t buf_hash_table; - -#define BUF_HASH_INDEX(spa, dva, birth) \ - (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) -#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) -#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) -#define HDR_LOCK(hdr) \ - (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) - -uint64_t zfs_crc64_table[256]; - -/* - * Level 2 ARC - */ - -#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 2 /* num of writes */ -/* - * If we discover during ARC scan any buffers to be compressed, we boost - * our headroom for the next scanning cycle by this percentage multiple. 
- */ -#define L2ARC_HEADROOM_BOOST 200 -#define L2ARC_FEED_SECS 1 /* caching interval secs */ -#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ - -#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) -#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) - -/* L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ -uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ -uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ -boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ -boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, - &l2arc_write_max, 0, "max write size"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, - &l2arc_write_boost, 0, "extra write during warmup"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, - &l2arc_headroom, 0, "number of dev writes"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, - &l2arc_feed_secs, 0, "interval seconds"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, - &l2arc_feed_min_ms, 0, "min interval milliseconds"); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, - &l2arc_noprefetch, 0, "don't cache prefetch bufs"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, - &l2arc_feed_again, 0, "turbo warmup"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, - &l2arc_norw, 0, "no reads during writes"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, - &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of anonymous state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, - &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, - &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru ghost state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, - &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu state"); - 
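The SYSCTL_UQUAD declarations above export the per-state ARC sizes as read-only nodes under vfs.zfs. A minimal userland sketch of reading one of them with sysctlbyname(3); it assumes the legacy vfs.zfs.mfu_size name declared above is present on the running kernel (the OpenZFS-based code that replaces this file exports a different sysctl layout):

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        uint64_t mfu_size;
        size_t len = sizeof(mfu_size);

        /* Read the read-only node declared via SYSCTL_UQUAD above. */
        if (sysctlbyname("vfs.zfs.mfu_size", &mfu_size, &len, NULL, 0) != 0) {
                perror("sysctlbyname(vfs.zfs.mfu_size)");
                return (EXIT_FAILURE);
        }
        printf("ARC mfu state size: %ju bytes\n", (uintmax_t)mfu_size);
        return (EXIT_SUCCESS);
}

The same pattern reads any of the *_esize nodes; from the shell, "sysctl vfs.zfs.mfu_size" returns the identical value.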
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu ghost state"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, - &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); - -SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW, - &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW, - &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms"); - -/* - * L2ARC Internals - */ -struct l2arc_dev { - vdev_t *l2ad_vdev; /* vdev */ - spa_t *l2ad_spa; /* spa */ - uint64_t l2ad_hand; /* next write location */ - uint64_t l2ad_start; /* first addr on device */ - uint64_t l2ad_end; /* last addr on device */ - boolean_t l2ad_first; /* first sweep through */ - boolean_t l2ad_writing; /* currently writing */ - kmutex_t l2ad_mtx; /* lock for buffer list */ - list_t l2ad_buflist; /* buffer list */ - list_node_t l2ad_node; /* device list node */ - zfs_refcount_t l2ad_alloc; /* allocated bytes */ -}; - -static list_t L2ARC_dev_list; /* device list */ -static list_t *l2arc_dev_list; /* device list pointer */ -static kmutex_t l2arc_dev_mtx; /* device list mutex */ -static l2arc_dev_t *l2arc_dev_last; /* last device used */ -static list_t L2ARC_free_on_write; /* free after write buf list */ -static list_t *l2arc_free_on_write; /* free after write list ptr */ -static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ -static uint64_t l2arc_ndev; /* number of devices */ - -typedef struct l2arc_read_callback { - arc_buf_hdr_t *l2rcb_hdr; /* read header */ - blkptr_t l2rcb_bp; /* original blkptr */ - zbookmark_phys_t l2rcb_zb; /* original bookmark */ - int l2rcb_flags; /* original flags */ - abd_t *l2rcb_abd; /* temporary buffer */ -} l2arc_read_callback_t; - -typedef struct l2arc_write_callback { - l2arc_dev_t *l2wcb_dev; /* device info */ - arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ -} l2arc_write_callback_t; - -typedef struct l2arc_data_free { - /* protected by l2arc_free_on_write_mtx */ - abd_t *l2df_abd; - size_t l2df_size; - arc_buf_contents_t l2df_type; - list_node_t l2df_list_node; -} l2arc_data_free_t; - -static kmutex_t l2arc_feed_thr_lock; -static kcondvar_t l2arc_feed_thr_cv; -static uint8_t l2arc_thread_exit; - -static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); -static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); -static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); -static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); -static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); -static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); -static void arc_hdr_free_pabd(arc_buf_hdr_t *); -static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t); -static void arc_access(arc_buf_hdr_t *, kmutex_t *); -static boolean_t arc_is_overflowing(); -static void arc_buf_watch(arc_buf_t *); -static void arc_prune_async(int64_t); - -static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); -static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 
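One detail worth calling out from the ARCSTAT_MAX macro defined earlier in this file: it maintains a running maximum without taking a lock by retrying a 64-bit compare-and-swap until the stored value is at least as large as the new observation. A standalone rendition of the same pattern in C11 atomics, with illustrative names that are not part of the ZFS sources (the kernel code uses atomic_cas_64 instead):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for a kstat such as arcstat_hash_chain_max. */
static _Atomic uint64_t chain_max;

/*
 * Raise *maxp to val if val is larger, retrying the compare-and-swap
 * whenever another thread updates the maximum underneath us.
 */
static void
stat_max(_Atomic uint64_t *maxp, uint64_t val)
{
        uint64_t cur = atomic_load(maxp);

        while (val > cur &&
            !atomic_compare_exchange_weak(maxp, &cur, val))
                ;       /* cur now holds the refreshed value; retry */
}

int
main(void)
{
        stat_max(&chain_max, 3);
        stat_max(&chain_max, 7);
        stat_max(&chain_max, 5);        /* smaller observation: no change */
        printf("max observed: %ju\n", (uintmax_t)atomic_load(&chain_max));
        return (0);
}

A failed (or spuriously failing) weak compare-exchange rereads the current maximum into cur and the loop simply tries again, which is how the macro tolerates concurrent updaters.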
-static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); -static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); - -static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); -static void l2arc_read_done(zio_t *); - -static void -l2arc_trim(const arc_buf_hdr_t *hdr) -{ - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); - - if (HDR_GET_PSIZE(hdr) != 0) { - trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, - HDR_GET_PSIZE(hdr), 0); - } -} - -/* - * We use Cityhash for this. It's fast, and has good hash properties without - * requiring any large static buffers. - */ -static uint64_t -buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) -{ - return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); -} - -#define HDR_EMPTY(hdr) \ - ((hdr)->b_dva.dva_word[0] == 0 && \ - (hdr)->b_dva.dva_word[1] == 0) - -#define HDR_EQUAL(spa, dva, birth, hdr) \ - ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) - -static void -buf_discard_identity(arc_buf_hdr_t *hdr) -{ - hdr->b_dva.dva_word[0] = 0; - hdr->b_dva.dva_word[1] = 0; - hdr->b_birth = 0; -} - -static arc_buf_hdr_t * -buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) -{ - const dva_t *dva = BP_IDENTITY(bp); - uint64_t birth = BP_PHYSICAL_BIRTH(bp); - uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *hdr; - - mutex_enter(hash_lock); - for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; - hdr = hdr->b_hash_next) { - if (HDR_EQUAL(spa, dva, birth, hdr)) { - *lockp = hash_lock; - return (hdr); - } - } - mutex_exit(hash_lock); - *lockp = NULL; - return (NULL); -} - -/* - * Insert an entry into the hash table. If there is already an element - * equal to elem in the hash table, then the already existing element - * will be returned and the new element will not be inserted. - * Otherwise returns NULL. - * If lockp == NULL, the caller is assumed to already hold the hash lock. 
- */ -static arc_buf_hdr_t * -buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) -{ - uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *fhdr; - uint32_t i; - - ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); - ASSERT(hdr->b_birth != 0); - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - - if (lockp != NULL) { - *lockp = hash_lock; - mutex_enter(hash_lock); - } else { - ASSERT(MUTEX_HELD(hash_lock)); - } - - for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; - fhdr = fhdr->b_hash_next, i++) { - if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) - return (fhdr); - } - - hdr->b_hash_next = buf_hash_table.ht_table[idx]; - buf_hash_table.ht_table[idx] = hdr; - arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - - /* collect some hash table performance data */ - if (i > 0) { - ARCSTAT_BUMP(arcstat_hash_collisions); - if (i == 1) - ARCSTAT_BUMP(arcstat_hash_chains); - - ARCSTAT_MAX(arcstat_hash_chain_max, i); - } - - ARCSTAT_BUMP(arcstat_hash_elements); - ARCSTAT_MAXSTAT(arcstat_hash_elements); - - return (NULL); -} - -static void -buf_hash_remove(arc_buf_hdr_t *hdr) -{ - arc_buf_hdr_t *fhdr, **hdrp; - uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - - ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - - hdrp = &buf_hash_table.ht_table[idx]; - while ((fhdr = *hdrp) != hdr) { - ASSERT3P(fhdr, !=, NULL); - hdrp = &fhdr->b_hash_next; - } - *hdrp = hdr->b_hash_next; - hdr->b_hash_next = NULL; - arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - - /* collect some hash table performance data */ - ARCSTAT_BUMPDOWN(arcstat_hash_elements); - - if (buf_hash_table.ht_table[idx] && - buf_hash_table.ht_table[idx]->b_hash_next == NULL) - ARCSTAT_BUMPDOWN(arcstat_hash_chains); -} - -/* - * Global data structures and functions for the buf kmem cache. - */ -static kmem_cache_t *hdr_full_cache; -static kmem_cache_t *hdr_l2only_cache; -static kmem_cache_t *buf_cache; - -static void -buf_fini(void) -{ - int i; - - kmem_free(buf_hash_table.ht_table, - (buf_hash_table.ht_mask + 1) * sizeof (void *)); - for (i = 0; i < BUF_LOCKS; i++) - mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_full_cache); - kmem_cache_destroy(hdr_l2only_cache); - kmem_cache_destroy(buf_cache); -} - -/* - * Constructor callback - called when the cache is empty - * and a new buf is requested. - */ -/* ARGSUSED */ -static int -hdr_full_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_hdr_t *hdr = vbuf; - - bzero(hdr, HDR_FULL_SIZE); - cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); - zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); - mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - multilist_link_init(&hdr->b_l1hdr.b_arc_node); - arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); - - return (0); -} - -/* ARGSUSED */ -static int -hdr_l2only_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_hdr_t *hdr = vbuf; - - bzero(hdr, HDR_L2ONLY_SIZE); - arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); - - return (0); -} - -/* ARGSUSED */ -static int -buf_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_t *buf = vbuf; - - bzero(buf, sizeof (arc_buf_t)); - mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); - arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); - - return (0); -} - -/* - * Destructor callback - called when a cached buf is - * no longer required. 
- */ -/* ARGSUSED */ -static void -hdr_full_dest(void *vbuf, void *unused) -{ - arc_buf_hdr_t *hdr = vbuf; - - ASSERT(HDR_EMPTY(hdr)); - cv_destroy(&hdr->b_l1hdr.b_cv); - zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); - mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); -} - -/* ARGSUSED */ -static void -hdr_l2only_dest(void *vbuf, void *unused) -{ - arc_buf_hdr_t *hdr = vbuf; - - ASSERT(HDR_EMPTY(hdr)); - arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); -} - -/* ARGSUSED */ -static void -buf_dest(void *vbuf, void *unused) -{ - arc_buf_t *buf = vbuf; - - mutex_destroy(&buf->b_evict_lock); - arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); -} - -/* - * Reclaim callback -- invoked when memory is low. - */ -/* ARGSUSED */ -static void -hdr_recl(void *unused) -{ - dprintf("hdr_recl called\n"); - /* - * umem calls the reclaim func when we destroy the buf cache, - * which is after we do arc_fini(). - */ - if (arc_initialized) - zthr_wakeup(arc_reap_zthr); -} - -static void -buf_init(void) -{ - uint64_t *ct; - uint64_t hsize = 1ULL << 12; - int i, j; - - /* - * The hash table is big enough to fill all of physical memory - * with an average block size of zfs_arc_average_blocksize (default 8K). - * By default, the table will take up - * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). - */ - while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) - hsize <<= 1; -retry: - buf_hash_table.ht_mask = hsize - 1; - buf_hash_table.ht_table = - kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); - if (buf_hash_table.ht_table == NULL) { - ASSERT(hsize > (1ULL << 8)); - hsize >>= 1; - goto retry; - } - - hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, - 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); - hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", - HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, - NULL, NULL, 0); - buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), - 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); - - for (i = 0; i < 256; i++) - for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) - *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); - - for (i = 0; i < BUF_LOCKS; i++) { - mutex_init(&buf_hash_table.ht_locks[i].ht_lock, - NULL, MUTEX_DEFAULT, NULL); - } -} - -/* - * This is the size that the buf occupies in memory. If the buf is compressed, - * it will correspond to the compressed size. You should use this method of - * getting the buf size unless you explicitly need the logical size. - */ -int32_t -arc_buf_size(arc_buf_t *buf) -{ - return (ARC_BUF_COMPRESSED(buf) ? - HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); -} - -int32_t -arc_buf_lsize(arc_buf_t *buf) -{ - return (HDR_GET_LSIZE(buf->b_hdr)); -} - -enum zio_compress -arc_get_compression(arc_buf_t *buf) -{ - return (ARC_BUF_COMPRESSED(buf) ? 
- HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); -} - -#define ARC_MINTIME (hz>>4) /* 62 ms */ - -static inline boolean_t -arc_buf_is_shared(arc_buf_t *buf) -{ - boolean_t shared = (buf->b_data != NULL && - buf->b_hdr->b_l1hdr.b_pabd != NULL && - abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && - buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); - IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); - IMPLY(shared, ARC_BUF_SHARED(buf)); - IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); - - /* - * It would be nice to assert arc_can_share() too, but the "hdr isn't - * already being shared" requirement prevents us from doing that. - */ - - return (shared); -} - -/* - * Free the checksum associated with this header. If there is no checksum, this - * is a no-op. - */ -static inline void -arc_cksum_free(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_l1hdr.b_freeze_cksum != NULL) { - kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_l1hdr.b_freeze_cksum = NULL; - } - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); -} - -/* - * Return true iff at least one of the bufs on hdr is not compressed. - */ -static boolean_t -arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) -{ - for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { - if (!ARC_BUF_COMPRESSED(b)) { - return (B_TRUE); - } - } - return (B_FALSE); -} - -/* - * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data - * matches the checksum that is stored in the hdr. If there is no checksum, - * or if the buf is compressed, this is a no-op. - */ -static void -arc_cksum_verify(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - zio_cksum_t zc; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - arc_hdr_has_uncompressed_buf(hdr)); - return; - } - - ASSERT(HDR_HAS_L1HDR(hdr)); - - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - return; - } - - fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); - if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) - panic("buffer modified while frozen!"); - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); -} - -static boolean_t -arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) -{ - enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); - boolean_t valid_cksum; - - ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); - VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); - - /* - * We rely on the blkptr's checksum to determine if the block - * is valid or not. When compressed arc is enabled, the l2arc - * writes the block to the l2arc just as it appears in the pool. - * This allows us to use the blkptr's checksum to validate the - * data that we just read off of the l2arc without having to store - * a separate checksum in the arc_buf_hdr_t. However, if compressed - * arc is disabled, then the data written to the l2arc is always - * uncompressed and won't match the block as it exists in the main - * pool. When this is the case, we must first compress it if it is - * compressed on the main pool before we can validate the checksum. 
- */ - if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - uint64_t lsize = HDR_GET_LSIZE(hdr); - uint64_t csize; - - abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); - csize = zio_compress_data(compress, zio->io_abd, - abd_to_buf(cdata), lsize); - - ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); - if (csize < HDR_GET_PSIZE(hdr)) { - /* - * Compressed blocks are always a multiple of the - * smallest ashift in the pool. Ideally, we would - * like to round up the csize to the next - * spa_min_ashift but that value may have changed - * since the block was last written. Instead, - * we rely on the fact that the hdr's psize - * was set to the psize of the block when it was - * last written. We set the csize to that value - * and zero out any part that should not contain - * data. - */ - abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); - csize = HDR_GET_PSIZE(hdr); - } - zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); - } - - /* - * Block pointers always store the checksum for the logical data. - * If the block pointer has the gang bit set, then the checksum - * it represents is for the reconstituted data and not for an - * individual gang member. The zio pipeline, however, must be able to - * determine the checksum of each of the gang constituents so it - * treats the checksum comparison differently than what we need - * for l2arc blocks. This prevents us from using the - * zio_checksum_error() interface directly. Instead we must call the - * zio_checksum_error_impl() so that we can ensure the checksum is - * generated using the correct checksum algorithm and accounts for the - * logical I/O size and not just a gang fragment. - */ - valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, - BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, - zio->io_offset, NULL) == 0); - zio_pop_transforms(zio); - return (valid_cksum); -} - -/* - * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a - * checksum and attaches it to the buf's hdr so that we can ensure that the buf - * isn't modified later on. If buf is compressed or there is already a checksum - * on the hdr, this is a no-op (we only checksum uncompressed bufs). 
- */ -static void -arc_cksum_compute(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - ASSERT(HDR_HAS_L1HDR(hdr)); - - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_l1hdr.b_freeze_cksum != NULL) { - ASSERT(arc_hdr_has_uncompressed_buf(hdr)); - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - return; - } else if (ARC_BUF_COMPRESSED(buf)) { - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - return; - } - - ASSERT(!ARC_BUF_COMPRESSED(buf)); - hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), - KM_SLEEP); - fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, - hdr->b_l1hdr.b_freeze_cksum); - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); -#ifdef illumos - arc_buf_watch(buf); -#endif -} - -#ifdef illumos -#ifndef _KERNEL -typedef struct procctl { - long cmd; - prwatch_t prwatch; -} procctl_t; -#endif - -/* ARGSUSED */ -static void -arc_buf_unwatch(arc_buf_t *buf) -{ -#ifndef _KERNEL - if (arc_watch) { - int result; - procctl_t ctl; - ctl.cmd = PCWATCH; - ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = 0; - ctl.prwatch.pr_wflags = 0; - result = write(arc_procfd, &ctl, sizeof (ctl)); - ASSERT3U(result, ==, sizeof (ctl)); - } -#endif -} - -/* ARGSUSED */ -static void -arc_buf_watch(arc_buf_t *buf) -{ -#ifndef _KERNEL - if (arc_watch) { - int result; - procctl_t ctl; - ctl.cmd = PCWATCH; - ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; - ctl.prwatch.pr_size = arc_buf_size(buf); - ctl.prwatch.pr_wflags = WA_WRITE; - result = write(arc_procfd, &ctl, sizeof (ctl)); - ASSERT3U(result, ==, sizeof (ctl)); - } -#endif -} -#endif /* illumos */ - -static arc_buf_contents_t -arc_buf_type(arc_buf_hdr_t *hdr) -{ - arc_buf_contents_t type; - if (HDR_ISTYPE_METADATA(hdr)) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - } - VERIFY3U(hdr->b_type, ==, type); - return (type); -} - -boolean_t -arc_is_metadata(arc_buf_t *buf) -{ - return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); -} - -static uint32_t -arc_bufc_to_flags(arc_buf_contents_t type) -{ - switch (type) { - case ARC_BUFC_DATA: - /* metadata field is 0 if buffer contains normal data */ - return (0); - case ARC_BUFC_METADATA: - return (ARC_FLAG_BUFC_METADATA); - default: - break; - } - panic("undefined ARC buffer type!"); - return ((uint32_t)-1); -} - -void -arc_buf_thaw(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - - arc_cksum_verify(buf); - - /* - * Compressed buffers do not manipulate the b_freeze_cksum or - * allocate b_thawed. 
- */ - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - arc_hdr_has_uncompressed_buf(hdr)); - return; - } - - ASSERT(HDR_HAS_L1HDR(hdr)); - arc_cksum_free(hdr); - - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); -#ifdef ZFS_DEBUG - if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (hdr->b_l1hdr.b_thawed != NULL) - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); - } -#endif - - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); - -#ifdef illumos - arc_buf_unwatch(buf); -#endif -} - -void -arc_buf_freeze(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock; - - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; - - if (ARC_BUF_COMPRESSED(buf)) { - ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || - arc_hdr_has_uncompressed_buf(hdr)); - return; - } - - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || - hdr->b_l1hdr.b_state == arc_anon); - arc_cksum_compute(buf); - mutex_exit(hash_lock); -} - -/* - * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, - * the following functions should be used to ensure that the flags are - * updated in a thread-safe way. When manipulating the flags either - * the hash_lock must be held or the hdr must be undiscoverable. This - * ensures that we're not racing with any other threads when updating - * the flags. - */ -static inline void -arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) -{ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - hdr->b_flags |= flags; -} - -static inline void -arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) -{ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - hdr->b_flags &= ~flags; -} - -/* - * Setting the compression bits in the arc_buf_hdr_t's b_flags is - * done in a special way since we have to clear and set bits - * at the same time. Consumers that wish to set the compression bits - * must use this function to ensure that the flags are updated in - * thread-safe manner. - */ -static void -arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) -{ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * Holes and embedded blocks will always have a psize = 0 so - * we ignore the compression of the blkptr and set the - * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. - * Holes and embedded blocks remain anonymous so we don't - * want to uncompress them. Mark them as uncompressed. - */ - if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { - arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); - ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - } else { - arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, cmp); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); - ASSERT(HDR_COMPRESSION_ENABLED(hdr)); - } -} - -/* - * Looks for another buf on the same hdr which has the data decompressed, copies - * from it, and returns true. If no such buf exists, returns false. 
- */ -static boolean_t -arc_buf_try_copy_decompressed_data(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - boolean_t copied = B_FALSE; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(buf->b_data, !=, NULL); - ASSERT(!ARC_BUF_COMPRESSED(buf)); - - for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; - from = from->b_next) { - /* can't use our own data buffer */ - if (from == buf) { - continue; - } - - if (!ARC_BUF_COMPRESSED(from)) { - bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); - copied = B_TRUE; - break; - } - } - - /* - * There were no decompressed bufs, so there should not be a - * checksum on the hdr either. - */ - EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); - - return (copied); -} - -/* - * Given a buf that has a data buffer attached to it, this function will - * efficiently fill the buf with data of the specified compression setting from - * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr - * are already sharing a data buf, no copy is performed. - * - * If the buf is marked as compressed but uncompressed data was requested, this - * will allocate a new data buffer for the buf, remove that flag, and fill the - * buf with uncompressed data. You can't request a compressed buf on a hdr with - * uncompressed data, and (since we haven't added support for it yet) if you - * want compressed data your buf must already be marked as compressed and have - * the correct-sized data buffer. - */ -static int -arc_buf_fill(arc_buf_t *buf, boolean_t compressed) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); - dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; - - ASSERT3P(buf->b_data, !=, NULL); - IMPLY(compressed, hdr_compressed); - IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); - - if (hdr_compressed == compressed) { - if (!arc_buf_is_shared(buf)) { - abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, - arc_buf_size(buf)); - } - } else { - ASSERT(hdr_compressed); - ASSERT(!compressed); - ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); - - /* - * If the buf is sharing its data with the hdr, unlink it and - * allocate a new data buffer for the buf. - */ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_COMPRESSED(buf)); - - /* We need to give the buf it's own b_data */ - buf->b_flags &= ~ARC_BUF_FLAG_SHARED; - buf->b_data = - arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - - /* Previously overhead was 0; just add new overhead */ - ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); - } else if (ARC_BUF_COMPRESSED(buf)) { - /* We need to reallocate the buf's b_data */ - arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), - buf); - buf->b_data = - arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); - - /* We increased the size of b_data; update overhead */ - ARCSTAT_INCR(arcstat_overhead_size, - HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); - } - - /* - * Regardless of the buf's previous compression settings, it - * should not be compressed at the end of this function. - */ - buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; - - /* - * Try copying the data from another buf which already has a - * decompressed version. If that's not possible, it's time to - * bite the bullet and decompress the data from the hdr. 
- */ - if (arc_buf_try_copy_decompressed_data(buf)) { - /* Skip byteswapping and checksumming (already done) */ - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); - return (0); - } else { - int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, buf->b_data, - HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); - - /* - * Absent hardware errors or software bugs, this should - * be impossible, but log it anyway so we can debug it. - */ - if (error != 0) { - zfs_dbgmsg( - "hdr %p, compress %d, psize %d, lsize %d", - hdr, HDR_GET_COMPRESS(hdr), - HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); - return (SET_ERROR(EIO)); - } - } - } - - /* Byteswap the buf's data if necessary */ - if (bswap != DMU_BSWAP_NUMFUNCS) { - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); - dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); - } - - /* Compute the hdr's checksum if necessary */ - arc_cksum_compute(buf); - - return (0); -} - -int -arc_decompress(arc_buf_t *buf) -{ - return (arc_buf_fill(buf, B_FALSE)); -} - -/* - * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. - */ -static uint64_t -arc_hdr_size(arc_buf_hdr_t *hdr) -{ - uint64_t size; - - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - HDR_GET_PSIZE(hdr) > 0) { - size = HDR_GET_PSIZE(hdr); - } else { - ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); - size = HDR_GET_LSIZE(hdr); - } - return (size); -} - -/* - * Increment the amount of evictable space in the arc_state_t's refcount. - * We account for the space used by the hdr and the arc buf individually - * so that we can add and remove them from the refcount individually. - */ -static void -arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - (void) zfs_refcount_add_many(&state->arcs_esize[type], - HDR_GET_LSIZE(hdr), hdr); - return; - } - - ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pabd != NULL) { - (void) zfs_refcount_add_many(&state->arcs_esize[type], - arc_hdr_size(hdr), hdr); - } - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - if (arc_buf_is_shared(buf)) - continue; - (void) zfs_refcount_add_many(&state->arcs_esize[type], - arc_buf_size(buf), buf); - } -} - -/* - * Decrement the amount of evictable space in the arc_state_t's refcount. - * We account for the space used by the hdr and the arc buf individually - * so that we can add and remove them from the refcount individually. 
- */ -static void -arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - HDR_GET_LSIZE(hdr), hdr); - return; - } - - ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pabd != NULL) { - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - arc_hdr_size(hdr), hdr); - } - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - if (arc_buf_is_shared(buf)) - continue; - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - arc_buf_size(buf), buf); - } -} - -/* - * Add a reference to this hdr indicating that someone is actively - * referencing that memory. When the refcount transitions from 0 to 1, - * we remove it from the respective arc_state_t list to indicate that - * it is not evictable. - */ -static void -add_reference(arc_buf_hdr_t *hdr, void *tag) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - if (!MUTEX_HELD(HDR_LOCK(hdr))) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - } - - arc_state_t *state = hdr->b_l1hdr.b_state; - - if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && - (state != arc_anon)) { - /* We don't use the L2-only state list. */ - if (state != arc_l2c_only) { - multilist_remove(state->arcs_list[arc_buf_type(hdr)], - hdr); - arc_evictable_space_decrement(hdr, state); - } - /* remove the prefetch flag if we get a reference */ - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); - } -} - -/* - * Remove a reference from this hdr. When the reference transitions from - * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's - * list making it eligible for eviction. - */ -static int -remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) -{ - int cnt; - arc_state_t *state = hdr->b_l1hdr.b_state; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); - ASSERT(!GHOST_STATE(state)); - - /* - * arc_l2c_only counts as a ghost state so we don't need to explicitly - * check to prevent usage of the arc_l2c_only list. - */ - if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && - (state != arc_anon)) { - multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - arc_evictable_space_increment(hdr, state); - } - return (cnt); -} - -/* - * Returns detailed information about a specific arc buffer. When the - * state_index argument is set the function will calculate the arc header - * list position for its arc state. Since this requires a linear traversal - * callers are strongly encourage not to do this. However, it can be helpful - * for targeted analysis so the functionality is provided. 
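The add_reference()/remove_reference() pair above gates eviction-list membership on the first and last hold: the 0-to-1 transition pulls the header off the evictable list, the 1-to-0 transition puts it back. A minimal userland model of that contract; the names, the 16K size and the single global counter are invented for the sketch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct toy_hdr {
	uint64_t size;
	int      refcnt;
	int      evictable;	/* 1 while counted as evictable */
};

static uint64_t evictable_bytes;

static void
toy_add_reference(struct toy_hdr *h)
{
	if (++h->refcnt == 1 && h->evictable) {
		h->evictable = 0;
		evictable_bytes -= h->size;	/* cf. arc_evictable_space_decrement() */
	}
}

static int
toy_remove_reference(struct toy_hdr *h)
{
	assert(h->refcnt > 0);
	if (--h->refcnt == 0) {
		h->evictable = 1;
		evictable_bytes += h->size;	/* cf. arc_evictable_space_increment() */
	}
	return (h->refcnt);
}

int
main(void)
{
	struct toy_hdr h = { .size = 16384, .refcnt = 0, .evictable = 1 };

	evictable_bytes = h.size;
	toy_add_reference(&h);			/* first hold: no longer evictable */
	printf("%ju\n", (uintmax_t)evictable_bytes);	/* 0 */
	toy_remove_reference(&h);		/* last hold dropped: evictable again */
	printf("%ju\n", (uintmax_t)evictable_bytes);	/* 16384 */
	return (0);
}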
- */ -void -arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) -{ - arc_buf_hdr_t *hdr = ab->b_hdr; - l1arc_buf_hdr_t *l1hdr = NULL; - l2arc_buf_hdr_t *l2hdr = NULL; - arc_state_t *state = NULL; - - memset(abi, 0, sizeof (arc_buf_info_t)); - - if (hdr == NULL) - return; - - abi->abi_flags = hdr->b_flags; - - if (HDR_HAS_L1HDR(hdr)) { - l1hdr = &hdr->b_l1hdr; - state = l1hdr->b_state; - } - if (HDR_HAS_L2HDR(hdr)) - l2hdr = &hdr->b_l2hdr; - - if (l1hdr) { - abi->abi_bufcnt = l1hdr->b_bufcnt; - abi->abi_access = l1hdr->b_arc_access; - abi->abi_mru_hits = l1hdr->b_mru_hits; - abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; - abi->abi_mfu_hits = l1hdr->b_mfu_hits; - abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; - abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); - } - - if (l2hdr) { - abi->abi_l2arc_dattr = l2hdr->b_daddr; - abi->abi_l2arc_hits = l2hdr->b_hits; - } - - abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; - abi->abi_state_contents = arc_buf_type(hdr); - abi->abi_size = arc_hdr_size(hdr); -} - -/* - * Move the supplied buffer to the indicated state. The hash lock - * for the buffer must be held by the caller. - */ -static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, - kmutex_t *hash_lock) -{ - arc_state_t *old_state; - int64_t refcnt; - uint32_t bufcnt; - boolean_t update_old, update_new; - arc_buf_contents_t buftype = arc_buf_type(hdr); - - /* - * We almost always have an L1 hdr here, since we call arc_hdr_realloc() - * in arc_read() when bringing a buffer out of the L2ARC. However, the - * L1 hdr doesn't always exist when we change state to arc_anon before - * destroying a header, in which case reallocating to add the L1 hdr is - * pointless. - */ - if (HDR_HAS_L1HDR(hdr)) { - old_state = hdr->b_l1hdr.b_state; - refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); - bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); - } else { - old_state = arc_l2c_only; - refcnt = 0; - bufcnt = 0; - update_old = B_FALSE; - } - update_new = update_old; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT3P(new_state, !=, old_state); - ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); - ASSERT(old_state != arc_anon || bufcnt <= 1); - - /* - * If this buffer is evictable, transfer it from the - * old state list to the new state list. - */ - if (refcnt == 0) { - if (old_state != arc_anon && old_state != arc_l2c_only) { - ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_remove(old_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_old = B_TRUE; - } - arc_evictable_space_decrement(hdr, old_state); - } - if (new_state != arc_anon && new_state != arc_l2c_only) { - - /* - * An L1 header always exists here, since if we're - * moving to some L1-cached state (i.e. not l2c_only or - * anonymous), we realloc the header to add an L1hdr - * beforehand. 
- */ - ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(new_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_new = B_TRUE; - } - arc_evictable_space_increment(hdr, new_state); - } - } - - ASSERT(!HDR_EMPTY(hdr)); - if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) - buf_hash_remove(hdr); - - /* adjust state sizes (ignore arc_l2c_only) */ - - if (update_new && new_state != arc_l2c_only) { - ASSERT(HDR_HAS_L1HDR(hdr)); - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - - /* - * When moving a header to a ghost state, we first - * remove all arc buffers. Thus, we'll have a - * bufcnt of zero, and no arc buffer to use for - * the reference. As a result, we use the arc - * header pointer for the reference. - */ - (void) zfs_refcount_add_many(&new_state->arcs_size, - HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - } else { - uint32_t buffers = 0; - - /* - * Each individual buffer holds a unique reference, - * thus we must remove each of these references one - * at a time. - */ - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; - - /* - * When the arc_buf_t is sharing the data - * block with the hdr, the owner of the - * reference belongs to the hdr. Only - * add to the refcount if the arc_buf_t is - * not shared. - */ - if (arc_buf_is_shared(buf)) - continue; - - (void) zfs_refcount_add_many( - &new_state->arcs_size, - arc_buf_size(buf), buf); - } - ASSERT3U(bufcnt, ==, buffers); - - if (hdr->b_l1hdr.b_pabd != NULL) { - (void) zfs_refcount_add_many( - &new_state->arcs_size, - arc_hdr_size(hdr), hdr); - } else { - ASSERT(GHOST_STATE(old_state)); - } - } - } - - if (update_old && old_state != arc_l2c_only) { - ASSERT(HDR_HAS_L1HDR(hdr)); - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - - /* - * When moving a header off of a ghost state, - * the header will not contain any arc buffers. - * We use the arc header pointer for the reference - * which is exactly what we did when we put the - * header on the ghost state. - */ - - (void) zfs_refcount_remove_many(&old_state->arcs_size, - HDR_GET_LSIZE(hdr), hdr); - } else { - uint32_t buffers = 0; - - /* - * Each individual buffer holds a unique reference, - * thus we must remove each of these references one - * at a time. - */ - for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; - buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; - - /* - * When the arc_buf_t is sharing the data - * block with the hdr, the owner of the - * reference belongs to the hdr. Only - * add to the refcount if the arc_buf_t is - * not shared. - */ - if (arc_buf_is_shared(buf)) - continue; - - (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_buf_size(buf), - buf); - } - ASSERT3U(bufcnt, ==, buffers); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), hdr); - } - } - - if (HDR_HAS_L1HDR(hdr)) - hdr->b_l1hdr.b_state = new_state; - - /* - * L2 headers should never be on the L2 state list since they don't - * have L1 headers allocated. 
- */ - ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && - multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); -} - -void -arc_space_consume(uint64_t space, arc_space_type_t type) -{ - ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); - - switch (type) { - case ARC_SPACE_DATA: - aggsum_add(&astat_data_size, space); - break; - case ARC_SPACE_META: - aggsum_add(&astat_metadata_size, space); - break; - case ARC_SPACE_BONUS: - aggsum_add(&astat_bonus_size, space); - break; - case ARC_SPACE_DNODE: - aggsum_add(&astat_dnode_size, space); - break; - case ARC_SPACE_DBUF: - aggsum_add(&astat_dbuf_size, space); - break; - case ARC_SPACE_HDRS: - aggsum_add(&astat_hdr_size, space); - break; - case ARC_SPACE_L2HDRS: - aggsum_add(&astat_l2_hdr_size, space); - break; - } - - if (type != ARC_SPACE_DATA) - aggsum_add(&arc_meta_used, space); - - aggsum_add(&arc_size, space); -} - -void -arc_space_return(uint64_t space, arc_space_type_t type) -{ - ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); - - switch (type) { - case ARC_SPACE_DATA: - aggsum_add(&astat_data_size, -space); - break; - case ARC_SPACE_META: - aggsum_add(&astat_metadata_size, -space); - break; - case ARC_SPACE_BONUS: - aggsum_add(&astat_bonus_size, -space); - break; - case ARC_SPACE_DNODE: - aggsum_add(&astat_dnode_size, -space); - break; - case ARC_SPACE_DBUF: - aggsum_add(&astat_dbuf_size, -space); - break; - case ARC_SPACE_HDRS: - aggsum_add(&astat_hdr_size, -space); - break; - case ARC_SPACE_L2HDRS: - aggsum_add(&astat_l2_hdr_size, -space); - break; - } - - if (type != ARC_SPACE_DATA) { - ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); - /* - * We use the upper bound here rather than the precise value - * because the arc_meta_max value doesn't need to be - * precise. It's only consumed by humans via arcstats. - */ - if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) - arc_meta_max = aggsum_upper_bound(&arc_meta_used); - aggsum_add(&arc_meta_used, -space); - } - - ASSERT(aggsum_compare(&arc_size, space) >= 0); - aggsum_add(&arc_size, -space); -} - -/* - * Given a hdr and a buf, returns whether that buf can share its b_data buffer - * with the hdr's b_pabd. - */ -static boolean_t -arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - /* - * The criteria for sharing a hdr's data are: - * 1. the hdr's compression matches the buf's compression - * 2. the hdr doesn't need to be byteswapped - * 3. the hdr isn't already being shared - * 4. the buf is either compressed or it is the last buf in the hdr list - * - * Criterion #4 maintains the invariant that shared uncompressed - * bufs must be the final buf in the hdr's b_buf list. Reading this, you - * might ask, "if a compressed buf is allocated first, won't that be the - * last thing in the list?", but in that case it's impossible to create - * a shared uncompressed buf anyway (because the hdr must be compressed - * to have the compressed buf). You might also think that #3 is - * sufficient to make this guarantee, however it's possible - * (specifically in the rare L2ARC write race mentioned in - * arc_buf_alloc_impl()) there will be an existing uncompressed buf that - * is sharable, but wasn't at the time of its allocation. Rather than - * allow a new shared uncompressed buf to be created and then shuffle - * the list around to make it the last element, this simply disallows - * sharing if the new buf isn't the first to be added. 
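The bookkeeping in arc_space_consume()/arc_space_return() above reduces to three counters per call. As a worked example with a hypothetical 8K metadata allocation:

	arc_space_consume(8192, ARC_SPACE_META)
	    astat_metadata_size += 8192
	    arc_meta_used       += 8192    (every type except ARC_SPACE_DATA)
	    arc_size            += 8192

The matching arc_space_return(8192, ARC_SPACE_META) subtracts the same three amounts, first raising arc_meta_max if the upper bound of arc_meta_used set a new high-water mark.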
- */ - ASSERT3P(buf->b_hdr, ==, hdr); - boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; - boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; - return (buf_compressed == hdr_compressed && - hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && - !HDR_SHARED_DATA(hdr) && - (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); -} - -/* - * Allocate a buf for this hdr. If you care about the data that's in the hdr, - * or if you want a compressed buffer, pass those flags in. Returns 0 if the - * copy was made successfully, or an error code otherwise. - */ -static int -arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, - boolean_t fill, arc_buf_t **ret) -{ - arc_buf_t *buf; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); - VERIFY(hdr->b_type == ARC_BUFC_DATA || - hdr->b_type == ARC_BUFC_METADATA); - ASSERT3P(ret, !=, NULL); - ASSERT3P(*ret, ==, NULL); - - buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - buf->b_flags = 0; - - add_reference(hdr, tag); - - /* - * We're about to change the hdr's b_flags. We must either - * hold the hash_lock or be undiscoverable. - */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * Only honor requests for compressed bufs if the hdr is actually - * compressed. - */ - if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) - buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; - - /* - * If the hdr's data can be shared then we share the data buffer and - * set the appropriate bit in the hdr's b_flags to indicate the hdr is - * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new - * buffer to store the buf's data. - * - * There are two additional restrictions here because we're sharing - * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be - * actively involved in an L2ARC write, because if this buf is used by - * an arc_write() then the hdr's data buffer will be released when the - * write completes, even though the L2ARC write might still be using it. - * Second, the hdr's ABD must be linear so that the buf's user doesn't - * need to be ABD-aware. - */ - boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && - abd_is_linear(hdr->b_l1hdr.b_pabd); - - /* Set up b_data and sharing */ - if (can_share) { - buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); - buf->b_flags |= ARC_BUF_FLAG_SHARED; - arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); - } else { - buf->b_data = - arc_get_data_buf(hdr, arc_buf_size(buf), buf); - ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); - } - VERIFY3P(buf->b_data, !=, NULL); - - hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_bufcnt += 1; - - /* - * If the user wants the data from the hdr, we need to either copy or - * decompress the data. - */ - if (fill) { - return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); - } - - return (0); -} - -static char *arc_onloan_tag = "onloan"; - -static inline void -arc_loaned_bytes_update(int64_t delta) -{ - atomic_add_64(&arc_loaned_bytes, delta); - - /* assert that it did not wrap around */ - ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); -} - -/* - * Loan out an anonymous arc buffer. Loaned buffers are not counted as in - * flight data by arc_tempreserve_space() until they are "returned". Loaned - * buffers must be returned to the arc before they can be used by the DMU or - * freed. 
- */ -arc_buf_t * -arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) -{ - arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, - is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); - - arc_loaned_bytes_update(arc_buf_size(buf)); - - return (buf); -} - -arc_buf_t * -arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) -{ - arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, - psize, lsize, compression_type); - - arc_loaned_bytes_update(arc_buf_size(buf)); - - return (buf); -} - - -/* - * Return a loaned arc buffer to the arc. - */ -void -arc_return_buf(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(buf->b_data, !=, NULL); - ASSERT(HDR_HAS_L1HDR(hdr)); - (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); - (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - - arc_loaned_bytes_update(-arc_buf_size(buf)); -} - -/* Detach an arc_buf from a dbuf (tag) */ -void -arc_loan_inuse_buf(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(buf->b_data, !=, NULL); - ASSERT(HDR_HAS_L1HDR(hdr)); - (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - - arc_loaned_bytes_update(arc_buf_size(buf)); -} - -static void -l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) -{ - l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - - df->l2df_abd = abd; - df->l2df_size = size; - df->l2df_type = type; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); -} - -static void -arc_hdr_free_on_write(arc_buf_hdr_t *hdr) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t size = arc_hdr_size(hdr); - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - size, hdr); - } - (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); - if (type == ARC_BUFC_METADATA) { - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_space_return(size, ARC_SPACE_DATA); - } - - l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); -} - -/* - * Share the arc_buf_t's data with the hdr. Whenever we are sharing the - * data buffer, we transfer the refcount ownership to the hdr and update - * the appropriate kstats. - */ -static void -arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - - ASSERT(arc_can_share(hdr, buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * Start sharing the data buffer. We transfer the - * refcount ownership to the hdr since it always owns - * the refcount whenever an arc_buf_t is shared. - */ - zfs_refcount_transfer_ownership(&state->arcs_size, buf, hdr); - hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); - abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, - HDR_ISTYPE_METADATA(hdr)); - arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); - buf->b_flags |= ARC_BUF_FLAG_SHARED; - - /* - * Since we've transferred ownership to the hdr we need - * to increment its compressed and uncompressed kstats and - * decrement the overhead size. 
- */ - ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); -} - -static void -arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - - ASSERT(arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - /* - * We are no longer sharing this buffer so we need - * to transfer its ownership to the rightful owner. - */ - zfs_refcount_transfer_ownership(&state->arcs_size, hdr, buf); - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); - abd_put(hdr->b_l1hdr.b_pabd); - hdr->b_l1hdr.b_pabd = NULL; - buf->b_flags &= ~ARC_BUF_FLAG_SHARED; - - /* - * Since the buffer is no longer shared between - * the arc buf and the hdr, count it as overhead. - */ - ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); -} - -/* - * Remove an arc_buf_t from the hdr's buf list and return the last - * arc_buf_t on the list. If no buffers remain on the list then return - * NULL. - */ -static arc_buf_t * -arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; - arc_buf_t *lastbuf = NULL; - - /* - * Remove the buf from the hdr list and locate the last - * remaining buffer on the list. - */ - while (*bufp != NULL) { - if (*bufp == buf) - *bufp = buf->b_next; - - /* - * If we've removed a buffer in the middle of - * the list then update the lastbuf and update - * bufp. - */ - if (*bufp != NULL) { - lastbuf = *bufp; - bufp = &(*bufp)->b_next; - } - } - buf->b_next = NULL; - ASSERT3P(lastbuf, !=, buf); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); - IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); - - return (lastbuf); -} - -/* - * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's - * list and free it. - */ -static void -arc_buf_destroy_impl(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - /* - * Free up the data associated with the buf but only if we're not - * sharing this with the hdr. If we are sharing it with the hdr, the - * hdr is responsible for doing the free. - */ - if (buf->b_data != NULL) { - /* - * We're about to change the hdr's b_flags. We must either - * hold the hash_lock or be undiscoverable. - */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - - arc_cksum_verify(buf); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - - if (arc_buf_is_shared(buf)) { - arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - } else { - uint64_t size = arc_buf_size(buf); - arc_free_data_buf(hdr, buf->b_data, size, buf); - ARCSTAT_INCR(arcstat_overhead_size, -size); - } - buf->b_data = NULL; - - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - hdr->b_l1hdr.b_bufcnt -= 1; - } - - arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); - - if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { - /* - * If the current arc_buf_t is sharing its data buffer with the - * hdr, then reassign the hdr's b_pabd to share it with the new - * buffer at the end of the list. The shared buffer is always - * the last one on the hdr's buffer list. 
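arc_buf_remove() above is a pointer-to-pointer unlink that also tracks the last surviving element of the list. The same pattern, reduced to a self-contained toy with invented types and ids:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct buf {
	int id;
	struct buf *next;
};

static struct buf *
buf_remove(struct buf **headp, struct buf *victim)
{
	struct buf **bufp = headp;
	struct buf *last = NULL;

	while (*bufp != NULL) {
		if (*bufp == victim)
			*bufp = victim->next;	/* splice the victim out */
		if (*bufp != NULL) {		/* remember the surviving tail */
			last = *bufp;
			bufp = &(*bufp)->next;
		}
	}
	victim->next = NULL;
	return (last);
}

int
main(void)
{
	struct buf c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct buf *head = &a;
	struct buf *last = buf_remove(&head, &b);

	printf("last remaining: %d\n", last->id);	/* 3 */
	assert(head == &a && a.next == &c);
	return (0);
}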
- * - * There is an equivalent case for compressed bufs, but since - * they aren't guaranteed to be the last buf in the list and - * that is an exceedingly rare case, we just allow that space be - * wasted temporarily. - */ - if (lastbuf != NULL) { - /* Only one buf can be shared at once */ - VERIFY(!arc_buf_is_shared(lastbuf)); - /* hdr is uncompressed so can't have compressed buf */ - VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); - - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - arc_hdr_free_pabd(hdr); - - /* - * We must setup a new shared block between the - * last buffer and the hdr. The data would have - * been allocated by the arc buf so we need to transfer - * ownership to the hdr since it's now being shared. - */ - arc_share_buf(hdr, lastbuf); - } - } else if (HDR_SHARED_DATA(hdr)) { - /* - * Uncompressed shared buffers are always at the end - * of the list. Compressed buffers don't have the - * same requirements. This makes it hard to - * simply assert that the lastbuf is shared so - * we rely on the hdr's compression flags to determine - * if we have a compressed, shared buffer. - */ - ASSERT3P(lastbuf, !=, NULL); - ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); - } - - /* - * Free the checksum if we're removing the last uncompressed buf from - * this hdr. - */ - if (!arc_hdr_has_uncompressed_buf(hdr)) { - arc_cksum_free(hdr); - } - - /* clean up the buf */ - buf->b_hdr = NULL; - kmem_cache_free(buf_cache, buf); -} - -static void -arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t do_adapt) -{ - ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_SHARED_DATA(hdr)); - - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, do_adapt); - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - - ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); -} - -static void -arc_hdr_free_pabd(arc_buf_hdr_t *hdr) -{ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - - /* - * If the hdr is currently being written to the l2arc then - * we defer freeing the data by adding it to the l2arc_free_on_write - * list. The l2arc will free the data once it's finished - * writing it to the l2arc device. 
- */ - if (HDR_L2_WRITING(hdr)) { - arc_hdr_free_on_write(hdr); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, - arc_hdr_size(hdr), hdr); - } - hdr->b_l1hdr.b_pabd = NULL; - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - - ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); -} - -static arc_buf_hdr_t * -arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - enum zio_compress compression_type, arc_buf_contents_t type) -{ - arc_buf_hdr_t *hdr; - - VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); - - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - ASSERT(HDR_EMPTY(hdr)); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); - HDR_SET_PSIZE(hdr, psize); - HDR_SET_LSIZE(hdr, lsize); - hdr->b_spa = spa; - hdr->b_type = type; - hdr->b_flags = 0; - arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); - arc_hdr_set_compress(hdr, compression_type); - - hdr->b_l1hdr.b_state = arc_anon; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_bufcnt = 0; - hdr->b_l1hdr.b_buf = NULL; - - /* - * Allocate the hdr's buffer. This will contain either - * the compressed or uncompressed data depending on the block - * it references and compressed arc enablement. - */ - arc_hdr_alloc_pabd(hdr, B_TRUE); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - - return (hdr); -} - -/* - * Transition between the two allocation states for the arc_buf_hdr struct. - * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without - * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller - * version is used when a cache buffer is only in the L2ARC in order to reduce - * memory usage. - */ -static arc_buf_hdr_t * -arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) -{ - ASSERT(HDR_HAS_L2HDR(hdr)); - - arc_buf_hdr_t *nhdr; - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - - ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || - (old == hdr_l2only_cache && new == hdr_full_cache)); - - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - - ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); - buf_hash_remove(hdr); - - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - - if (new == hdr_full_cache) { - arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); - /* - * arc_access and arc_change_state need to be aware that a - * header has just come out of L2ARC, so we set its state to - * l2c_only even though it's about to change. - */ - nhdr->b_l1hdr.b_state = arc_l2c_only; - - /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); - } else { - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - - /* - * If we've reached here, We must have been called from - * arc_evict_hdr(), as such we should have already been - * removed from any ghost list we were previously on - * (which protects us from racing with arc_evict_state), - * thus no locking is needed during this check. - */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - - /* - * A buffer must not be moved into the arc_l2c_only - * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_pabd field - * might try to be accessed, even though it was removed. 
- */ - VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); - -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif - - arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); - } - /* - * The header has been reallocated so we need to re-insert it into any - * lists it was on. - */ - (void) buf_hash_insert(nhdr, NULL); - - ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - - mutex_enter(&dev->l2ad_mtx); - - /* - * We must place the realloc'ed header back into the list at - * the same spot. Otherwise, if it's placed earlier in the list, - * l2arc_write_buffers() could find it during the function's - * write phase, and try to write it out to the l2arc. - */ - list_insert_after(&dev->l2ad_buflist, hdr, nhdr); - list_remove(&dev->l2ad_buflist, hdr); - - mutex_exit(&dev->l2ad_mtx); - - /* - * Since we're using the pointer address as the tag when - * incrementing and decrementing the l2ad_alloc refcount, we - * must remove the old pointer (that we're about to destroy) and - * add the new pointer to the refcount. Otherwise we'd remove - * the wrong pointer address when calling arc_hdr_destroy() later. - */ - - (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), - hdr); - (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), - nhdr); - - buf_discard_identity(hdr); - kmem_cache_free(old, hdr); - - return (nhdr); -} - -/* - * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. - * The buf is returned thawed since we expect the consumer to modify it. - */ -arc_buf_t * -arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) -{ - arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - ZIO_COMPRESS_OFF, type); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - - arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); - arc_buf_thaw(buf); - - return (buf); -} - -/* - * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this - * for bufs containing metadata. - */ -arc_buf_t * -arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type) -{ - ASSERT3U(lsize, >, 0); - ASSERT3U(lsize, >=, psize); - ASSERT(compression_type > ZIO_COMPRESS_OFF); - ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); - - arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - compression_type, ARC_BUFC_DATA); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); - - arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); - arc_buf_thaw(buf); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - - if (!arc_buf_is_shared(buf)) { - /* - * To ensure that the hdr has the correct data in it if we call - * arc_decompress() on this buf before it's been written to - * disk, it's easiest if we just set up sharing between the - * buf and the hdr. 
- */ - ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); - arc_hdr_free_pabd(hdr); - arc_share_buf(hdr, buf); - } - - return (buf); -} - -static void -arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) -{ - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - l2arc_dev_t *dev = l2hdr->b_dev; - uint64_t psize = arc_hdr_size(hdr); - - ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); - ASSERT(HDR_HAS_L2HDR(hdr)); - - list_remove(&dev->l2ad_buflist, hdr); - - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - - vdev_space_update(dev->l2ad_vdev, -psize, 0, 0); - - (void) zfs_refcount_remove_many(&dev->l2ad_alloc, psize, hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); -} - -static void -arc_hdr_destroy(arc_buf_hdr_t *hdr) -{ - if (HDR_HAS_L1HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_bufcnt > 0); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - } - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - - if (!HDR_EMPTY(hdr)) - buf_discard_identity(hdr); - - if (HDR_HAS_L2HDR(hdr)) { - l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; - boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); - - if (!buflist_held) - mutex_enter(&dev->l2ad_mtx); - - /* - * Even though we checked this conditional above, we - * need to check this again now that we have the - * l2ad_mtx. This is because we could be racing with - * another thread calling l2arc_evict() which might have - * destroyed this header's L2 portion as we were waiting - * to acquire the l2ad_mtx. If that happens, we don't - * want to re-destroy the header's L2 portion. - */ - if (HDR_HAS_L2HDR(hdr)) { - l2arc_trim(hdr); - arc_hdr_l2hdr_destroy(hdr); - } - - if (!buflist_held) - mutex_exit(&dev->l2ad_mtx); - } - - if (HDR_HAS_L1HDR(hdr)) { - arc_cksum_free(hdr); - - while (hdr->b_l1hdr.b_buf != NULL) - arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); - -#ifdef ZFS_DEBUG - if (hdr->b_l1hdr.b_thawed != NULL) { - kmem_free(hdr->b_l1hdr.b_thawed, 1); - hdr->b_l1hdr.b_thawed = NULL; - } -#endif - - if (hdr->b_l1hdr.b_pabd != NULL) { - arc_hdr_free_pabd(hdr); - } - } - - ASSERT3P(hdr->b_hash_next, ==, NULL); - if (HDR_HAS_L1HDR(hdr)) { - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - kmem_cache_free(hdr_full_cache, hdr); - } else { - kmem_cache_free(hdr_l2only_cache, hdr); - } -} - -void -arc_buf_destroy(arc_buf_t *buf, void* tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); - - if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - VERIFY0(remove_reference(hdr, NULL, tag)); - arc_hdr_destroy(hdr); - return; - } - - mutex_enter(hash_lock); - ASSERT3P(hdr, ==, buf->b_hdr); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); - ASSERT3P(buf->b_data, !=, NULL); - - (void) remove_reference(hdr, hash_lock, tag); - arc_buf_destroy_impl(buf); - mutex_exit(hash_lock); -} - -/* - * Evict the arc_buf_hdr that is provided as a parameter. The resultant - * state of the header is dependent on its state prior to entering this - * function. 
The following transitions are possible: - * - * - arc_mru -> arc_mru_ghost - * - arc_mfu -> arc_mfu_ghost - * - arc_mru_ghost -> arc_l2c_only - * - arc_mru_ghost -> deleted - * - arc_mfu_ghost -> arc_l2c_only - * - arc_mfu_ghost -> deleted - */ -static int64_t -arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) -{ - arc_state_t *evicted_state, *state; - int64_t bytes_evicted = 0; - int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? - zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT(HDR_HAS_L1HDR(hdr)); - - state = hdr->b_l1hdr.b_state; - if (GHOST_STATE(state)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - - /* - * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_pabd field) during it's write phase. - * Thus, we cannot push a header onto the arc_l2c_only - * state (removing it's L1 piece) until the header is - * done being written to the l2arc. - */ - if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { - ARCSTAT_BUMP(arcstat_evict_l2_skip); - return (bytes_evicted); - } - - ARCSTAT_BUMP(arcstat_deleted); - bytes_evicted += HDR_GET_LSIZE(hdr); - - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - if (HDR_HAS_L2HDR(hdr)) { - /* - * This buffer is cached on the 2nd Level ARC; - * don't destroy the header. - */ - arc_change_state(arc_l2c_only, hdr, hash_lock); - /* - * dropping from L1+L2 cached to L2-only, - * realloc to remove the L1 header. - */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, - hdr_l2only_cache); - } else { - arc_change_state(arc_anon, hdr, hash_lock); - arc_hdr_destroy(hdr); - } - return (bytes_evicted); - } - - ASSERT(state == arc_mru || state == arc_mfu); - evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; - - /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { - ARCSTAT_BUMP(arcstat_evict_skip); - return (bytes_evicted); - } - - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - if (!mutex_tryenter(&buf->b_evict_lock)) { - ARCSTAT_BUMP(arcstat_mutex_miss); - break; - } - if (buf->b_data != NULL) - bytes_evicted += HDR_GET_LSIZE(hdr); - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy_impl(buf); - } - - if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); - } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_eligible, - HDR_GET_LSIZE(hdr)); - } else { - ARCSTAT_INCR(arcstat_evict_l2_ineligible, - HDR_GET_LSIZE(hdr)); - } - } - - if (hdr->b_l1hdr.b_bufcnt == 0) { - arc_cksum_free(hdr); - - bytes_evicted += arc_hdr_size(hdr); - - /* - * If this hdr is being evicted and has a compressed - * buffer then we discard it here before we change states. - * This ensures that the accounting is updated correctly - * in arc_free_data_impl(). 
- */ - arc_hdr_free_pabd(hdr); - - arc_change_state(evicted_state, hdr, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); - } - - return (bytes_evicted); -} - -static uint64_t -arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, - uint64_t spa, int64_t bytes) -{ - multilist_sublist_t *mls; - uint64_t bytes_evicted = 0; - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - int evict_count = 0; - - ASSERT3P(marker, !=, NULL); - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); - - mls = multilist_sublist_lock(ml, idx); - - for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; - hdr = multilist_sublist_prev(mls, marker)) { - if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || - (evict_count >= zfs_arc_evict_batch_limit)) - break; - - /* - * To keep our iteration location, move the marker - * forward. Since we're not holding hdr's hash lock, we - * must be very careful and not remove 'hdr' from the - * sublist. Otherwise, other consumers might mistake the - * 'hdr' as not being on a sublist when they call the - * multilist_link_active() function (they all rely on - * the hash lock protecting concurrent insertions and - * removals). multilist_sublist_move_forward() was - * specifically implemented to ensure this is the case - * (only 'marker' will be removed and re-inserted). - */ - multilist_sublist_move_forward(mls, marker); - - /* - * The only case where the b_spa field should ever be - * zero, is the marker headers inserted by - * arc_evict_state(). It's possible for multiple threads - * to be calling arc_evict_state() concurrently (e.g. - * dsl_pool_close() and zio_inject_fault()), so we must - * skip any markers we see from these other threads. - */ - if (hdr->b_spa == 0) - continue; - - /* we're only interested in evicting buffers of a certain spa */ - if (spa != 0 && hdr->b_spa != spa) { - ARCSTAT_BUMP(arcstat_evict_skip); - continue; - } - - hash_lock = HDR_LOCK(hdr); - - /* - * We aren't calling this function from any code path - * that would already be holding a hash lock, so we're - * asserting on this assumption to be defensive in case - * this ever changes. Without this check, it would be - * possible to incorrectly increment arcstat_mutex_miss - * below (e.g. if the code changed such that we called - * this function with a hash lock held). - */ - ASSERT(!MUTEX_HELD(hash_lock)); - - if (mutex_tryenter(hash_lock)) { - uint64_t evicted = arc_evict_hdr(hdr, hash_lock); - mutex_exit(hash_lock); - - bytes_evicted += evicted; - - /* - * If evicted is zero, arc_evict_hdr() must have - * decided to skip this header, don't increment - * evict_count in this case. - */ - if (evicted != 0) - evict_count++; - - /* - * If arc_size isn't overflowing, signal any - * threads that might happen to be waiting. - * - * For each header evicted, we wake up a single - * thread. If we used cv_broadcast, we could - * wake up "too many" threads causing arc_size - * to significantly overflow arc_c; since - * arc_get_data_impl() doesn't check for overflow - * when it's woken up (it doesn't because it's - * possible for the ARC to be overflowing while - * full of un-evictable buffers, and the - * function should proceed in this case). - * - * If threads are left sleeping, due to not - * using cv_broadcast here, they will be woken - * up via cv_broadcast in arc_adjust_cb() just - * before arc_adjust_zthr sleeps. 
- */ - mutex_enter(&arc_adjust_lock); - if (!arc_is_overflowing()) - cv_signal(&arc_adjust_waiters_cv); - mutex_exit(&arc_adjust_lock); - } else { - ARCSTAT_BUMP(arcstat_mutex_miss); - } - } - - multilist_sublist_unlock(mls); - - return (bytes_evicted); -} - -/* - * Evict buffers from the given arc state, until we've removed the - * specified number of bytes. Move the removed buffers to the - * appropriate evict state. - * - * This function makes a "best effort". It skips over any buffers - * it can't get a hash_lock on, and so, may not catch all candidates. - * It may also return without evicting as much space as requested. - * - * If bytes is specified using the special value ARC_EVICT_ALL, this - * will evict all available (i.e. unlocked and evictable) buffers from - * the given arc state; which is used by arc_flush(). - */ -static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) -{ - uint64_t total_evicted = 0; - multilist_t *ml = state->arcs_list[type]; - int num_sublists; - arc_buf_hdr_t **markers; - - IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); - - num_sublists = multilist_get_num_sublists(ml); - - /* - * If we've tried to evict from each sublist, made some - * progress, but still have not hit the target number of bytes - * to evict, we want to keep trying. The markers allow us to - * pick up where we left off for each individual sublist, rather - * than starting from the tail each time. - */ - markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); - for (int i = 0; i < num_sublists; i++) { - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_adjust_type() and - * arc_evict_state_impl(). - */ - markers[i]->b_spa = 0; - - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); - multilist_sublist_insert_tail(mls, markers[i]); - multilist_sublist_unlock(mls); - } - - /* - * While we haven't hit our target number of bytes to evict, or - * we're evicting all available buffers. - */ - while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { - int sublist_idx = multilist_get_random_index(ml); - uint64_t scan_evicted = 0; - - /* - * Try to reduce pinned dnodes with a floor of arc_dnode_limit. - * Request that 10% of the LRUs be scanned by the superblock - * shrinker. - */ - if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size, - arc_dnode_limit) > 0) { - arc_prune_async((aggsum_upper_bound(&astat_dnode_size) - - arc_dnode_limit) / sizeof (dnode_t) / - zfs_arc_dnode_reduce_percent); - } - - /* - * Start eviction using a randomly selected sublist, - * this is to try and evenly balance eviction across all - * sublists. Always starting at the same sublist - * (e.g. index 0) would cause evictions to favor certain - * sublists over others. 
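The marker walk in arc_evict_state_impl() above can be seen in isolation with FreeBSD's <sys/queue.h>: iterate from the tail toward the head, and keep your place by sliding a marker node past each element visited, so the list lock may be dropped between steps without losing the position. Everything below (the node type, using spa == 0 as the marker sentinel, the list contents) is invented for the sketch:

#include <stdio.h>
#include <sys/queue.h>

struct node {
	int spa;			/* 0 means "this is a marker" */
	TAILQ_ENTRY(node) link;
};
TAILQ_HEAD(nodelist, node);

int
main(void)
{
	struct nodelist list = TAILQ_HEAD_INITIALIZER(list);
	struct node nodes[4], marker = { .spa = 0 }, *hdr;

	for (int i = 0; i < 4; i++) {
		nodes[i].spa = i + 1;
		TAILQ_INSERT_TAIL(&list, &nodes[i], link);
	}
	TAILQ_INSERT_TAIL(&list, &marker, link);

	while ((hdr = TAILQ_PREV(&marker, nodelist, link)) != NULL) {
		/* Keep our place: slide the marker past the node just seen. */
		TAILQ_REMOVE(&list, &marker, link);
		TAILQ_INSERT_BEFORE(hdr, &marker, link);
		if (hdr->spa == 0)
			continue;	/* skip other threads' markers */
		printf("visiting spa %d\n", hdr->spa);	/* 4, 3, 2, 1 */
	}
	return (0);
}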
- */ - for (int i = 0; i < num_sublists; i++) { - uint64_t bytes_remaining; - uint64_t bytes_evicted; - - if (bytes == ARC_EVICT_ALL) - bytes_remaining = ARC_EVICT_ALL; - else if (total_evicted < bytes) - bytes_remaining = bytes - total_evicted; - else - break; - - bytes_evicted = arc_evict_state_impl(ml, sublist_idx, - markers[sublist_idx], spa, bytes_remaining); - - scan_evicted += bytes_evicted; - total_evicted += bytes_evicted; - - /* we've reached the end, wrap to the beginning */ - if (++sublist_idx >= num_sublists) - sublist_idx = 0; - } - - /* - * If we didn't evict anything during this scan, we have - * no reason to believe we'll evict more during another - * scan, so break the loop. - */ - if (scan_evicted == 0) { - /* This isn't possible, let's make that obvious */ - ASSERT3S(bytes, !=, 0); - - /* - * When bytes is ARC_EVICT_ALL, the only way to - * break the loop is when scan_evicted is zero. - * In that case, we actually have evicted enough, - * so we don't want to increment the kstat. - */ - if (bytes != ARC_EVICT_ALL) { - ASSERT3S(total_evicted, <, bytes); - ARCSTAT_BUMP(arcstat_evict_not_enough); - } - - break; - } - } - - for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); - multilist_sublist_remove(mls, markers[i]); - multilist_sublist_unlock(mls); - - kmem_cache_free(hdr_full_cache, markers[i]); - } - kmem_free(markers, sizeof (*markers) * num_sublists); - - return (total_evicted); -} - -/* - * Flush all "evictable" data of the given type from the arc state - * specified. This will not evict any "active" buffers (i.e. referenced). - * - * When 'retry' is set to B_FALSE, the function will make a single pass - * over the state and evict any buffers that it can. Since it doesn't - * continually retry the eviction, it might end up leaving some buffers - * in the ARC due to lock misses. - * - * When 'retry' is set to B_TRUE, the function will continually retry the - * eviction until *all* evictable buffers have been removed from the - * state. As a result, if concurrent insertions into the state are - * allowed (e.g. if the ARC isn't shutting down), this function might - * wind up in an infinite loop, continually trying to evict buffers. - */ -static uint64_t -arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, - boolean_t retry) -{ - uint64_t evicted = 0; - - while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { - evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); - - if (!retry) - break; - } - - return (evicted); -} - -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. - */ -static void -arc_prune_task(void *ptr) -{ - arc_prune_t *ap = (arc_prune_t *)ptr; - arc_prune_func_t *func = ap->p_pfunc; - - if (func != NULL) - func(ap->p_adjust, ap->p_private); - - zfs_refcount_remove(&ap->p_refcnt, func); -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. - * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. 
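The shape of the arc_prune_async()/arc_prune_task() hand-off described above, modelled with a plain counter instead of the refcount and taskq machinery; prune_reg, prune_async and the callback are invented names, and the task runs inline here rather than from a real task queue:

#include <stdio.h>

typedef void prune_func_t(int64_t, void *);

struct prune_reg {
	prune_func_t *func;
	void         *arg;
	int           refs;	/* 1 = registered, 2 = a prune is in flight */
	int64_t       adjust;
};

static void
prune_task(struct prune_reg *p)		/* would run from the "taskq" */
{
	if (p->func != NULL)
		p->func(p->adjust, p->arg);
	p->refs--;			/* release the dispatch hold */
}

static void
prune_async(struct prune_reg *p, int64_t adjust)
{
	if (p->refs >= 2)		/* a previous request is still pending */
		return;
	p->refs++;			/* hold the registration while queued */
	p->adjust = adjust;
	prune_task(p);			/* inline here; asynchronous in the ARC */
}

static void
my_prune_cb(int64_t adjust, void *arg)
{
	printf("%s: asked to drop %jd objects\n", (char *)arg, (intmax_t)adjust);
}

int
main(void)
{
	struct prune_reg reg = { my_prune_cb, "demo", 1, 0 };

	prune_async(&reg, 1000);
	return (0);
}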
- */ -static void -arc_prune_async(int64_t adjust) -{ - arc_prune_t *ap; - - mutex_enter(&arc_prune_mtx); - for (ap = list_head(&arc_prune_list); ap != NULL; - ap = list_next(&arc_prune_list, ap)) { - - if (zfs_refcount_count(&ap->p_refcnt) >= 2) - continue; - - zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); - ap->p_adjust = adjust; - if (taskq_dispatch(arc_prune_taskq, arc_prune_task, - ap, TQ_SLEEP) == TASKQID_INVALID) { - zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); - continue; - } - ARCSTAT_BUMP(arcstat_prune); - } - mutex_exit(&arc_prune_mtx); -} - -/* - * Evict the specified number of bytes from the state specified, - * restricting eviction to the spa and type given. This function - * prevents us from trying to evict more from a state's list than - * is "evictable", and to skip evicting altogether when passed a - * negative value for "bytes". In contrast, arc_evict_state() will - * evict everything it can, when passed a negative value for "bytes". - */ -static uint64_t -arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) -{ - int64_t delta; - - if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), - bytes); - return (arc_evict_state(state, spa, delta, type)); - } - - return (0); -} - -/* - * The goal of this function is to evict enough meta data buffers from the - * ARC in order to enforce the arc_meta_limit. Achieving this is slightly - * more complicated than it appears because it is common for data buffers - * to have holds on meta data buffers. In addition, dnode meta data buffers - * will be held by the dnodes in the block preventing them from being freed. - * This means we can't simply traverse the ARC and expect to always find - * enough unheld meta data buffer to release. - * - * Therefore, this function has been updated to make alternating passes - * over the ARC releasing data buffers and then newly unheld meta data - * buffers. This ensures forward progress is maintained and meta_used - * will decrease. Normally this is sufficient, but if required the ARC - * will call the registered prune callbacks causing dentry and inodes to - * be dropped from the VFS cache. This will make dnode meta data buffers - * available for reclaim. - */ -static uint64_t -arc_adjust_meta_balanced(uint64_t meta_used) -{ - int64_t delta, prune = 0, adjustmnt; - uint64_t total_evicted = 0; - arc_buf_contents_t type = ARC_BUFC_DATA; - int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); - -restart: - /* - * This slightly differs than the way we evict from the mru in - * arc_adjust because we don't have a "target" value (i.e. no - * "meta" arc_p). As a result, I think we can completely - * cannibalize the metadata in the MRU before we evict the - * metadata from the MFU. I think we probably need to implement a - * "metadata arc_p" value to do this properly. - */ - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), - adjustmnt); - total_evicted += arc_adjust_impl(arc_mru, 0, delta, type); - adjustmnt -= delta; - } - - /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). 
As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. - */ - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), - adjustmnt); - total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type); - } - - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); - total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; - } - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); - total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type); - } - - /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. - */ - if (meta_used > arc_meta_limit) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_prune_async(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } - return (total_evicted); -} - -/* - * Evict metadata buffers from the cache, such that arc_meta_used is - * capped by the arc_meta_limit tunable. - */ -static uint64_t -arc_adjust_meta_only(uint64_t meta_used) -{ - uint64_t total_evicted = 0; - int64_t target; - - /* - * If we're over the meta limit, we want to evict enough - * metadata to get back under the meta limit. We don't want to - * evict so much that we drop the MRU below arc_p, though. If - * we're over the meta limit more than we're over arc_p, we - * evict some from the MRU here, and some from the MFU below. - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - - total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - - /* - * Similar to the above, we want to evict enough bytes to get us - * below the meta limit, but not so much as to drop us below the - * space allotted to the MFU (which is defined as arc_c - arc_p). - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - - (arc_c - arc_p))); - - total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - -static uint64_t -arc_adjust_meta(uint64_t meta_used) -{ - if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) - return (arc_adjust_meta_only(meta_used)); - else - return (arc_adjust_meta_balanced(meta_used)); -} - -/* - * Return the type of the oldest buffer in the given arc state - * - * This function will select a random sublist of type ARC_BUFC_DATA and - * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist - * is compared, and the type which contains the "older" buffer will be - * returned. 
- */ -static arc_buf_contents_t -arc_adjust_type(arc_state_t *state) -{ - multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; - int data_idx = multilist_get_random_index(data_ml); - int meta_idx = multilist_get_random_index(meta_ml); - multilist_sublist_t *data_mls; - multilist_sublist_t *meta_mls; - arc_buf_contents_t type; - arc_buf_hdr_t *data_hdr; - arc_buf_hdr_t *meta_hdr; - - /* - * We keep the sublist lock until we're finished, to prevent - * the headers from being destroyed via arc_evict_state(). - */ - data_mls = multilist_sublist_lock(data_ml, data_idx); - meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - - /* - * These two loops are to ensure we skip any markers that - * might be at the tail of the lists due to arc_evict_state(). - */ - - for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; - data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { - if (data_hdr->b_spa != 0) - break; - } - - for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; - meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { - if (meta_hdr->b_spa != 0) - break; - } - - if (data_hdr == NULL && meta_hdr == NULL) { - type = ARC_BUFC_DATA; - } else if (data_hdr == NULL) { - ASSERT3P(meta_hdr, !=, NULL); - type = ARC_BUFC_METADATA; - } else if (meta_hdr == NULL) { - ASSERT3P(data_hdr, !=, NULL); - type = ARC_BUFC_DATA; - } else { - ASSERT3P(data_hdr, !=, NULL); - ASSERT3P(meta_hdr, !=, NULL); - - /* The headers can't be on the sublist without an L1 header */ - ASSERT(HDR_HAS_L1HDR(data_hdr)); - ASSERT(HDR_HAS_L1HDR(meta_hdr)); - - if (data_hdr->b_l1hdr.b_arc_access < - meta_hdr->b_l1hdr.b_arc_access) { - type = ARC_BUFC_DATA; - } else { - type = ARC_BUFC_METADATA; - } - } - - multilist_sublist_unlock(meta_mls); - multilist_sublist_unlock(data_mls); - - return (type); -} - -/* - * Evict buffers from the cache, such that arc_size is capped by arc_c. - */ -static uint64_t -arc_adjust(void) -{ - uint64_t total_evicted = 0; - uint64_t bytes; - int64_t target; - uint64_t asize = aggsum_value(&arc_size); - uint64_t ameta = aggsum_value(&arc_meta_used); - - /* - * If we're over arc_meta_limit, we want to correct that before - * potentially evicting data buffers below. - */ - total_evicted += arc_adjust_meta(ameta); - - /* - * Adjust MRU size - * - * If we're over the target cache size, we want to evict enough - * from the list to get back to our target size. We don't want - * to evict too much from the MRU, such that it drops below - * arc_p. So, if we're over our target cache size more than - * the MRU is over arc_p, we'll evict enough to get back to - * arc_p here, and then evict more from the MFU below. - */ - target = MIN((int64_t)(asize - arc_c), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); - - /* - * If we're below arc_meta_min, always prefer to evict data. - * Otherwise, try to satisfy the requested number of bytes to - * evict from the type which contains older buffers; in an - * effort to keep newer buffers in the cache regardless of their - * type. If we cannot satisfy the number of bytes from this - * type, spill over into the next type. - */ - if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. 
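The spill-over step described in the comment above is simply bounded eviction followed by retrying the unmet remainder against the other buffer type. A minimal userspace sketch of that arithmetic, using a hypothetical evict_bounded() in place of arc_adjust_impl() and made-up sizes:

    #include <stdint.h>
    #include <stdio.h>

    /* Evict at most 'target' bytes from a pool of 'avail' evictable bytes. */
    static uint64_t
    evict_bounded(uint64_t *avail, int64_t target)
    {
        if (target <= 0 || *avail == 0)
            return (0);
        uint64_t delta = (*avail < (uint64_t)target) ? *avail : (uint64_t)target;
        *avail -= delta;
        return (delta);
    }

    int
    main(void)
    {
        uint64_t meta = 10ULL << 20;    /* 10 MiB of evictable metadata */
        uint64_t data = 64ULL << 20;    /* 64 MiB of evictable data */
        int64_t target = 40LL << 20;    /* 40 MiB over the target size */

        /* First pass against the "older" type, then spill the remainder. */
        uint64_t evicted = evict_bounded(&meta, target);
        target -= evicted;
        evicted += evict_bounded(&data, target);

        printf("evicted %ju MiB total\n", (uintmax_t)(evicted >> 20));
        return (0);
    }

Because the bound clamps a non-positive target to zero evictions, the second call is harmless when the first pass already covered the full amount.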
- */ - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from metadata. - */ - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - } - - /* - * Re-sum ARC stats after the first round of evictions. - */ - asize = aggsum_value(&arc_size); - ameta = aggsum_value(&arc_meta_used); - - /* - * Adjust MFU size - * - * Now that we've tried to evict enough from the MRU to get its - * size back to arc_p, if we're still above the target cache - * size, we evict the rest from the MFU. - */ - target = asize - arc_c; - - if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - } - - /* - * Adjust ghost lists - * - * In addition to the above, the ARC also defines target values - * for the ghost lists. The sum of the mru list and mru ghost - * list should never exceed the target size of the cache, and - * the sum of the mru list, mfu list, mru ghost list, and mfu - * ghost list should never exceed twice the target size of the - * cache. The following logic enforces these limits on the ghost - * caches, and evicts from them as needed. - */ - target = zfs_refcount_count(&arc_mru->arcs_size) + - zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; - - bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); - - /* - * We assume the sum of the mru list and mfu list is less than - * or equal to arc_c (we enforced this above), which means we - * can use the simpler of the two equations below: - * - * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c - * mru ghost + mfu ghost <= arc_c - */ - target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + - zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; - - bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - target -= bytes; - - total_evicted += - arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - -void -arc_flush(spa_t *spa, boolean_t retry) -{ - uint64_t guid = 0; - - /* - * If retry is B_TRUE, a spa must not be specified since we have - * no good way to determine if all of a spa's buffers have been - * evicted from an arc state. 
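The ghost-list targets computed just above in arc_adjust() follow from two invariants: the mru list plus its ghost list must stay within arc_c, and the two ghost lists together must also stay within arc_c. A small worked example of those two subtractions with made-up sizes in MiB (the real code satisfies each target by evicting data first and then metadata via arc_adjust_impl()):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Made-up list sizes, in MiB, just to walk through the two targets. */
        int64_t arc_c = 1024;
        int64_t mru = 700, mru_ghost = 400, mfu_ghost = 800;

        /* The mru list plus its ghost list may not exceed arc_c. */
        int64_t target = mru + mru_ghost - arc_c;   /* 76 MiB over */
        if (target > 0)
            mru_ghost -= target;                    /* now 324 MiB */

        /* The two ghost lists together may not exceed arc_c either. */
        target = mru_ghost + mfu_ghost - arc_c;     /* 100 MiB over */
        if (target > 0)
            mfu_ghost -= target;                    /* now 700 MiB */

        printf("mru_ghost %jd MiB, mfu_ghost %jd MiB\n",
            (intmax_t)mru_ghost, (intmax_t)mfu_ghost);
        return (0);
    }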
- */ - ASSERT(!retry || spa == 0); - - if (spa != NULL) - guid = spa_load_guid(spa); - - (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); - - (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); - - (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); - - (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); - (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); -} - -static void -arc_reduce_target_size(int64_t to_free) -{ - uint64_t asize = aggsum_value(&arc_size); - if (arc_c > arc_c_min) { - DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, - arc_c_min, uint64_t, arc_p, uint64_t, to_free); - if (arc_c > arc_c_min + to_free) - atomic_add_64(&arc_c, -to_free); - else - arc_c = arc_c_min; - - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (asize < arc_c) - arc_c = MAX(asize, arc_c_min); - if (arc_p > arc_c) - arc_p = (arc_c >> 1); - - DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, - arc_p); - - ASSERT(arc_c >= arc_c_min); - ASSERT((int64_t)arc_p >= 0); - } - - if (asize > arc_c) { - DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize, - uint64_t, arc_c); - /* See comment in arc_adjust_cb_check() on why lock+flag */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - mutex_exit(&arc_adjust_lock); - zthr_wakeup(arc_adjust_zthr); - } -} - -typedef enum free_memory_reason_t { - FMR_UNKNOWN, - FMR_NEEDFREE, - FMR_LOTSFREE, - FMR_SWAPFS_MINFREE, - FMR_PAGES_PP_MAXIMUM, - FMR_HEAP_ARENA, - FMR_ZIO_ARENA, -} free_memory_reason_t; - -int64_t last_free_memory; -free_memory_reason_t last_free_reason; - -/* - * Additional reserve of pages for pp_reserve. - */ -int64_t arc_pages_pp_reserve = 64; - -/* - * Additional reserve of pages for swapfs. - */ -int64_t arc_swapfs_reserve = 64; - -/* - * Return the amount of memory that can be consumed before reclaim will be - * needed. Positive if there is sufficient free memory, negative indicates - * the amount of memory that needs to be freed up. - */ -static int64_t -arc_available_memory(void) -{ - int64_t lowest = INT64_MAX; - int64_t n; - free_memory_reason_t r = FMR_UNKNOWN; - -#ifdef _KERNEL -#ifdef __FreeBSD__ - /* - * Cooperate with pagedaemon when it's time for it to scan - * and reclaim some pages. - */ - n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); - if (n < lowest) { - lowest = n; - r = FMR_LOTSFREE; - } - -#else - if (needfree > 0) { - n = PAGESIZE * (-needfree); - if (n < lowest) { - lowest = n; - r = FMR_NEEDFREE; - } - } - - /* - * check that we're out of range of the pageout scanner. It starts to - * schedule paging if freemem is less than lotsfree and needfree. - * lotsfree is the high-water mark for pageout, and needfree is the - * number of needed free pages. We add extra pages here to make sure - * the scanner doesn't start up while we're freeing memory. - */ - n = PAGESIZE * (freemem - lotsfree - needfree - desfree); - if (n < lowest) { - lowest = n; - r = FMR_LOTSFREE; - } - - /* - * check to make sure that swapfs has enough space so that anon - * reservations can still succeed. anon_resvmem() checks that the - * availrmem is greater than swapfs_minfree, and the number of reserved - * swap pages. We also add a bit of extra here just to prevent - * circumstances from getting really dire. 
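Returning to arc_reduce_target_size() above: it shaves to_free off arc_c without going below arc_c_min, drops arc_p by 1/2^arc_shrink_shift of itself, and then re-clamps both against the current ARC size. A worked example of that arithmetic with assumed sizes and a shrink shift of 7:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX(a, b)   ((a) > (b) ? (a) : (b))

    int
    main(void)
    {
        uint64_t arc_c = 4096ULL << 20;     /* 4 GiB target size */
        uint64_t arc_c_min = 1024ULL << 20; /* 1 GiB floor */
        uint64_t arc_p = 2048ULL << 20;     /* MRU target */
        uint64_t asize = 3500ULL << 20;     /* current ARC size */
        int arc_shrink_shift = 7;
        uint64_t to_free = 132ULL << 20;

        if (arc_c > arc_c_min) {
            /* Shave to_free off arc_c, but never drop below arc_c_min. */
            arc_c = (arc_c > arc_c_min + to_free) ? arc_c - to_free : arc_c_min;
            /* Pull the MRU target down by 1/128th of itself. */
            arc_p -= arc_p >> arc_shrink_shift;
            /* If the cache is already smaller than the new target, track it. */
            if (asize < arc_c)
                arc_c = MAX(asize, arc_c_min);
            if (arc_p > arc_c)
                arc_p = arc_c >> 1;
        }
        /* Prints arc_c 3500 MiB, arc_p 2032 MiB with these inputs. */
        printf("arc_c %ju MiB, arc_p %ju MiB\n",
            (uintmax_t)(arc_c >> 20), (uintmax_t)(arc_p >> 20));
        return (0);
    }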
- */ - n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - - desfree - arc_swapfs_reserve); - if (n < lowest) { - lowest = n; - r = FMR_SWAPFS_MINFREE; - } - - - /* - * Check that we have enough availrmem that memory locking (e.g., via - * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum - * stores the number of pages that cannot be locked; when availrmem - * drops below pages_pp_maximum, page locking mechanisms such as - * page_pp_lock() will fail.) - */ - n = PAGESIZE * (availrmem - pages_pp_maximum - - arc_pages_pp_reserve); - if (n < lowest) { - lowest = n; - r = FMR_PAGES_PP_MAXIMUM; - } - -#endif /* __FreeBSD__ */ -#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) - /* - * If we're on an i386 platform, it's possible that we'll exhaust the - * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the heap_area compare against - * tune.t_minarmem, which is the minimum available real memory that we - * can have in the system. However, this is generally fixed at 25 pages - * which is so low that it's useless. In this comparison, we seek to - * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the calculation, if less than 1/4th is - * free) - */ - n = uma_avail() - (long)(uma_limit() / 4); - if (n < lowest) { - lowest = n; - r = FMR_HEAP_ARENA; - } -#endif - - /* - * If zio data pages are being allocated out of a separate heap segment, - * then enforce that the size of available vmem for this arena remains - * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. - * - * Note that reducing the arc_zio_arena_free_shift keeps more virtual - * memory (in the zio_arena) free, which can avoid memory - * fragmentation issues. - */ - if (zio_arena != NULL) { - n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - - (vmem_size(zio_arena, VMEM_ALLOC) >> - arc_zio_arena_free_shift); - if (n < lowest) { - lowest = n; - r = FMR_ZIO_ARENA; - } - } - -#else /* _KERNEL */ - /* Every 100 calls, free a small amount */ - if (spa_get_random(100) == 0) - lowest = -1024; -#endif /* _KERNEL */ - - last_free_memory = lowest; - last_free_reason = r; - DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); - return (lowest); -} - - -/* - * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of B_TRUE indicates that the system - * is under memory pressure and that the arc should adjust accordingly. - */ -static boolean_t -arc_reclaim_needed(void) -{ - return (arc_available_memory() < 0); -} - -extern kmem_cache_t *zio_buf_cache[]; -extern kmem_cache_t *zio_data_buf_cache[]; -extern kmem_cache_t *range_seg_cache; -extern kmem_cache_t *abd_chunk_cache; - -static __noinline void -arc_kmem_reap_soon(void) -{ - size_t i; - kmem_cache_t *prev_cache = NULL; - kmem_cache_t *prev_data_cache = NULL; - - DTRACE_PROBE(arc__kmem_reap_start); -#ifdef _KERNEL - if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) { - /* - * We are exceeding our meta-data cache limit. - * Purge some DNLC entries to release holds on meta-data. - */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - } -#if defined(__i386) - /* - * Reclaim unused memory from all kmem caches. 
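arc_available_memory() above is a sequence of independent limit checks that keep whichever result is most constrained, together with the reason, for the DTrace probe and the subsequent reap decision. A userspace sketch of that lowest-value-wins pattern, with invented page counts and an invented stand-in for the uma_avail()/uma_limit() heap check:

    #include <stdint.h>
    #include <stdio.h>

    typedef enum { FMR_UNKNOWN, FMR_LOTSFREE, FMR_HEAP_ARENA } fmr_t;

    int
    main(void)
    {
        int64_t lowest = INT64_MAX;
        fmr_t r = FMR_UNKNOWN;
        int64_t pagesize = 4096;

        /* Invented numbers: free pages vs. the pagedaemon's free target. */
        int64_t freemem = 20000, free_target = 25000;
        int64_t n = pagesize * (freemem - free_target);
        if (n < lowest) {
            lowest = n;
            r = FMR_LOTSFREE;
        }

        /*
         * Invented stand-in for the kernel heap check: keep at least a
         * quarter of the heap limit free.
         */
        int64_t heap_avail = 96LL << 20, heap_limit = 512LL << 20;
        n = heap_avail - heap_limit / 4;
        if (n < lowest) {
            lowest = n;
            r = FMR_HEAP_ARENA;
        }

        /* The heap check wins here: -32 MiB, reason FMR_HEAP_ARENA. */
        printf("available %jd bytes (reason %d)\n", (intmax_t)lowest, (int)r);
        return (0);
    }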
- */ - kmem_reap(); -#endif -#endif - - for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { - if (zio_buf_cache[i] != prev_cache) { - prev_cache = zio_buf_cache[i]; - kmem_cache_reap_soon(zio_buf_cache[i]); - } - if (zio_data_buf_cache[i] != prev_data_cache) { - prev_data_cache = zio_data_buf_cache[i]; - kmem_cache_reap_soon(zio_data_buf_cache[i]); - } - } - kmem_cache_reap_soon(abd_chunk_cache); - kmem_cache_reap_soon(buf_cache); - kmem_cache_reap_soon(hdr_full_cache); - kmem_cache_reap_soon(hdr_l2only_cache); - kmem_cache_reap_soon(range_seg_cache); - -#ifdef illumos - if (zio_arena != NULL) { - /* - * Ask the vmem arena to reclaim unused memory from its - * quantum caches. - */ - vmem_qcache_reap(zio_arena); - } -#endif - DTRACE_PROBE(arc__kmem_reap_end); -} - -/* ARGSUSED */ -static boolean_t -arc_adjust_cb_check(void *arg, zthr_t *zthr) -{ - /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date(the arc_adjust_zthr has a maximum sleep - * time of 1 second); but that should suffice. The - * arc_state_t structures can be queried directly if more - * accurate information is needed. - */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); - - /* - * We have to rely on arc_get_data_impl() to tell us when to adjust, - * rather than checking if we are overflowing here, so that we are - * sure to not leave arc_get_data_impl() waiting on - * arc_adjust_waiters_cv. If we have become "not overflowing" since - * arc_get_data_impl() checked, we need to wake it up. We could - * broadcast the CV here, but arc_get_data_impl() may have not yet - * gone to sleep. We would need to use a mutex to ensure that this - * function doesn't broadcast until arc_get_data_impl() has gone to - * sleep (e.g. the arc_adjust_lock). However, the lock ordering of - * such a lock would necessarily be incorrect with respect to the - * zthr_lock, which is held before this function is called, and is - * held by arc_get_data_impl() when it calls zthr_wakeup(). - */ - return (arc_adjust_needed); -} - -/* - * Keep arc_size under arc_c by running arc_adjust which evicts data - * from the ARC. */ -/* ARGSUSED */ -static void -arc_adjust_cb(void *arg, zthr_t *zthr) -{ - uint64_t evicted = 0; - - /* Evict from cache */ - evicted = arc_adjust(); - - /* - * If evicted is zero, we couldn't evict anything - * via arc_adjust(). This could be due to hash lock - * collisions, but more likely due to the majority of - * arc buffers being unevictable. Therefore, even if - * arc_size is above arc_c, another pass is unlikely to - * be helpful and could potentially cause us to enter an - * infinite loop. Additionally, zthr_iscancelled() is - * checked here so that if the arc is shutting down, the - * broadcast will wake any remaining arc adjust waiters. - */ - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) && - evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; - if (!arc_adjust_needed) { - /* - * We're either no longer overflowing, or we - * can't evict anything more, so we should wake - * up any waiters. 
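The condition at the end of arc_adjust_cb() is what keeps the adjust thread from spinning when the cache is over target but nothing is evictable: it only re-arms itself while a pass actually evicted something. A toy eviction loop showing the same guard, with a hypothetical evict_pass():

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t arc_size = 1100, arc_c = 1000;  /* arbitrary units */

    static uint64_t
    evict_pass(void)
    {
        /* Pretend most of the cache is unevictable: at most 30 units go. */
        uint64_t evictable = 30;
        uint64_t want = (arc_size > arc_c) ? arc_size - arc_c : 0;
        uint64_t got = (want < evictable) ? want : evictable;
        arc_size -= got;
        return (got);
    }

    int
    main(void)
    {
        bool adjust_needed = true;

        while (adjust_needed) {
            uint64_t evicted = evict_pass();
            /*
             * Re-arm only if something was evicted and we are still over
             * target; otherwise another pass cannot help and would spin.
             */
            adjust_needed = (evicted > 0 && arc_size > arc_c);
        }
        printf("arc_size %ju, arc_c %ju\n",
            (uintmax_t)arc_size, (uintmax_t)arc_c);
        return (0);
    }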
- */ - cv_broadcast(&arc_adjust_waiters_cv); - } - mutex_exit(&arc_adjust_lock); -} - -/* ARGSUSED */ -static boolean_t -arc_reap_cb_check(void *arg, zthr_t *zthr) -{ - int64_t free_memory = arc_available_memory(); - - /* - * If a kmem reap is already active, don't schedule more. We must - * check for this because kmem_cache_reap_soon() won't actually - * block on the cache being reaped (this is to prevent callers from - * becoming implicitly blocked by a system-wide kmem reap -- which, - * on a system with many, many full magazines, can take minutes). - */ - if (!kmem_cache_reap_active() && - free_memory < 0) { - arc_no_grow = B_TRUE; - arc_warm = B_TRUE; - /* - * Wait at least zfs_grow_retry (default 60) seconds - * before considering growing. - */ - arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); - return (B_TRUE); - } else if (free_memory < arc_c >> arc_no_grow_shift) { - arc_no_grow = B_TRUE; - } else if (gethrtime() >= arc_growtime) { - arc_no_grow = B_FALSE; - } - - return (B_FALSE); -} - -/* - * Keep enough free memory in the system by reaping the ARC's kmem - * caches. To cause more slabs to be reapable, we may reduce the - * target size of the cache (arc_c), causing the arc_adjust_cb() - * to free more buffers. - */ -/* ARGSUSED */ -static void -arc_reap_cb(void *arg, zthr_t *zthr) -{ - int64_t free_memory; - - /* - * Kick off asynchronous kmem_reap()'s of all our caches. - */ - arc_kmem_reap_soon(); - - /* - * Wait at least arc_kmem_cache_reap_retry_ms between - * arc_kmem_reap_soon() calls. Without this check it is possible to - * end up in a situation where we spend lots of time reaping - * caches, while we're near arc_c_min. Waiting here also gives the - * subsequent free memory check a chance of finding that the - * asynchronous reap has already freed enough memory, and we don't - * need to call arc_reduce_target_size(). - */ - delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); - - /* - * Reduce the target size as needed to maintain the amount of free - * memory in the system at a fraction of the arc_size (1/128th by - * default). If oversubscribed (free_memory < 0) then reduce the - * target arc_size by the deficit amount plus the fractional - * amount. If free memory is positive but less then the fractional - * amount, reduce by what is needed to hit the fractional amount. - */ - free_memory = arc_available_memory(); - - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { -#ifdef _KERNEL -#ifdef illumos - to_free = MAX(to_free, ptob(needfree)); -#endif -#endif - arc_reduce_target_size(to_free); - } -} - -static u_int arc_dnlc_evicts_arg; -extern struct vfsops zfs_vfsops; - -static void -arc_dnlc_evicts_thread(void *dummy __unused) -{ - callb_cpr_t cpr; - u_int percent; - - CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_dnlc_evicts_lock); - while (!arc_dnlc_evicts_thread_exit) { - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); - CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); - if (arc_dnlc_evicts_arg != 0) { - percent = arc_dnlc_evicts_arg; - mutex_exit(&arc_dnlc_evicts_lock); -#ifdef _KERNEL - vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); -#endif - mutex_enter(&arc_dnlc_evicts_lock); - /* - * Clear our token only after vnlru_free() - * pass is done, to avoid false queueing of - * the requests. 
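In arc_reap_cb() above, the amount to shave off the target is the free-memory deficit plus the 1/128th cushion (an arc_shrink_shift of 7 is the usual default, though it is tunable). A quick worked example of that to_free computation under those assumptions:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t arc_c = 4ULL << 30;            /* 4 GiB target size */
        int arc_shrink_shift = 7;               /* keep ~1/128th of arc_c free */
        int64_t free_memory = -(100LL << 20);   /* 100 MiB short of the goal */

        int64_t to_free = (int64_t)(arc_c >> arc_shrink_shift) - free_memory;
        /* 32 MiB cushion + 100 MiB deficit = 132 MiB to trim from arc_c. */
        printf("to_free = %jd MiB\n", (intmax_t)(to_free >> 20));
        return (0);
    }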
- */ - arc_dnlc_evicts_arg = 0; - } - } - arc_dnlc_evicts_thread_exit = FALSE; - cv_broadcast(&arc_dnlc_evicts_cv); - CALLB_CPR_EXIT(&cpr); - thread_exit(); -} - -void -dnlc_reduce_cache(void *arg) -{ - u_int percent; - - percent = (u_int)(uintptr_t)arg; - mutex_enter(&arc_dnlc_evicts_lock); - if (arc_dnlc_evicts_arg == 0) { - arc_dnlc_evicts_arg = percent; - cv_broadcast(&arc_dnlc_evicts_cv); - } - mutex_exit(&arc_dnlc_evicts_lock); -} - -/* - * Adapt arc info given the number of bytes we are trying to add and - * the state that we are comming from. This function is only called - * when we are adding new content to the cache. - */ -static void -arc_adapt(int bytes, arc_state_t *state) -{ - int mult; - uint64_t arc_p_min = (arc_c >> arc_p_min_shift); - int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); - int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - - if (state == arc_l2c_only) - return; - - ASSERT(bytes > 0); - /* - * Adapt the target size of the MRU list: - * - if we just hit in the MRU ghost list, then increase - * the target size of the MRU list. - * - if we just hit in the MFU ghost list, then increase - * the target size of the MFU list by decreasing the - * target size of the MRU list. - */ - if (state == arc_mru_ghost) { - mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); - mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - - arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); - } else if (state == arc_mfu_ghost) { - uint64_t delta; - - mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); - mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); - arc_p = MAX(arc_p_min, arc_p - delta); - } - ASSERT((int64_t)arc_p >= 0); - - /* - * Wake reap thread if we do not have any available memory - */ - if (arc_reclaim_needed()) { - zthr_wakeup(arc_reap_zthr); - return; - } - - if (arc_no_grow) - return; - - if (arc_c >= arc_c_max) - return; - - /* - * If we're within (2 * maxblocksize) bytes of the target - * cache size, increment the target cache size - */ - if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) > - 0) { - DTRACE_PROBE1(arc__inc_adapt, int, bytes); - atomic_add_64(&arc_c, (int64_t)bytes); - if (arc_c > arc_c_max) - arc_c = arc_c_max; - else if (state == arc_anon) - atomic_add_64(&arc_p, (int64_t)bytes); - if (arc_p > arc_c) - arc_p = arc_c; - } - ASSERT((int64_t)arc_p >= 0); -} - -/* - * Check if arc_size has grown past our upper threshold, determined by - * zfs_arc_overflow_shift. - */ -static boolean_t -arc_is_overflowing(void) -{ - /* Always allow at least one block of overflow */ - int64_t overflow = MAX(SPA_MAXBLOCKSIZE, - arc_c >> zfs_arc_overflow_shift); - - /* - * We just compare the lower bound here for performance reasons. Our - * primary goals are to make sure that the arc never grows without - * bound, and that it can reach its maximum size. This check - * accomplishes both goals. The maximum amount we could run over by is - * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block - * in the ARC. In practice, that's in the tens of MB, which is low - * enough to be safe. 
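The overflow allowance described above is the larger of one maximum-sized block and arc_c >> zfs_arc_overflow_shift. Plugging in assumed values (a 16 MiB SPA_MAXBLOCKSIZE and a shift of 8) for an 8 GiB arc_c:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX(a, b)   ((a) > (b) ? (a) : (b))

    int
    main(void)
    {
        uint64_t spa_maxblocksize = 16ULL << 20;    /* assumed 16 MiB */
        int overflow_shift = 8;                     /* assumed default */
        uint64_t arc_c = 8ULL << 30;
        uint64_t arc_size = (8ULL << 30) + (48ULL << 20);

        /* 8 GiB >> 8 = 32 MiB, which wins over the 16 MiB block floor. */
        uint64_t overflow = MAX(spa_maxblocksize, arc_c >> overflow_shift);
        int overflowing = (arc_size >= arc_c + overflow);

        printf("allowance %ju MiB, overflowing = %d\n",
            (uintmax_t)(overflow >> 20), overflowing);
        return (0);
    }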
- */ - return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow); -} - -static abd_t * -arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - arc_get_data_impl(hdr, size, tag, do_adapt); - if (type == ARC_BUFC_METADATA) { - return (abd_alloc(size, B_TRUE)); - } else { - ASSERT(type == ARC_BUFC_DATA); - return (abd_alloc(size, B_FALSE)); - } -} - -static void * -arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - arc_get_data_impl(hdr, size, tag, B_TRUE); - if (type == ARC_BUFC_METADATA) { - return (zio_buf_alloc(size)); - } else { - ASSERT(type == ARC_BUFC_DATA); - return (zio_data_buf_alloc(size)); - } -} - -/* - * Allocate a block and return it to the caller. If we are hitting the - * hard limit for the cache size, we must sleep, waiting for the eviction - * thread to catch up. If we're past the target size but below the hard - * limit, we'll only signal the reclaim thread and continue on. - */ -static void -arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, boolean_t do_adapt) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - if (do_adapt) - arc_adapt(size, state); - - /* - * If arc_size is currently overflowing, and has grown past our - * upper limit, we must be adding data faster than the evict - * thread can evict. Thus, to ensure we don't compound the - * problem by adding more data and forcing arc_size to grow even - * further past it's target size, we halt and wait for the - * eviction thread to catch up. - * - * It's also possible that the reclaim thread is unable to evict - * enough buffers to get arc_size below the overflow limit (e.g. - * due to buffers being un-evictable, or hash lock collisions). - * In this case, we want to proceed regardless if we're - * overflowing; thus we don't use a while loop here. - */ - if (arc_is_overflowing()) { - mutex_enter(&arc_adjust_lock); - - /* - * Now that we've acquired the lock, we may no longer be - * over the overflow limit, lets check. - * - * We're ignoring the case of spurious wake ups. If that - * were to happen, it'd let this thread consume an ARC - * buffer before it should have (i.e. before we're under - * the overflow limit and were signalled by the reclaim - * thread). As long as that is a rare occurrence, it - * shouldn't cause any harm. - */ - if (arc_is_overflowing()) { - arc_adjust_needed = B_TRUE; - zthr_wakeup(arc_adjust_zthr); - (void) cv_wait(&arc_adjust_waiters_cv, - &arc_adjust_lock); - } - mutex_exit(&arc_adjust_lock); - } - - VERIFY3U(hdr->b_type, ==, type); - if (type == ARC_BUFC_METADATA) { - arc_space_consume(size, ARC_SPACE_META); - } else { - arc_space_consume(size, ARC_SPACE_DATA); - } - - /* - * Update the state size. Note that ghost states have a - * "ghost size" and so don't need to be updated. - */ - if (!GHOST_STATE(state)) { - - (void) zfs_refcount_add_many(&state->arcs_size, size, tag); - - /* - * If this is reached via arc_read, the link is - * protected by the hash lock. If reached via - * arc_buf_alloc, the header should not be accessed by - * any other thread. And, if reached via arc_read_done, - * the hash lock will protect it if it's found in the - * hash table; otherwise no other thread should be - * trying to [add|remove]_reference it. 
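The throttle in arc_get_data_impl() above uses a cheap unlocked check, then re-checks under arc_adjust_lock before sleeping so that a wake-up from the adjust thread cannot be lost. A generic pthread rendering of that check/lock/re-check/wait shape (the kernel code uses mutex_enter(), cv_wait() and a zthr rather than pthreads):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t adjust_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t waiters_cv = PTHREAD_COND_INITIALIZER;
    static bool adjust_needed;
    static bool overflowing;    /* would be derived from arc_size vs. arc_c */

    static void
    wake_evict_thread(void)
    {
        /* Stand-in for zthr_wakeup(arc_adjust_zthr). */
    }

    static void
    throttle_if_overflowing(void)
    {
        if (!overflowing)       /* cheap unlocked check first */
            return;

        pthread_mutex_lock(&adjust_lock);
        if (overflowing) {      /* re-check now that the lock is held */
            adjust_needed = true;
            wake_evict_thread();
            pthread_cond_wait(&waiters_cv, &adjust_lock);
        }
        pthread_mutex_unlock(&adjust_lock);
    }

    int
    main(void)
    {
        throttle_if_overflowing();  /* returns at once: not overflowing */
        return (0);
    }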
- */ - if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - (void) zfs_refcount_add_many(&state->arcs_esize[type], - size, tag); - } - - /* - * If we are growing the cache, and we are adding anonymous - * data, and we have outgrown arc_p, update arc_p - */ - if (aggsum_upper_bound(&arc_size) < arc_c && - hdr->b_l1hdr.b_state == arc_anon && - (zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) - arc_p = MIN(arc_c, arc_p + size); - } - ARCSTAT_BUMP(arcstat_allocated); -} - -static void -arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) -{ - arc_free_data_impl(hdr, size, tag); - abd_free(abd); -} - -static void -arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) -{ - arc_buf_contents_t type = arc_buf_type(hdr); - - arc_free_data_impl(hdr, size, tag); - if (type == ARC_BUFC_METADATA) { - zio_buf_free(buf, size); - } else { - ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(buf, size); - } -} - -/* - * Free the arc data buffer. - */ -static void -arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) -{ - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - (void) zfs_refcount_remove_many(&state->arcs_esize[type], - size, tag); - } - (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); - - VERIFY3U(hdr->b_type, ==, type); - if (type == ARC_BUFC_METADATA) { - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_space_return(size, ARC_SPACE_DATA); - } -} - -/* - * This routine is called whenever a buffer is accessed. - * NOTE: the hash lock is dropped in this function. - */ -static void -arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) -{ - clock_t now; - - ASSERT(MUTEX_HELD(hash_lock)); - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (hdr->b_l1hdr.b_state == arc_anon) { - /* - * This buffer is not in the cache, and does not - * appear in our "ghost" list. Add the new buffer - * to the MRU state. - */ - - ASSERT0(hdr->b_l1hdr.b_arc_access); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mru, hdr, hash_lock); - - } else if (hdr->b_l1hdr.b_state == arc_mru) { - now = ddi_get_lbolt(); - - /* - * If this buffer is here because of a prefetch, then either: - * - clear the flag if this is a "referencing" read - * (any subsequent access will bump this into the MFU state). - * or - * - move the buffer to the head of the list if this is - * another prefetch (to make it less likely to be evicted). - */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - /* link protected by hash lock */ - ASSERT(multilist_link_active( - &hdr->b_l1hdr.b_arc_node)); - } else { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - ARCSTAT_BUMP(arcstat_mru_hits); - } - hdr->b_l1hdr.b_arc_access = now; - return; - } - - /* - * This buffer has been "accessed" only once so far, - * but it is still in the cache. Move it to the MFU - * state. - */ - if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { - /* - * More than 125ms have passed since we - * instantiated this buffer. 
Move it to the - * most frequently used state. - */ - hdr->b_l1hdr.b_arc_access = now; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); - } - atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); - ARCSTAT_BUMP(arcstat_mru_hits); - } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { - arc_state_t *new_state; - /* - * This buffer has been "accessed" recently, but - * was evicted from the cache. Move it to the - * MFU state. - */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - new_state = arc_mru; - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - } - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - } else { - new_state = arc_mfu; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, hdr, hash_lock); - - atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); - ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (hdr->b_l1hdr.b_state == arc_mfu) { - /* - * This buffer has been accessed more than once and is - * still in the cache. Keep it in the MFU state. - * - * NOTE: an add_reference() that occurred when we did - * the arc_read() will have kicked this off the list. - * If it was a prefetch, we will explicitly move it to - * the head of the list now. - */ - - atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); - ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { - arc_state_t *new_state = arc_mfu; - /* - * This buffer has been accessed more than once but has - * been evicted from the cache. Move it back to the - * MFU state. - */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - /* - * This is a prefetch access... - * move this block back to the MRU state. - */ - new_state = arc_mru; - } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(new_state, hdr, hash_lock); - - atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); - ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { - /* - * This buffer is on the 2nd Level ARC. - */ - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); - } else { - ASSERT(!"invalid arc state"); - } -} - -/* - * This routine is called by dbuf_hold() to update the arc_access() state - * which otherwise would be skipped for entries in the dbuf cache. - */ -void -arc_buf_access(arc_buf_t *buf) -{ - mutex_enter(&buf->b_evict_lock); - arc_buf_hdr_t *hdr = buf->b_hdr; - - /* - * Avoid taking the hash_lock when possible as an optimization. - * The header must be checked again under the hash_lock in order - * to handle the case where it is concurrently being released. 
- */ - if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { - mutex_exit(&buf->b_evict_lock); - ARCSTAT_BUMP(arcstat_access_skip); - return; - } - - kmutex_t *hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - - if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { - mutex_exit(hash_lock); - mutex_exit(&buf->b_evict_lock); - ARCSTAT_BUMP(arcstat_access_skip); - return; - } - - mutex_exit(&buf->b_evict_lock); - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); -} - -/* a generic arc_read_done_func_t which you can use */ -/* ARGSUSED */ -void -arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *arg) -{ - if (buf == NULL) - return; - - bcopy(buf->b_data, arg, arc_buf_size(buf)); - arc_buf_destroy(buf, arg); -} - -/* a generic arc_read_done_func_t */ -/* ARGSUSED */ -void -arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *arg) -{ - arc_buf_t **bufp = arg; - if (buf == NULL) { - ASSERT(zio == NULL || zio->io_error != 0); - *bufp = NULL; - } else { - ASSERT(zio == NULL || zio->io_error == 0); - *bufp = buf; - ASSERT(buf->b_data != NULL); - } -} - -static void -arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) -{ - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { - ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); - } else { - if (HDR_COMPRESSION_ENABLED(hdr)) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, - BP_GET_COMPRESS(bp)); - } - ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); - ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); - } -} - -static void -arc_read_done(zio_t *zio) -{ - arc_buf_hdr_t *hdr = zio->io_private; - kmutex_t *hash_lock = NULL; - arc_callback_t *callback_list; - arc_callback_t *acb; - boolean_t freeable = B_FALSE; - boolean_t no_zio_error = (zio->io_error == 0); - - /* - * The hdr was inserted into hash-table and removed from lists - * prior to starting I/O. We should find this header, since - * it's in the hash table, and it should be legit since it's - * not possible to evict it during the I/O. The only possible - * reason for it not to be found is if we were freed during the - * read. 
- */ - if (HDR_IN_HASH_TABLE(hdr)) { - ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); - ASSERT3U(hdr->b_dva.dva_word[0], ==, - BP_IDENTITY(zio->io_bp)->dva_word[0]); - ASSERT3U(hdr->b_dva.dva_word[1], ==, - BP_IDENTITY(zio->io_bp)->dva_word[1]); - - arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, - &hash_lock); - - ASSERT((found == hdr && - DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || - (found == hdr && HDR_L2_READING(hdr))); - ASSERT3P(hash_lock, !=, NULL); - } - - if (no_zio_error) { - /* byteswap if necessary */ - if (BP_SHOULD_BYTESWAP(zio->io_bp)) { - if (BP_GET_LEVEL(zio->io_bp) > 0) { - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; - } else { - hdr->b_l1hdr.b_byteswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); - } - } else { - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - } - } - - arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); - if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - - callback_list = hdr->b_l1hdr.b_acb; - ASSERT3P(callback_list, !=, NULL); - - if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - arc_access(hdr, hash_lock); - } - - /* - * If a read request has a callback (i.e. acb_done is not NULL), then we - * make a buf containing the data according to the parameters which were - * passed in. The implementation of arc_buf_alloc_impl() ensures that we - * aren't needlessly decompressing the data multiple times. - */ - int callback_cnt = 0; - for (acb = callback_list; acb != NULL; acb = acb->acb_next) { - if (!acb->acb_done) - continue; - - callback_cnt++; - - if (no_zio_error) { - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, zio->io_error == 0, - &acb->acb_buf); - if (error != 0) { - /* - * Decompression failed. Set io_error - * so that when we call acb_done (below), - * we will indicate that the read failed. - * Note that in the unusual case where one - * callback is compressed and another - * uncompressed, we will mark all of them - * as failed, even though the uncompressed - * one can't actually fail. In this case, - * the hdr will not be anonymous, because - * if there are multiple callbacks, it's - * because multiple threads found the same - * arc buf in the hash table. - */ - zio->io_error = error; - } - } - } - /* - * If there are multiple callbacks, we must have the hash lock, - * because the only way for multiple threads to find this hdr is - * in the hash table. This ensures that if there are multiple - * callbacks, the hdr is not anonymous. If it were anonymous, - * we couldn't use arc_buf_destroy() in the error case below. 
- */ - ASSERT(callback_cnt < 2 || hash_lock != NULL); - - hdr->b_l1hdr.b_acb = NULL; - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (callback_cnt == 0) { - ASSERT(HDR_PREFETCH(hdr)); - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - } - - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || - callback_list != NULL); - - if (no_zio_error) { - arc_hdr_verify(hdr, zio->io_bp); - } else { - arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); - if (hdr->b_l1hdr.b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); - if (HDR_IN_HASH_TABLE(hdr)) - buf_hash_remove(hdr); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); - } - - /* - * Broadcast before we drop the hash_lock to avoid the possibility - * that the hdr (and hence the cv) might be freed before we get to - * the cv_broadcast(). - */ - cv_broadcast(&hdr->b_l1hdr.b_cv); - - if (hash_lock != NULL) { - mutex_exit(hash_lock); - } else { - /* - * This block was freed while we waited for the read to - * complete. It has been removed from the hash table and - * moved to the anonymous state (so that it won't show up - * in the cache). - */ - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); - } - - /* execute each callback and free its structure */ - while ((acb = callback_list) != NULL) { - if (acb->acb_done != NULL) { - if (zio->io_error != 0 && acb->acb_buf != NULL) { - /* - * If arc_buf_alloc_impl() fails during - * decompression, the buf will still be - * allocated, and needs to be freed here. - */ - arc_buf_destroy(acb->acb_buf, acb->acb_private); - acb->acb_buf = NULL; - } - acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, - acb->acb_buf, acb->acb_private); - } - - if (acb->acb_zio_dummy != NULL) { - acb->acb_zio_dummy->io_error = zio->io_error; - zio_nowait(acb->acb_zio_dummy); - } - - callback_list = acb->acb_next; - kmem_free(acb, sizeof (arc_callback_t)); - } - - if (freeable) - arc_hdr_destroy(hdr); -} - -/* - * "Read" the block at the specified DVA (in bp) via the - * cache. If the block is found in the cache, invoke the provided - * callback immediately and return. Note that the `zio' parameter - * in the callback will be NULL in this case, since no IO was - * required. If the block is not in the cache pass the read request - * on to the spa with a substitute callback function, so that the - * requested block will be added to the cache. - * - * If a read request arrives for a block that has a read in-progress, - * either wait for the in-progress read to complete (and return the - * results); or, if this is a read with a "done" func, add a record - * to the read to invoke the "done" func when the read completes, - * and return; or just return. - * - * arc_read_done() will invoke all the requested "done" functions - * for readers of this block. - */ -int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, - arc_flags_t *arc_flags, const zbookmark_phys_t *zb) -{ - arc_buf_hdr_t *hdr = NULL; - kmutex_t *hash_lock = NULL; - zio_t *rzio; - uint64_t guid = spa_load_guid(spa); - boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; - int rc = 0; - - ASSERT(!BP_IS_EMBEDDED(bp) || - BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); - -top: - if (!BP_IS_EMBEDDED(bp)) { - /* - * Embedded BP's have no DVA and require no I/O to "read". - * Create an anonymous arc buf to back it. 
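arc_read(), as documented above, resolves into one of three paths: invoke the done function immediately on a cache hit, chain onto (or wait for) an in-flight read, or issue a new zio on a miss. A stripped-down dispatch sketch with hypothetical types, showing only the shape of that decision:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    typedef void read_done_fn(void *buf, void *arg);

    struct hdr {
        bool cached;            /* data already present in the cache */
        bool io_in_progress;    /* someone else's read is in flight */
        void *data;
    };

    static void
    demo_read(struct hdr *hdr, read_done_fn *done, void *arg)
    {
        if (hdr != NULL && hdr->io_in_progress)
            printf("queue the callback behind the in-flight read\n");
        else if (hdr != NULL && hdr->cached)
            done(hdr->data, arg);   /* hit: invoked now, no I/O issued */
        else
            printf("miss: allocate a header and issue a read\n");
    }

    static void
    print_done(void *buf, void *arg)
    {
        (void)arg;
        printf("hit: done callback ran immediately on %p\n", buf);
    }

    int
    main(void)
    {
        struct hdr h = { .cached = true, .io_in_progress = false, .data = &h };

        demo_read(&h, print_done, NULL);    /* cache hit */
        demo_read(NULL, print_done, NULL);  /* cache miss */
        return (0);
    }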
- */ - hdr = buf_hash_find(guid, bp, &hash_lock); - } - - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { - arc_buf_t *buf = NULL; - *arc_flags |= ARC_FLAG_CACHED; - - if (HDR_IO_IN_PROGRESS(hdr)) { - zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; - - ASSERT3P(head_zio, !=, NULL); - if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && - priority == ZIO_PRIORITY_SYNC_READ) { - /* - * This is a sync read that needs to wait for - * an in-flight async read. Request that the - * zio have its priority upgraded. - */ - zio_change_priority(head_zio, priority); - DTRACE_PROBE1(arc__async__upgrade__sync, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_async_upgrade_sync); - } - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); - mutex_exit(hash_lock); - goto top; - } - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - - if (done) { - arc_callback_t *acb = NULL; - - acb = kmem_zalloc(sizeof (arc_callback_t), - KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - acb->acb_compressed = compressed_read; - if (pio != NULL) - acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, NULL, zio_flags); - - ASSERT3P(acb->acb_done, !=, NULL); - acb->acb_zio_head = head_zio; - acb->acb_next = hdr->b_l1hdr.b_acb; - hdr->b_l1hdr.b_acb = acb; - mutex_exit(hash_lock); - return (0); - } - mutex_exit(hash_lock); - return (0); - } - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - if (done) { - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - /* - * This is a demand read which does not have to - * wait for i/o because we did a predictive - * prefetch i/o for it, which has completed. - */ - DTRACE_PROBE1( - arc__demand__hit__predictive__prefetch, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP( - arcstat_demand_hit_predictive_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { - ARCSTAT_BUMP( - arcstat_demand_hit_prescient_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PRESCIENT_PREFETCH); - } - - ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); - /* Get a buf with the desired data in it. 
*/ - rc = arc_buf_alloc_impl(hdr, private, - compressed_read, B_TRUE, &buf); - if (rc != 0) { - arc_buf_destroy(buf, private); - buf = NULL; - } - ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || - rc == 0 || rc != ENOENT); - } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - } - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); - - if (done) - done(NULL, zb, bp, buf, private); - } else { - uint64_t lsize = BP_GET_LSIZE(bp); - uint64_t psize = BP_GET_PSIZE(bp); - arc_callback_t *acb; - vdev_t *vd = NULL; - uint64_t addr = 0; - boolean_t devw = B_FALSE; - uint64_t size; - - if (hdr == NULL) { - /* this block is not in the cache */ - arc_buf_hdr_t *exists = NULL; - arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_GET_COMPRESS(bp), type); - - if (!BP_IS_EMBEDDED(bp)) { - hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - exists = buf_hash_insert(hdr, &hash_lock); - } - if (exists != NULL) { - /* somebody beat us to the hash insert */ - mutex_exit(hash_lock); - buf_discard_identity(hdr); - arc_hdr_destroy(hdr); - goto top; /* restart the IO request */ - } - } else { - /* - * This block is in the ghost cache. If it was L2-only - * (and thus didn't have an L1 hdr), we realloc the - * header to add an L1 hdr. - */ - if (!HDR_HAS_L1HDR(hdr)) { - hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, - hdr_full_cache); - } - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - - /* - * This is a delicate dance that we play here. - * This hdr is in the ghost list so we access it - * to move it out of the ghost list before we - * initiate the read. If it's a prefetch then - * it won't have a callback so we'll remove the - * reference that arc_buf_alloc_impl() created. We - * do this after we've called arc_access() to - * avoid hitting an assert in remove_reference(). - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); - arc_access(hdr, hash_lock); - arc_hdr_alloc_pabd(hdr, B_FALSE); - } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - size = arc_hdr_size(hdr); - - /* - * If compression is enabled on the hdr, then will do - * RAW I/O and will store the compressed data in the hdr's - * data block. Otherwise, the hdr's data block will contain - * the uncompressed data. 
- */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - zio_flags |= ZIO_FLAG_RAW; - } - - if (*arc_flags & ARC_FLAG_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - if (BP_GET_LEVEL(bp) > 0) - arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); - if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); - ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); - - acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - acb->acb_compressed = compressed_read; - - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - hdr->b_l1hdr.b_acb = acb; - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - - if (HDR_HAS_L2HDR(hdr) && - (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { - devw = hdr->b_l2hdr.b_dev->l2ad_writing; - addr = hdr->b_l2hdr.b_daddr; - /* - * Lock out L2ARC device removal. - */ - if (vdev_is_dead(vd) || - !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) - vd = NULL; - } - - /* - * We count both async reads and scrub IOs as asynchronous so - * that both can be upgraded in the event of a cache hit while - * the read IO is still in-flight. - */ - if (priority == ZIO_PRIORITY_ASYNC_READ || - priority == ZIO_PRIORITY_SCRUB) - arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - else - arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - - /* - * At this point, we have a level 1 cache miss. Try again in - * L2ARC if possible. - */ - ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); - - DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, lsize, zbookmark_phys_t *, zb); - ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, misses); -#ifdef _KERNEL -#ifdef RACCT - if (racct_enable) { - PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_READBPS, size); - racct_add_force(curproc, RACCT_READIOPS, 1); - PROC_UNLOCK(curproc); - } -#endif /* RACCT */ - curthread->td_ru.ru_inblock++; -#endif - - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { - /* - * Read from the L2ARC if the following are true: - * 1. The L2ARC vdev was previously cached. - * 2. This buffer still has L2ARC metadata. - * 3. This buffer isn't currently writing to the L2ARC. - * 4. The L2ARC entry wasn't evicted, which may - * also have invalidated the vdev. - * 5. This isn't prefetch and l2arc_noprefetch is set. - */ - if (HDR_HAS_L2HDR(hdr) && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { - l2arc_read_callback_t *cb; - abd_t *abd; - uint64_t asize; - - DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_hits); - atomic_inc_32(&hdr->b_l2hdr.b_hits); - - cb = kmem_zalloc(sizeof (l2arc_read_callback_t), - KM_SLEEP); - cb->l2rcb_hdr = hdr; - cb->l2rcb_bp = *bp; - cb->l2rcb_zb = *zb; - cb->l2rcb_flags = zio_flags; - - asize = vdev_psize_to_asize(vd, size); - if (asize != size) { - abd = abd_alloc_for_io(asize, - HDR_ISTYPE_METADATA(hdr)); - cb->l2rcb_abd = abd; - } else { - abd = hdr->b_l1hdr.b_pabd; - } - - ASSERT(addr >= VDEV_LABEL_START_SIZE && - addr + asize <= vd->vdev_psize - - VDEV_LABEL_END_SIZE); - - /* - * l2arc read. The SCL_L2ARC lock will be - * released by l2arc_read_done(). - * Issue a null zio if the underlying buffer - * was squashed to zero size by compression. 
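For the L2ARC read set up above, when the device's allocated size for the block differs from the header's size, the read is staged through a temporary buffer of asize instead of landing directly in the header's b_pabd. A small sketch of that choice, with a power-of-two roundup standing in for vdev_psize_to_asize() and made-up sizes:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        uint64_t ashift = 12;   /* assumed 4 KiB device allocation unit */
        uint64_t size = 5120;   /* physical (compressed) block size */
        /* Hypothetical roundup standing in for vdev_psize_to_asize(). */
        uint64_t asize = (size + (1ULL << ashift) - 1) & ~((1ULL << ashift) - 1);

        void *hdr_pabd = malloc(size);
        void *read_buf = (asize != size) ? malloc(asize) : hdr_pabd;

        printf("size %ju, asize %ju: %s\n", (uintmax_t)size, (uintmax_t)asize,
            (read_buf == hdr_pabd) ? "read in place" : "stage via temp buffer");

        if (read_buf != hdr_pabd)
            free(read_buf);
        free(hdr_pabd);
        return (0);
    }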
- */ - ASSERT3U(HDR_GET_COMPRESS(hdr), !=, - ZIO_COMPRESS_EMPTY); - rzio = zio_read_phys(pio, vd, addr, - asize, abd, - ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); - acb->acb_zio_head = rzio; - - if (hash_lock != NULL) - mutex_exit(hash_lock); - - DTRACE_PROBE2(l2arc__read, vdev_t *, vd, - zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, size); - - if (*arc_flags & ARC_FLAG_NOWAIT) { - zio_nowait(rzio); - return (0); - } - - ASSERT(*arc_flags & ARC_FLAG_WAIT); - if (zio_wait(rzio) == 0) - return (0); - - /* l2arc read error; goto zio_read() */ - if (hash_lock != NULL) - mutex_enter(hash_lock); - } else { - DTRACE_PROBE1(l2arc__miss, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_misses); - if (HDR_L2_WRITING(hdr)) - ARCSTAT_BUMP(arcstat_l2_rw_clash); - spa_config_exit(spa, SCL_L2ARC, vd); - } - } else { - if (vd != NULL) - spa_config_exit(spa, SCL_L2ARC, vd); - if (l2arc_ndev != 0) { - DTRACE_PROBE1(l2arc__miss, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_misses); - } - } - - rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, - arc_read_done, hdr, priority, zio_flags, zb); - acb->acb_zio_head = rzio; - - if (hash_lock != NULL) - mutex_exit(hash_lock); - - if (*arc_flags & ARC_FLAG_WAIT) - return (zio_wait(rzio)); - - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - zio_nowait(rzio); - } - return (0); -} - -arc_prune_t * -arc_add_prune_callback(arc_prune_func_t *func, void *private) -{ - arc_prune_t *p; - - p = kmem_alloc(sizeof (*p), KM_SLEEP); - p->p_pfunc = func; - p->p_private = private; - list_link_init(&p->p_node); - zfs_refcount_create(&p->p_refcnt); - - mutex_enter(&arc_prune_mtx); - zfs_refcount_add(&p->p_refcnt, &arc_prune_list); - list_insert_head(&arc_prune_list, p); - mutex_exit(&arc_prune_mtx); - - return (p); -} - -void -arc_remove_prune_callback(arc_prune_t *p) -{ - boolean_t wait = B_FALSE; - mutex_enter(&arc_prune_mtx); - list_remove(&arc_prune_list, p); - if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) - wait = B_TRUE; - mutex_exit(&arc_prune_mtx); - - /* wait for arc_prune_task to finish */ - if (wait) - taskq_wait(arc_prune_taskq); - ASSERT0(zfs_refcount_count(&p->p_refcnt)); - zfs_refcount_destroy(&p->p_refcnt); - kmem_free(p, sizeof (*p)); -} - -/* - * Notify the arc that a block was freed, and thus will never be used again. - */ -void -arc_freed(spa_t *spa, const blkptr_t *bp) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - uint64_t guid = spa_load_guid(spa); - - ASSERT(!BP_IS_EMBEDDED(bp)); - - hdr = buf_hash_find(guid, bp, &hash_lock); - if (hdr == NULL) - return; - - /* - * We might be trying to free a block that is still doing I/O - * (i.e. prefetch) or has a reference (i.e. a dedup-ed, - * dmu_sync-ed block). If this block is being prefetched, then it - * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr - * until the I/O completes. A block may also have a reference if it is - * part of a dedup-ed, dmu_synced write. The dmu_sync() function would - * have written the new block to its final resting place on disk but - * without the dedup flag set. This would have left the hdr in the MRU - * state and discoverable. When the txg finally syncs it detects that - * the block was overridden in open context and issues an override I/O. - * Since this is a dedup block, the override I/O will determine if the - * block is already in the DDT. 
If so, then it will replace the io_bp - * with the bp from the DDT and allow the I/O to finish. When the I/O - * reaches the done callback, dbuf_write_override_done, it will - * check to see if the io_bp and io_bp_override are identical. - * If they are not, then it indicates that the bp was replaced with - * the bp in the DDT and the override bp is freed. This allows - * us to arrive here with a reference on a block that is being - * freed. So if we have an I/O in progress, or a reference to - * this hdr, then we don't destroy the hdr. - */ - if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { - arc_change_state(arc_anon, hdr, hash_lock); - arc_hdr_destroy(hdr); - mutex_exit(hash_lock); - } else { - mutex_exit(hash_lock); - } - -} - -/* - * Release this buffer from the cache, making it an anonymous buffer. This - * must be done after a read and prior to modifying the buffer contents. - * If the buffer has more than one reference, we must make - * a new hdr for the buffer. - */ -void -arc_release(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - - /* - * It would be nice to assert that if it's DMU metadata (level > - * 0 || it's the dnode file), then it must be syncing context. - * But we don't know that information at this level. - */ - - mutex_enter(&buf->b_evict_lock); - - ASSERT(HDR_HAS_L1HDR(hdr)); - - /* - * We don't grab the hash lock prior to this check, because if - * the buffer's header is in the arc_anon state, it won't be - * linked into the hash table. - */ - if (hdr->b_l1hdr.b_state == arc_anon) { - mutex_exit(&buf->b_evict_lock); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(HDR_EMPTY(hdr)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - - hdr->b_l1hdr.b_arc_access = 0; - - /* - * If the buf is being overridden then it may already - * have a hdr that is not empty. - */ - buf_discard_identity(hdr); - arc_buf_thaw(buf); - - return; - } - - kmutex_t *hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - - /* - * This assignment is only valid as long as the hash_lock is - * held, we must be careful not to reference state or the - * b_state field after dropping the lock. - */ - arc_state_t *state = hdr->b_l1hdr.b_state; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(state, !=, arc_anon); - - /* this buffer is not on any list */ - ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); - - if (HDR_HAS_L2HDR(hdr)) { - mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); - - /* - * We have to recheck this conditional again now that - * we're holding the l2ad_mtx to prevent a race with - * another thread which might be concurrently calling - * l2arc_evict(). In that case, l2arc_evict() might have - * destroyed the header's L2 portion as we were waiting - * to acquire the l2ad_mtx. - */ - if (HDR_HAS_L2HDR(hdr)) { - l2arc_trim(hdr); - arc_hdr_l2hdr_destroy(hdr); - } - - mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); - } - - /* - * Do we have more than one buf? 
- */ - if (hdr->b_l1hdr.b_bufcnt > 1) { - arc_buf_hdr_t *nhdr; - uint64_t spa = hdr->b_spa; - uint64_t psize = HDR_GET_PSIZE(hdr); - uint64_t lsize = HDR_GET_LSIZE(hdr); - enum zio_compress compress = HDR_GET_COMPRESS(hdr); - arc_buf_contents_t type = arc_buf_type(hdr); - VERIFY3U(hdr->b_type, ==, type); - - ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); - (void) remove_reference(hdr, hash_lock, tag); - - if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { - ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - ASSERT(ARC_BUF_LAST(buf)); - } - - /* - * Pull the data off of this hdr and attach it to - * a new anonymous hdr. Also find the last buffer - * in the hdr's buffer list. - */ - arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); - ASSERT3P(lastbuf, !=, NULL); - - /* - * If the current arc_buf_t and the hdr are sharing their data - * buffer, then we must stop sharing that block. - */ - if (arc_buf_is_shared(buf)) { - VERIFY(!arc_buf_is_shared(lastbuf)); - - /* - * First, sever the block sharing relationship between - * buf and the arc_buf_hdr_t. - */ - arc_unshare_buf(hdr, buf); - - /* - * Now we need to recreate the hdr's b_pabd. Since we - * have lastbuf handy, we try to share with it, but if - * we can't then we allocate a new b_pabd and copy the - * data from buf into it. - */ - if (arc_can_share(hdr, lastbuf)) { - arc_share_buf(hdr, lastbuf); - } else { - arc_hdr_alloc_pabd(hdr, B_TRUE); - abd_copy_from_buf(hdr->b_l1hdr.b_pabd, - buf->b_data, psize); - } - VERIFY3P(lastbuf->b_data, !=, NULL); - } else if (HDR_SHARED_DATA(hdr)) { - /* - * Uncompressed shared buffers are always at the end - * of the list. Compressed buffers don't have the - * same requirements. This makes it hard to - * simply assert that the lastbuf is shared so - * we rely on the hdr's compression flags to determine - * if we have a compressed, shared buffer. - */ - ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!ARC_BUF_SHARED(buf)); - } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT3P(state, !=, arc_l2c_only); - - (void) zfs_refcount_remove_many(&state->arcs_size, - arc_buf_size(buf), buf); - - if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - ASSERT3P(state, !=, arc_l2c_only); - (void) zfs_refcount_remove_many( - &state->arcs_esize[type], - arc_buf_size(buf), buf); - } - - hdr->b_l1hdr.b_bufcnt -= 1; - arc_cksum_verify(buf); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - - mutex_exit(hash_lock); - - /* - * Allocate a new hdr. The new hdr will contain a b_pabd - * buffer which will be freed in arc_write(). 
- */ - nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); - ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(nhdr->b_l1hdr.b_bufcnt); - ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); - VERIFY3U(nhdr->b_type, ==, type); - ASSERT(!HDR_SHARED_DATA(nhdr)); - - nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_bufcnt = 1; - (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); - buf->b_hdr = nhdr; - - mutex_exit(&buf->b_evict_lock); - (void) zfs_refcount_add_many(&arc_anon->arcs_size, - arc_buf_size(buf), buf); - } else { - mutex_exit(&buf->b_evict_lock); - ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); - /* protected by hash lock, or hdr is on arc_anon */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - arc_change_state(arc_anon, hdr, hash_lock); - hdr->b_l1hdr.b_arc_access = 0; - mutex_exit(hash_lock); - - buf_discard_identity(hdr); - arc_buf_thaw(buf); - } -} - -int -arc_released(arc_buf_t *buf) -{ - int released; - - mutex_enter(&buf->b_evict_lock); - released = (buf->b_data != NULL && - buf->b_hdr->b_l1hdr.b_state == arc_anon); - mutex_exit(&buf->b_evict_lock); - return (released); -} - -#ifdef ZFS_DEBUG -int -arc_referenced(arc_buf_t *buf) -{ - int referenced; - - mutex_enter(&buf->b_evict_lock); - referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); - mutex_exit(&buf->b_evict_lock); - return (referenced); -} -#endif - -static void -arc_write_ready(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - arc_buf_hdr_t *hdr = buf->b_hdr; - uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - - /* - * If we're reexecuting this zio because the pool suspended, then - * cleanup any state that was previously set the first time the - * callback was invoked. - */ - if (zio->io_flags & ZIO_FLAG_REEXECUTED) { - arc_cksum_free(hdr); -#ifdef illumos - arc_buf_unwatch(buf); -#endif - if (hdr->b_l1hdr.b_pabd != NULL) { - if (arc_buf_is_shared(buf)) { - arc_unshare_buf(hdr, buf); - } else { - arc_hdr_free_pabd(hdr); - } - } - } - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT(!arc_buf_is_shared(buf)); - - callback->awcb_ready(zio, buf, callback->awcb_private); - - if (HDR_IO_IN_PROGRESS(hdr)) - ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - - arc_cksum_compute(buf); - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - - enum zio_compress compress; - if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { - compress = ZIO_COMPRESS_OFF; - } else { - ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); - compress = BP_GET_COMPRESS(zio->io_bp); - } - HDR_SET_PSIZE(hdr, psize); - arc_hdr_set_compress(hdr, compress); - - - /* - * Fill the hdr with data. If the hdr is compressed, the data we want - * is available from the zio, otherwise we can take it from the buf. - * - * We might be able to share the buf's data with the hdr here. However, - * doing so would cause the ARC to be full of linear ABDs if we write a - * lot of shareable data. As a compromise, we check whether scattered - * ABDs are allowed, and assume that if they are then the user wants - * the ARC to be primarily filled with them regardless of the data being - * written. Therefore, if they're allowed then we allocate one and copy - * the data into it; otherwise, we share the data directly if we can. 
- */ - if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { - arc_hdr_alloc_pabd(hdr, B_TRUE); - - /* - * Ideally, we would always copy the io_abd into b_pabd, but the - * user may have disabled compressed ARC, thus we must check the - * hdr's compression setting rather than the io_bp's. - */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, - ZIO_COMPRESS_OFF); - ASSERT3U(psize, >, 0); - - abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); - } else { - ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - - abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, - arc_buf_size(buf)); - } - } else { - ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); - ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - - arc_share_buf(hdr, buf); - } - - arc_hdr_verify(hdr, zio->io_bp); -} - -static void -arc_write_children_ready(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - - callback->awcb_children_ready(zio, buf, callback->awcb_private); -} - -/* - * The SPA calls this callback for each physical write that happens on behalf - * of a logical write. See the comment in dbuf_write_physdone() for details. - */ -static void -arc_write_physdone(zio_t *zio) -{ - arc_write_callback_t *cb = zio->io_private; - if (cb->awcb_physdone != NULL) - cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); -} - -static void -arc_write_done(zio_t *zio) -{ - arc_write_callback_t *callback = zio->io_private; - arc_buf_t *buf = callback->awcb_buf; - arc_buf_hdr_t *hdr = buf->b_hdr; - - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - - if (zio->io_error == 0) { - arc_hdr_verify(hdr, zio->io_bp); - - if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { - buf_discard_identity(hdr); - } else { - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - } - } else { - ASSERT(HDR_EMPTY(hdr)); - } - - /* - * If the block to be written was all-zero or compressed enough to be - * embedded in the BP, no write was performed so there will be no - * dva/birth/checksum. The buffer must therefore remain anonymous - * (and uncached). - */ - if (!HDR_EMPTY(hdr)) { - arc_buf_hdr_t *exists; - kmutex_t *hash_lock; - - ASSERT3U(zio->io_error, ==, 0); - - arc_cksum_verify(buf); - - exists = buf_hash_insert(hdr, &hash_lock); - if (exists != NULL) { - /* - * This can only happen if we overwrite for - * sync-to-convergence, because we remove - * buffers from the hash table when we arc_free(). 
- */ - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) - panic("bad overwrite, hdr=%p exists=%p", - (void *)hdr, (void *)exists); - ASSERT(zfs_refcount_is_zero( - &exists->b_l1hdr.b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(exists); - exists = buf_hash_insert(hdr, &hash_lock); - ASSERT3P(exists, ==, NULL); - } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { - /* nopwrite */ - ASSERT(zio->io_prop.zp_nopwrite); - if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) - panic("bad nopwrite, hdr=%p exists=%p", - (void *)hdr, (void *)exists); - } else { - /* Dedup */ - ASSERT(hdr->b_l1hdr.b_bufcnt == 1); - ASSERT(hdr->b_l1hdr.b_state == arc_anon); - ASSERT(BP_GET_DEDUP(zio->io_bp)); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - } - } - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - /* if it's not anon, we are doing a scrub */ - if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - } else { - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - } - - ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - callback->awcb_done(zio, buf, callback->awcb_private); - - abd_put(zio->io_abd); - kmem_free(callback, sizeof (arc_write_callback_t)); -} - -zio_t * -arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, - arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, - arc_write_done_func_t *done, void *private, zio_priority_t priority, - int zio_flags, const zbookmark_phys_t *zb) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_write_callback_t *callback; - zio_t *zio; - zio_prop_t localprop = *zp; - - ASSERT3P(ready, !=, NULL); - ASSERT3P(done, !=, NULL); - ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - if (l2arc) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - if (ARC_BUF_COMPRESSED(buf)) { - /* - * We're writing a pre-compressed buffer. Make the - * compression algorithm requested by the zio_prop_t match - * the pre-compressed buffer's compression algorithm. - */ - localprop.zp_compress = HDR_GET_COMPRESS(hdr); - - ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); - zio_flags |= ZIO_FLAG_RAW; - } - callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); - callback->awcb_ready = ready; - callback->awcb_children_ready = children_ready; - callback->awcb_physdone = physdone; - callback->awcb_done = done; - callback->awcb_private = private; - callback->awcb_buf = buf; - - /* - * The hdr's b_pabd is now stale, free it now. A new data block - * will be allocated when the zio pipeline calls arc_write_ready(). - */ - if (hdr->b_l1hdr.b_pabd != NULL) { - /* - * If the buf is currently sharing the data block with - * the hdr then we need to break that relationship here. - * The hdr will remain with a NULL data pointer and the - * buf will take sole ownership of the block. - */ - if (arc_buf_is_shared(buf)) { - arc_unshare_buf(hdr, buf); - } else { - arc_hdr_free_pabd(hdr); - } - VERIFY3P(buf->b_data, !=, NULL); - arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); - } - ASSERT(!arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - - zio = zio_write(pio, spa, txg, bp, - abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), - HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, - (children_ready != NULL) ? 
arc_write_children_ready : NULL, - arc_write_physdone, arc_write_done, callback, - priority, zio_flags, zb); - - return (zio); -} - -static int -arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) -{ -#ifdef _KERNEL - uint64_t available_memory = ptob(freemem); - -#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) - available_memory = MIN(available_memory, uma_avail()); -#endif - - if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) - return (0); - - if (txg > spa->spa_lowmem_last_txg) { - spa->spa_lowmem_last_txg = txg; - spa->spa_lowmem_page_load = 0; - } - /* - * If we are in pageout, we know that memory is already tight, - * the arc is already going to be evicting, so we just want to - * continue to let page writes occur as quickly as possible. - */ - if (curproc == pageproc) { - if (spa->spa_lowmem_page_load > - MAX(ptob(minfree), available_memory) / 4) - return (SET_ERROR(ERESTART)); - /* Note: reserve is inflated, so we deflate */ - atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); - return (0); - } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { - /* memory is low, delay before restarting */ - ARCSTAT_INCR(arcstat_memory_throttle_count, 1); - return (SET_ERROR(EAGAIN)); - } - spa->spa_lowmem_page_load = 0; -#endif /* _KERNEL */ - return (0); -} - -void -arc_tempreserve_clear(uint64_t reserve) -{ - atomic_add_64(&arc_tempreserve, -reserve); - ASSERT((int64_t)arc_tempreserve >= 0); -} - -int -arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) -{ - int error; - uint64_t anon_size; - - if (reserve > arc_c/4 && !arc_no_grow) { - arc_c = MIN(arc_c_max, reserve * 4); - DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); - } - if (reserve > arc_c) - return (SET_ERROR(ENOMEM)); - - /* - * Don't count loaned bufs as in flight dirty data to prevent long - * network delays from blocking transactions that are ready to be - * assigned to a txg. - */ - - /* assert that it has not wrapped around */ - ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); - - anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - - arc_loaned_bytes), 0); - - /* - * Writes will, almost always, require additional memory allocations - * in order to compress/encrypt/etc the data. We therefore need to - * make sure that there is sufficient available memory for this. - */ - error = arc_memory_throttle(spa, reserve, txg); - if (error != 0) - return (error); - - /* - * Throttle writes when the amount of dirty data in the cache - * gets too large. We try to keep the cache less than half full - * of dirty blocks so that our sync times don't grow too large. - * - * In the case of one pool being built on another pool, we want - * to make sure we don't end up throttling the lower (backing) - * pool when the upper pool is the majority contributor to dirty - * data. To insure we make forward progress during throttling, we - * also check the current pool's net dirty data and only throttle - * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty - * data in the cache. - * - * Note: if two requests come in concurrently, we might let them - * both succeed, when one of them should fail. Not a huge deal. 
- */ - uint64_t total_dirty = reserve + arc_tempreserve + anon_size; - uint64_t spa_dirty_anon = spa_dirty_data(spa); - - if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && - anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && - spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { - uint64_t meta_esize = - zfs_refcount_count( - &arc_anon->arcs_esize[ARC_BUFC_METADATA]); - uint64_t data_esize = - zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); - dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " - "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve >> 10, meta_esize >> 10, - data_esize >> 10, reserve >> 10, arc_c >> 10); - return (SET_ERROR(ERESTART)); - } - atomic_add_64(&arc_tempreserve, reserve); - return (0); -} - -static void -arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, - kstat_named_t *evict_data, kstat_named_t *evict_metadata) -{ - size->value.ui64 = zfs_refcount_count(&state->arcs_size); - evict_data->value.ui64 = - zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); - evict_metadata->value.ui64 = - zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); -} - -static int -arc_kstat_update(kstat_t *ksp, int rw) -{ - arc_stats_t *as = ksp->ks_data; - - if (rw == KSTAT_WRITE) { - return (EACCES); - } else { - arc_kstat_update_state(arc_anon, - &as->arcstat_anon_size, - &as->arcstat_anon_evictable_data, - &as->arcstat_anon_evictable_metadata); - arc_kstat_update_state(arc_mru, - &as->arcstat_mru_size, - &as->arcstat_mru_evictable_data, - &as->arcstat_mru_evictable_metadata); - arc_kstat_update_state(arc_mru_ghost, - &as->arcstat_mru_ghost_size, - &as->arcstat_mru_ghost_evictable_data, - &as->arcstat_mru_ghost_evictable_metadata); - arc_kstat_update_state(arc_mfu, - &as->arcstat_mfu_size, - &as->arcstat_mfu_evictable_data, - &as->arcstat_mfu_evictable_metadata); - arc_kstat_update_state(arc_mfu_ghost, - &as->arcstat_mfu_ghost_size, - &as->arcstat_mfu_ghost_evictable_data, - &as->arcstat_mfu_ghost_evictable_metadata); - - ARCSTAT(arcstat_size) = aggsum_value(&arc_size); - ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); - ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); - ARCSTAT(arcstat_metadata_size) = - aggsum_value(&astat_metadata_size); - ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); - ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size); - ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size); - ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size); -#if defined(__FreeBSD__) && defined(COMPAT_FREEBSD11) - ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) + - aggsum_value(&astat_dnode_size) + - aggsum_value(&astat_dbuf_size); -#endif - ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); - } - - return (0); -} - -/* - * This function *must* return indices evenly distributed between all - * sublists of the multilist. This is needed due to how the ARC eviction - * code is laid out; arc_evict_state() assumes ARC buffers are evenly - * distributed between all sublists and uses this assumption when - * deciding which sublist to evict from and how much to evict from it. - */ -unsigned int -arc_state_multilist_index_func(multilist_t *ml, void *obj) -{ - arc_buf_hdr_t *hdr = obj; - - /* - * We rely on b_dva to generate evenly distributed index - * numbers using buf_hash below. So, as an added precaution, - * let's make sure we never add empty buffers to the arc lists. 
- */ - ASSERT(!HDR_EMPTY(hdr)); - - /* - * The assumption here, is the hash value for a given - * arc_buf_hdr_t will remain constant throughout it's lifetime - * (i.e. it's b_spa, b_dva, and b_birth fields don't change). - * Thus, we don't need to store the header's sublist index - * on insertion, as this index can be recalculated on removal. - * - * Also, the low order bits of the hash value are thought to be - * distributed evenly. Otherwise, in the case that the multilist - * has a power of two number of sublists, each sublists' usage - * would not be evenly distributed. - */ - return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % - multilist_get_num_sublists(ml)); -} - -#ifdef _KERNEL -static eventhandler_tag arc_event_lowmem = NULL; - -static void -arc_lowmem(void *arg __unused, int howto __unused) -{ - int64_t free_memory, to_free; - - arc_no_grow = B_TRUE; - arc_warm = B_TRUE; - arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); - free_memory = arc_available_memory(); - to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); - DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); - arc_reduce_target_size(to_free); - - mutex_enter(&arc_adjust_lock); - arc_adjust_needed = B_TRUE; - zthr_wakeup(arc_adjust_zthr); - - /* - * It is unsafe to block here in arbitrary threads, because we can come - * here from ARC itself and may hold ARC locks and thus risk a deadlock - * with ARC reclaim thread. - */ - if (curproc == pageproc) - (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock); - mutex_exit(&arc_adjust_lock); -} -#endif - -static void -arc_state_init(void) -{ - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - - arc_mru->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - arc_l2c_only->arcs_list[ARC_BUFC_DATA] = - multilist_create(sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - - zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); - 
zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_create(&arc_anon->arcs_size); - zfs_refcount_create(&arc_mru->arcs_size); - zfs_refcount_create(&arc_mru_ghost->arcs_size); - zfs_refcount_create(&arc_mfu->arcs_size); - zfs_refcount_create(&arc_mfu_ghost->arcs_size); - zfs_refcount_create(&arc_l2c_only->arcs_size); - - aggsum_init(&arc_meta_used, 0); - aggsum_init(&arc_size, 0); - aggsum_init(&astat_data_size, 0); - aggsum_init(&astat_metadata_size, 0); - aggsum_init(&astat_hdr_size, 0); - aggsum_init(&astat_bonus_size, 0); - aggsum_init(&astat_dnode_size, 0); - aggsum_init(&astat_dbuf_size, 0); - aggsum_init(&astat_l2_hdr_size, 0); -} - -static void -arc_state_fini(void) -{ - zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); - zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_destroy(&arc_anon->arcs_size); - zfs_refcount_destroy(&arc_mru->arcs_size); - zfs_refcount_destroy(&arc_mru_ghost->arcs_size); - zfs_refcount_destroy(&arc_mfu->arcs_size); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); - zfs_refcount_destroy(&arc_l2c_only->arcs_size); - - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - - aggsum_fini(&arc_meta_used); - aggsum_fini(&arc_size); - aggsum_fini(&astat_data_size); - aggsum_fini(&astat_metadata_size); - aggsum_fini(&astat_hdr_size); - aggsum_fini(&astat_bonus_size); - aggsum_fini(&astat_dnode_size); - aggsum_fini(&astat_dbuf_size); - aggsum_fini(&astat_l2_hdr_size); -} - -uint64_t -arc_max_bytes(void) -{ - return (arc_c_max); -} - -void -arc_init(void) -{ - int i, prefetch_tunable_set = 0; - - /* - * allmem is "all memory that we could possibly use". 
- */ -#ifdef illumos -#ifdef _KERNEL - uint64_t allmem = ptob(physmem - swapfs_minfree); -#else - uint64_t allmem = (physmem * PAGESIZE) / 2; -#endif -#else - uint64_t allmem = kmem_size(); -#endif - mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); - - mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); - - /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ - arc_c_min = MAX(allmem / 32, arc_abs_min); - /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ - if (allmem >= 1 << 30) - arc_c_max = allmem - (1 << 30); - else - arc_c_max = arc_c_min; - arc_c_max = MAX(allmem * 5 / 8, arc_c_max); - - /* - * In userland, there's only the memory pressure that we artificially - * create (see arc_available_memory()). Don't let arc_c get too - * small, because it can cause transactions to be larger than - * arc_c, causing arc_tempreserve_space() to fail. - */ -#ifndef _KERNEL - arc_c_min = arc_c_max / 2; -#endif - -#ifdef _KERNEL - /* - * Allow the tunables to override our calculations if they are - * reasonable. - */ - if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { - arc_c_max = zfs_arc_max; - arc_c_min = MIN(arc_c_min, arc_c_max); - } - if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) - arc_c_min = zfs_arc_min; -#endif - - arc_c = arc_c_max; - arc_p = (arc_c >> 1); - - /* limit meta-data to 1/4 of the arc capacity */ - arc_meta_limit = arc_c_max / 4; - -#ifdef _KERNEL - /* - * Metadata is stored in the kernel's heap. Don't let us - * use more than half the heap for the ARC. - */ -#ifdef __FreeBSD__ - arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2); - arc_dnode_limit = arc_meta_limit / 10; -#else - arc_meta_limit = MIN(arc_meta_limit, - vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); -#endif -#endif - - /* Allow the tunable to override if it is reasonable */ - if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) - arc_meta_limit = zfs_arc_meta_limit; - - if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) - arc_c_min = arc_meta_limit / 2; - - if (zfs_arc_meta_min > 0) { - arc_meta_min = zfs_arc_meta_min; - } else { - arc_meta_min = arc_c_min / 2; - } - - /* Valid range: - */ - if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) && - (zfs_arc_dnode_limit >= zfs_arc_meta_min) && - (zfs_arc_dnode_limit <= arc_c_max)) - arc_dnode_limit = zfs_arc_dnode_limit; - - if (zfs_arc_grow_retry > 0) - arc_grow_retry = zfs_arc_grow_retry; - - if (zfs_arc_shrink_shift > 0) - arc_shrink_shift = zfs_arc_shrink_shift; - - if (zfs_arc_no_grow_shift > 0) - arc_no_grow_shift = zfs_arc_no_grow_shift; - /* - * Ensure that arc_no_grow_shift is less than arc_shrink_shift. - */ - if (arc_no_grow_shift >= arc_shrink_shift) - arc_no_grow_shift = arc_shrink_shift - 1; - - if (zfs_arc_p_min_shift > 0) - arc_p_min_shift = zfs_arc_p_min_shift; - - /* if kmem_flags are set, lets try to use less memory */ - if (kmem_debugging()) - arc_c = arc_c / 2; - if (arc_c < arc_c_min) - arc_c = arc_c_min; - - zfs_arc_min = arc_c_min; - zfs_arc_max = arc_c_max; - - arc_state_init(); - - /* - * The arc must be "uninitialized", so that hdr_recl() (which is - * registered by buf_init()) will not access arc_reap_zthr before - * it is created. 
- */ - ASSERT(!arc_initialized); - buf_init(); - - list_create(&arc_prune_list, sizeof (arc_prune_t), - offsetof(arc_prune_t, p_node)); - mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - - arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri, - max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - - arc_dnlc_evicts_thread_exit = FALSE; - - arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, - sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - - if (arc_ksp != NULL) { - arc_ksp->ks_data = &arc_stats; - arc_ksp->ks_update = arc_kstat_update; - kstat_install(arc_ksp); - } - - arc_adjust_zthr = zthr_create_timer(arc_adjust_cb_check, - arc_adjust_cb, NULL, SEC2NSEC(1)); - arc_reap_zthr = zthr_create_timer(arc_reap_cb_check, - arc_reap_cb, NULL, SEC2NSEC(1)); - -#ifdef _KERNEL - arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, - EVENTHANDLER_PRI_FIRST); -#endif - - (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); - - arc_initialized = B_TRUE; - arc_warm = B_FALSE; - - /* - * Calculate maximum amount of dirty data per pool. - * - * If it has been set by /etc/system, take that. - * Otherwise, use a percentage of physical memory defined by - * zfs_dirty_data_max_percent (default 10%) with a cap at - * zfs_dirty_data_max_max (default 4GB). - */ - if (zfs_dirty_data_max == 0) { - zfs_dirty_data_max = ptob(physmem) * - zfs_dirty_data_max_percent / 100; - zfs_dirty_data_max = MIN(zfs_dirty_data_max, - zfs_dirty_data_max_max); - } - -#ifdef _KERNEL - if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) - prefetch_tunable_set = 1; - -#ifdef __i386__ - if (prefetch_tunable_set == 0) { - printf("ZFS NOTICE: Prefetch is disabled by default on i386 " - "-- to enable,\n"); - printf(" add \"vfs.zfs.prefetch_disable=0\" " - "to /boot/loader.conf.\n"); - zfs_prefetch_disable = 1; - } -#else - if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && - prefetch_tunable_set == 0) { - printf("ZFS NOTICE: Prefetch is disabled by default if less " - "than 4GB of RAM is present;\n" - " to enable, add \"vfs.zfs.prefetch_disable=0\" " - "to /boot/loader.conf.\n"); - zfs_prefetch_disable = 1; - } -#endif - /* Warn about ZFS memory and address space requirements. */ - if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { - printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " - "expect unstable behavior.\n"); - } - if (allmem < 512 * (1 << 20)) { - printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " - "expect unstable behavior.\n"); - printf(" Consider tuning vm.kmem_size and " - "vm.kmem_size_max\n"); - printf(" in /boot/loader.conf.\n"); - } -#endif -} - -void -arc_fini(void) -{ - arc_prune_t *p; - -#ifdef _KERNEL - if (arc_event_lowmem != NULL) - EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); -#endif - - /* Use B_TRUE to ensure *all* buffers are evicted */ - arc_flush(NULL, B_TRUE); - - mutex_enter(&arc_dnlc_evicts_lock); - arc_dnlc_evicts_thread_exit = TRUE; - /* - * The user evicts thread will set arc_user_evicts_thread_exit - * to FALSE when it is finished exiting; we're waiting for that. 
- */ - while (arc_dnlc_evicts_thread_exit) { - cv_signal(&arc_dnlc_evicts_cv); - cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); - } - mutex_exit(&arc_dnlc_evicts_lock); - - arc_initialized = B_FALSE; - - if (arc_ksp != NULL) { - kstat_delete(arc_ksp); - arc_ksp = NULL; - } - - taskq_wait(arc_prune_taskq); - taskq_destroy(arc_prune_taskq); - - mutex_enter(&arc_prune_mtx); - while ((p = list_head(&arc_prune_list)) != NULL) { - list_remove(&arc_prune_list, p); - zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); - zfs_refcount_destroy(&p->p_refcnt); - kmem_free(p, sizeof (*p)); - } - mutex_exit(&arc_prune_mtx); - - list_destroy(&arc_prune_list); - mutex_destroy(&arc_prune_mtx); - - (void) zthr_cancel(arc_adjust_zthr); - zthr_destroy(arc_adjust_zthr); - - mutex_destroy(&arc_dnlc_evicts_lock); - cv_destroy(&arc_dnlc_evicts_cv); - - (void) zthr_cancel(arc_reap_zthr); - zthr_destroy(arc_reap_zthr); - - mutex_destroy(&arc_adjust_lock); - cv_destroy(&arc_adjust_waiters_cv); - - /* - * buf_fini() must proceed arc_state_fini() because buf_fin() may - * trigger the release of kmem magazines, which can callback to - * arc_space_return() which accesses aggsums freed in act_state_fini(). - */ - buf_fini(); - arc_state_fini(); - - ASSERT0(arc_loaned_bytes); -} - -/* - * Level 2 ARC - * - * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. - * It uses dedicated storage devices to hold cached data, which are populated - * using large infrequent writes. The main role of this cache is to boost - * the performance of random read workloads. The intended L2ARC devices - * include short-stroked disks, solid state disks, and other media with - * substantially faster read latency than disk. - * - * +-----------------------+ - * | ARC | - * +-----------------------+ - * | ^ ^ - * | | | - * l2arc_feed_thread() arc_read() - * | | | - * | l2arc read | - * V | | - * +---------------+ | - * | L2ARC | | - * +---------------+ | - * | ^ | - * l2arc_write() | | - * | | | - * V | | - * +-------+ +-------+ - * | vdev | | vdev | - * | cache | | cache | - * +-------+ +-------+ - * +=========+ .-----. - * : L2ARC : |-_____-| - * : devices : | Disks | - * +=========+ `-_____-' - * - * Read requests are satisfied from the following sources, in order: - * - * 1) ARC - * 2) vdev cache of L2ARC devices - * 3) L2ARC devices - * 4) vdev cache of disks - * 5) disks - * - * Some L2ARC device types exhibit extremely slow write performance. - * To accommodate for this there are some significant differences between - * the L2ARC and traditional cache design: - * - * 1. There is no eviction path from the ARC to the L2ARC. Evictions from - * the ARC behave as usual, freeing buffers and placing headers on ghost - * lists. The ARC does not send buffers to the L2ARC during eviction as - * this would add inflated write latencies for all ARC memory pressure. - * - * 2. The L2ARC attempts to cache data from the ARC before it is evicted. - * It does this by periodically scanning buffers from the eviction-end of - * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are - * not already there. It scans until a headroom of buffers is satisfied, - * which itself is a buffer for ARC eviction. If a compressible buffer is - * found during scanning and selected for writing to an L2ARC device, we - * temporarily boost scanning headroom during the next scan cycle to make - * sure we adapt to compression effects (which might significantly reduce - * the data volume we write to L2ARC). 
The thread that does this is - * l2arc_feed_thread(), illustrated below; example sizes are included to - * provide a better sense of ratio than this diagram: - * - * head --> tail - * +---------------------+----------+ - * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC - * +---------------------+----------+ | o L2ARC eligible - * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer - * +---------------------+----------+ | - * 15.9 Gbytes ^ 32 Mbytes | - * headroom | - * l2arc_feed_thread() - * | - * l2arc write hand <--[oooo]--' - * | 8 Mbyte - * | write max - * V - * +==============================+ - * L2ARC dev |####|#|###|###| |####| ... | - * +==============================+ - * 32 Gbytes - * - * 3. If an ARC buffer is copied to the L2ARC but then hit instead of - * evicted, then the L2ARC has cached a buffer much sooner than it probably - * needed to, potentially wasting L2ARC device bandwidth and storage. It is - * safe to say that this is an uncommon case, since buffers at the end of - * the ARC lists have moved there due to inactivity. - * - * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, - * then the L2ARC simply misses copying some buffers. This serves as a - * pressure valve to prevent heavy read workloads from both stalling the ARC - * with waits and clogging the L2ARC with writes. This also helps prevent - * the potential for the L2ARC to churn if it attempts to cache content too - * quickly, such as during backups of the entire pool. - * - * 5. After system boot and before the ARC has filled main memory, there are - * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru - * lists can remain mostly static. Instead of searching from tail of these - * lists as pictured, the l2arc_feed_thread() will search from the list heads - * for eligible buffers, greatly increasing its chance of finding them. - * - * The L2ARC device write speed is also boosted during this time so that - * the L2ARC warms up faster. Since there have been no ARC evictions yet, - * there are no L2ARC reads, and no fear of degrading read performance - * through increased writes. - * - * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that - * the vdev queue can aggregate them into larger and fewer writes. Each - * device is written to in a rotor fashion, sweeping writes through - * available space then repeating. - * - * 7. The L2ARC does not store dirty content. It never needs to flush - * write buffers back to disk based storage. - * - * 8. If an ARC buffer is written (and dirtied) which also exists in the - * L2ARC, the now stale L2ARC buffer is immediately dropped. - * - * The performance of the L2ARC can be tweaked by a number of tunables, which - * may be necessary for different workloads: - * - * l2arc_write_max max write bytes per interval - * l2arc_write_boost extra write bytes during device warmup - * l2arc_noprefetch skip caching prefetched buffers - * l2arc_headroom number of max device writes to precache - * l2arc_headroom_boost when we find compressed buffers during ARC - * scanning, we multiply headroom by this - * percentage factor for the next scan cycle, - * since more compressed buffers are likely to - * be present - * l2arc_feed_secs seconds between L2ARC writing - * - * Tunables may be removed or added as future performance improvements are - * integrated, and also may become zpool properties. 
- * - * There are three key functions that control how the L2ARC warms up: - * - * l2arc_write_eligible() check if a buffer is eligible to cache - * l2arc_write_size() calculate how much to write - * l2arc_write_interval() calculate sleep delay between writes - * - * These three functions determine what to write, how much, and how quickly - * to send writes. - */ - -static boolean_t -l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) -{ - /* - * A buffer is *not* eligible for the L2ARC if it: - * 1. belongs to a different spa. - * 2. is already cached on the L2ARC. - * 3. has an I/O in progress (it may be an incomplete read). - * 4. is flagged not eligible (zfs property). - */ - if (hdr->b_spa != spa_guid) { - ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); - return (B_FALSE); - } - if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_BUMP(arcstat_l2_write_in_l2); - return (B_FALSE); - } - if (HDR_IO_IN_PROGRESS(hdr)) { - ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); - return (B_FALSE); - } - if (!HDR_L2CACHE(hdr)) { - ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); - return (B_FALSE); - } - - return (B_TRUE); -} - -static uint64_t -l2arc_write_size(void) -{ - uint64_t size; - - /* - * Make sure our globals have meaningful values in case the user - * altered them. - */ - size = l2arc_write_max; - if (size == 0) { - cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " - "be greater than zero, resetting it to the default (%d)", - L2ARC_WRITE_SIZE); - size = l2arc_write_max = L2ARC_WRITE_SIZE; - } - - if (arc_warm == B_FALSE) - size += l2arc_write_boost; - - return (size); - -} - -static clock_t -l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) -{ - clock_t interval, next, now; - - /* - * If the ARC lists are busy, increase our write rate; if the - * lists are stale, idle back. This is achieved by checking - * how much we previously wrote - if it was more than half of - * what we wanted, schedule the next write much sooner. - */ - if (l2arc_feed_again && wrote > (wanted / 2)) - interval = (hz * l2arc_feed_min_ms) / 1000; - else - interval = hz * l2arc_feed_secs; - - now = ddi_get_lbolt(); - next = MAX(now, MIN(now + interval, began + interval)); - - return (next); -} - -/* - * Cycle through L2ARC devices. This is how L2ARC load balances. - * If a device is returned, this also returns holding the spa config lock. - */ -static l2arc_dev_t * -l2arc_dev_get_next(void) -{ - l2arc_dev_t *first, *next = NULL; - - /* - * Lock out the removal of spas (spa_namespace_lock), then removal - * of cache devices (l2arc_dev_mtx). Once a device has been selected, - * both locks will be dropped and a spa config lock held instead. 
- */ - mutex_enter(&spa_namespace_lock); - mutex_enter(&l2arc_dev_mtx); - - /* if there are no vdevs, there is nothing to do */ - if (l2arc_ndev == 0) - goto out; - - first = NULL; - next = l2arc_dev_last; - do { - /* loop around the list looking for a non-faulted vdev */ - if (next == NULL) { - next = list_head(l2arc_dev_list); - } else { - next = list_next(l2arc_dev_list, next); - if (next == NULL) - next = list_head(l2arc_dev_list); - } - - /* if we have come back to the start, bail out */ - if (first == NULL) - first = next; - else if (next == first) - break; - - } while (vdev_is_dead(next->l2ad_vdev)); - - /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev)) - next = NULL; - - l2arc_dev_last = next; - -out: - mutex_exit(&l2arc_dev_mtx); - - /* - * Grab the config lock to prevent the 'next' device from being - * removed while we are writing to it. - */ - if (next != NULL) - spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); - mutex_exit(&spa_namespace_lock); - - return (next); -} - -/* - * Free buffers that were tagged for destruction. - */ -static void -l2arc_do_free_on_write() -{ - list_t *buflist; - l2arc_data_free_t *df, *df_prev; - - mutex_enter(&l2arc_free_on_write_mtx); - buflist = l2arc_free_on_write; - - for (df = list_tail(buflist); df; df = df_prev) { - df_prev = list_prev(buflist, df); - ASSERT3P(df->l2df_abd, !=, NULL); - abd_free(df->l2df_abd); - list_remove(buflist, df); - kmem_free(df, sizeof (l2arc_data_free_t)); - } - - mutex_exit(&l2arc_free_on_write_mtx); -} - -/* - * A write to a cache device has completed. Update all headers to allow - * reads from these buffers to begin. - */ -static void -l2arc_write_done(zio_t *zio) -{ - l2arc_write_callback_t *cb; - l2arc_dev_t *dev; - list_t *buflist; - arc_buf_hdr_t *head, *hdr, *hdr_prev; - kmutex_t *hash_lock; - int64_t bytes_dropped = 0; - - cb = zio->io_private; - ASSERT3P(cb, !=, NULL); - dev = cb->l2wcb_dev; - ASSERT3P(dev, !=, NULL); - head = cb->l2wcb_head; - ASSERT3P(head, !=, NULL); - buflist = &dev->l2ad_buflist; - ASSERT3P(buflist, !=, NULL); - DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, - l2arc_write_callback_t *, cb); - - if (zio->io_error != 0) - ARCSTAT_BUMP(arcstat_l2_writes_error); - - /* - * All writes completed, or an error was hit. - */ -top: - mutex_enter(&dev->l2ad_mtx); - for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(buflist, hdr); - - hash_lock = HDR_LOCK(hdr); - - /* - * We cannot use mutex_enter or else we can deadlock - * with l2arc_write_buffers (due to swapping the order - * the hash lock and l2ad_mtx are taken). - */ - if (!mutex_tryenter(hash_lock)) { - /* - * Missed the hash lock. We must retry so we - * don't leave the ARC_FLAG_L2_WRITING bit set. - */ - ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); - - /* - * We don't want to rescan the headers we've - * already marked as having been written out, so - * we reinsert the head node so we can pick up - * where we left off. - */ - list_remove(buflist, head); - list_insert_after(buflist, hdr, head); - - mutex_exit(&dev->l2ad_mtx); - - /* - * We wait for the hash lock to become available - * to try and prevent busy waiting, and increase - * the chance we'll be able to acquire the lock - * the next time around. - */ - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } - - /* - * We could not have been moved into the arc_l2c_only - * state while in-flight due to our ARC_FLAG_L2_WRITING - * bit being set. Let's just ensure that's being enforced. 
- */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - if (zio->io_error != 0) { - /* - * Error - drop L2ARC entry. - */ - list_remove(buflist, hdr); - l2arc_trim(hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); - - ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr)); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - - bytes_dropped += arc_hdr_size(hdr); - (void) zfs_refcount_remove_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - } - - /* - * Allow ARC to begin reads and ghost list evictions to - * this L2ARC entry. - */ - arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); - - mutex_exit(hash_lock); - } - - atomic_inc_64(&l2arc_writes_done); - list_remove(buflist, head); - ASSERT(!HDR_HAS_L1HDR(head)); - kmem_cache_free(hdr_l2only_cache, head); - mutex_exit(&dev->l2ad_mtx); - - vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); - - l2arc_do_free_on_write(); - - kmem_free(cb, sizeof (l2arc_write_callback_t)); -} - -/* - * A read to a cache device completed. Validate buffer contents before - * handing over to the regular ARC routines. - */ -static void -l2arc_read_done(zio_t *zio) -{ - l2arc_read_callback_t *cb; - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - boolean_t valid_cksum; - - ASSERT3P(zio->io_vd, !=, NULL); - ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); - - spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); - - cb = zio->io_private; - ASSERT3P(cb, !=, NULL); - hdr = cb->l2rcb_hdr; - ASSERT3P(hdr, !=, NULL); - - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - /* - * If the data was read into a temporary buffer, - * move it and free the buffer. - */ - if (cb->l2rcb_abd != NULL) { - ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); - if (zio->io_error == 0) { - abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, - arc_hdr_size(hdr)); - } - - /* - * The following must be done regardless of whether - * there was an error: - * - free the temporary buffer - * - point zio to the real ARC buffer - * - set zio size accordingly - * These are required because zio is either re-used for - * an I/O of the block in the case of the error - * or the zio is passed to arc_read_done() and it - * needs real data. - */ - abd_free(cb->l2rcb_abd); - zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); - zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; - } - - ASSERT3P(zio->io_abd, !=, NULL); - - /* - * Check this survived the L2ARC journey. - */ - ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ - - valid_cksum = arc_cksum_is_equal(hdr, zio); - if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { - mutex_exit(hash_lock); - zio->io_private = hdr; - arc_read_done(zio); - } else { - /* - * Buffer didn't survive caching. Increment stats and - * reissue to the original storage device. - */ - if (zio->io_error != 0) { - ARCSTAT_BUMP(arcstat_l2_io_error); - } else { - zio->io_error = SET_ERROR(EIO); - } - if (!valid_cksum) - ARCSTAT_BUMP(arcstat_l2_cksum_bad); - - /* - * If there's no waiter, issue an async i/o to the primary - * storage now. If there *is* a waiter, the caller must - * issue the i/o in a context where it's OK to block. 
- */ - if (zio->io_waiter == NULL) { - zio_t *pio = zio_unique_parent(zio); - - ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - - zio = zio_read(pio, zio->io_spa, zio->io_bp, - hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, - hdr, zio->io_priority, cb->l2rcb_flags, - &cb->l2rcb_zb); - for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; - acb != NULL; acb = acb->acb_next) - acb->acb_zio_head = zio; - mutex_exit(hash_lock); - zio_nowait(zio); - } else - mutex_exit(hash_lock); - } - - kmem_free(cb, sizeof (l2arc_read_callback_t)); -} - -/* - * This is the list priority from which the L2ARC will search for pages to - * cache. This is used within loops (0..3) to cycle through lists in the - * desired order. This order can have a significant effect on cache - * performance. - * - * Currently the metadata lists are hit first, MFU then MRU, followed by - * the data lists. This function returns a locked list, and also returns - * the lock pointer. - */ -static multilist_sublist_t * -l2arc_sublist_lock(int list_num) -{ - multilist_t *ml = NULL; - unsigned int idx; - - ASSERT(list_num >= 0 && list_num <= 3); - - switch (list_num) { - case 0: - ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; - break; - case 1: - ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; - break; - case 2: - ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; - break; - case 3: - ml = arc_mru->arcs_list[ARC_BUFC_DATA]; - break; - } - - /* - * Return a randomly-selected sublist. This is acceptable - * because the caller feeds only a little bit of data for each - * call (8MB). Subsequent calls will result in different - * sublists being selected. - */ - idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); -} - -/* - * Evict buffers from the device write hand to the distance specified in - * bytes. This distance may span populated buffers, it may span nothing. - * This is clearing a region on the L2ARC device ready for writing. - * If the 'all' boolean is set, every buffer is evicted. - */ -static void -l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) -{ - list_t *buflist; - arc_buf_hdr_t *hdr, *hdr_prev; - kmutex_t *hash_lock; - uint64_t taddr; - - buflist = &dev->l2ad_buflist; - - if (!all && dev->l2ad_first) { - /* - * This is the first sweep through the device. There is - * nothing to evict. - */ - return; - } - - if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { - /* - * When nearing the end of the device, evict to the end - * before the device write hand jumps to the start. - */ - taddr = dev->l2ad_end; - } else { - taddr = dev->l2ad_hand + distance; - } - DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, - uint64_t, taddr, boolean_t, all); - -top: - mutex_enter(&dev->l2ad_mtx); - for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(buflist, hdr); - - hash_lock = HDR_LOCK(hdr); - - /* - * We cannot use mutex_enter or else we can deadlock - * with l2arc_write_buffers (due to swapping the order - * the hash lock and l2ad_mtx are taken). - */ - if (!mutex_tryenter(hash_lock)) { - /* - * Missed the hash lock. Retry. - */ - ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); - mutex_exit(&dev->l2ad_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } - - /* - * A header can't be on this list if it doesn't have L2 header. - */ - ASSERT(HDR_HAS_L2HDR(hdr)); - - /* Ensure this header has finished being written. 
*/ - ASSERT(!HDR_L2_WRITING(hdr)); - ASSERT(!HDR_L2_WRITE_HEAD(hdr)); - - if (!all && (hdr->b_l2hdr.b_daddr >= taddr || - hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { - /* - * We've evicted to the target address, - * or the end of the device. - */ - mutex_exit(hash_lock); - break; - } - - if (!HDR_HAS_L1HDR(hdr)) { - ASSERT(!HDR_L2_READING(hdr)); - /* - * This doesn't exist in the ARC. Destroy. - * arc_hdr_destroy() will call list_remove() - * and decrement arcstat_l2_lsize. - */ - arc_change_state(arc_anon, hdr, hash_lock); - arc_hdr_destroy(hdr); - } else { - ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); - ARCSTAT_BUMP(arcstat_l2_evict_l1cached); - /* - * Invalidate issued or about to be issued - * reads, since we may be about to write - * over this location. - */ - if (HDR_L2_READING(hdr)) { - ARCSTAT_BUMP(arcstat_l2_evict_reading); - arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); - } - - arc_hdr_l2hdr_destroy(hdr); - } - mutex_exit(hash_lock); - } - mutex_exit(&dev->l2ad_mtx); -} - -/* - * Find and write ARC buffers to the L2ARC device. - * - * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid - * for reading until they have completed writing. - * The headroom_boost is an in-out parameter used to maintain headroom boost - * state between calls to this function. - * - * Returns the number of bytes actually written (which may be smaller than - * the delta by which the device hand has changed due to alignment). - */ -static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) -{ - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; - l2arc_write_callback_t *cb; - zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); - int try; - - ASSERT3P(dev->l2ad_vdev, !=, NULL); - - pio = NULL; - write_lsize = write_asize = write_psize = 0; - full = B_FALSE; - head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); - arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); - - ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); - /* - * Copy buffers for L2ARC writing. - */ - for (try = 0; try <= 3; try++) { - multilist_sublist_t *mls = l2arc_sublist_lock(try); - uint64_t passed_sz = 0; - - ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); - - /* - * L2ARC fast warmup. - * - * Until the ARC is warm and starts to evict, read from the - * head of the ARC lists rather than the tail. - */ - if (arc_warm == B_FALSE) - hdr = multilist_sublist_head(mls); - else - hdr = multilist_sublist_tail(mls); - if (hdr == NULL) - ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); - - headroom = target_sz * l2arc_headroom; - if (zfs_compressed_arc_enabled) - headroom = (headroom * l2arc_headroom_boost) / 100; - - for (; hdr; hdr = hdr_prev) { - kmutex_t *hash_lock; - - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, - HDR_GET_LSIZE(hdr)); - - hash_lock = HDR_LOCK(hdr); - if (!mutex_tryenter(hash_lock)) { - ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); - /* - * Skip this buffer rather than waiting. - */ - continue; - } - - passed_sz += HDR_GET_LSIZE(hdr); - if (passed_sz > headroom) { - /* - * Searched too far. 
- */ - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); - break; - } - - if (!l2arc_write_eligible(guid, hdr)) { - mutex_exit(hash_lock); - continue; - } - - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - ASSERT3U(arc_hdr_size(hdr), >, 0); - uint64_t psize = arc_hdr_size(hdr); - uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, - psize); - - if ((write_asize + asize) > target_sz) { - full = B_TRUE; - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_l2_write_full); - break; - } - - if (pio == NULL) { - /* - * Insert a dummy header on the buflist so - * l2arc_write_done() can find where the - * write buffers begin without searching. - */ - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); - - cb = kmem_alloc( - sizeof (l2arc_write_callback_t), KM_SLEEP); - cb->l2wcb_dev = dev; - cb->l2wcb_head = head; - pio = zio_root(spa, l2arc_write_done, cb, - ZIO_FLAG_CANFAIL); - ARCSTAT_BUMP(arcstat_l2_write_pios); - } - - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - arc_hdr_set_flags(hdr, - ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, psize, - hdr); - - /* - * Normally the L2ARC can use the hdr's data, but if - * we're sharing data between the hdr and one of its - * bufs, L2ARC needs its own copy of the data so that - * the ZIO below can't race with the buf consumer. - * Another case where we need to create a copy of the - * data is when the buffer size is not device-aligned - * and we need to pad the block to make it such. - * That also keeps the clock hand suitably aligned. - * - * To ensure that the copy will be available for the - * lifetime of the ZIO and be cleaned up afterwards, we - * add it to the l2arc_free_on_write queue. - */ - abd_t *to_write; - if (!HDR_SHARED_DATA(hdr) && psize == asize) { - to_write = hdr->b_l1hdr.b_pabd; - } else { - to_write = abd_alloc_for_io(asize, - HDR_ISTYPE_METADATA(hdr)); - abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); - if (asize != psize) { - abd_zero_off(to_write, psize, - asize - psize); - } - l2arc_free_abd_on_write(to_write, asize, - arc_buf_type(hdr)); - } - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, - ZIO_CHECKSUM_OFF, NULL, hdr, - ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_CANFAIL, B_FALSE); - - write_lsize += HDR_GET_LSIZE(hdr); - DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, - zio_t *, wzio); - - write_psize += psize; - write_asize += asize; - dev->l2ad_hand += asize; - - mutex_exit(hash_lock); - - (void) zio_nowait(wzio); - } - - multilist_sublist_unlock(mls); - - if (full == B_TRUE) - break; - } - - /* No buffers selected for writing? 
*/ - if (pio == NULL) { - ASSERT0(write_lsize); - ASSERT(!HDR_HAS_L1HDR(head)); - kmem_cache_free(hdr_l2only_cache, head); - return (0); - } - - ASSERT3U(write_psize, <=, target_sz); - ARCSTAT_BUMP(arcstat_l2_writes_sent); - ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); - ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); - ARCSTAT_INCR(arcstat_l2_psize, write_psize); - vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); - - /* - * Bump device hand to the device start if it is approaching the end. - * l2arc_evict() will already have evicted ahead for this case. - */ - if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_first = B_FALSE; - } - - dev->l2ad_writing = B_TRUE; - (void) zio_wait(pio); - dev->l2ad_writing = B_FALSE; - - return (write_asize); -} - -/* - * This thread feeds the L2ARC at regular intervals. This is the beating - * heart of the L2ARC. - */ -/* ARGSUSED */ -static void -l2arc_feed_thread(void *unused __unused) -{ - callb_cpr_t cpr; - l2arc_dev_t *dev; - spa_t *spa; - uint64_t size, wrote; - clock_t begin, next = ddi_get_lbolt(); - - CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); - - mutex_enter(&l2arc_feed_thr_lock); - - while (l2arc_thread_exit == 0) { - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - next - ddi_get_lbolt()); - CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); - next = ddi_get_lbolt() + hz; - - /* - * Quick check for L2ARC devices. - */ - mutex_enter(&l2arc_dev_mtx); - if (l2arc_ndev == 0) { - mutex_exit(&l2arc_dev_mtx); - continue; - } - mutex_exit(&l2arc_dev_mtx); - begin = ddi_get_lbolt(); - - /* - * This selects the next l2arc device to write to, and in - * doing so the next spa to feed from: dev->l2ad_spa. This - * will return NULL if there are now no l2arc devices or if - * they are all faulted. - * - * If a device is returned, its spa's config lock is also - * held to prevent device removal. l2arc_dev_get_next() - * will grab and release l2arc_dev_mtx. - */ - if ((dev = l2arc_dev_get_next()) == NULL) - continue; - - spa = dev->l2ad_spa; - ASSERT3P(spa, !=, NULL); - - /* - * If the pool is read-only then force the feed thread to - * sleep a little longer. - */ - if (!spa_writeable(spa)) { - next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; - spa_config_exit(spa, SCL_L2ARC, dev); - continue; - } - - /* - * Avoid contributing to memory pressure. - */ - if (arc_reclaim_needed()) { - ARCSTAT_BUMP(arcstat_l2_abort_lowmem); - spa_config_exit(spa, SCL_L2ARC, dev); - continue; - } - - ARCSTAT_BUMP(arcstat_l2_feeds); - - size = l2arc_write_size(); - - /* - * Evict L2ARC buffers that will be overwritten. - */ - l2arc_evict(dev, size, B_FALSE); - - /* - * Write ARC buffers. - */ - wrote = l2arc_write_buffers(spa, dev, size); - - /* - * Calculate interval between writes. - */ - next = l2arc_write_interval(begin, size, wrote); - spa_config_exit(spa, SCL_L2ARC, dev); - } - - l2arc_thread_exit = 0; - cv_broadcast(&l2arc_feed_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ - thread_exit(); -} - -boolean_t -l2arc_vdev_present(vdev_t *vd) -{ - l2arc_dev_t *dev; - - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev != NULL; - dev = list_next(l2arc_dev_list, dev)) { - if (dev->l2ad_vdev == vd) - break; - } - mutex_exit(&l2arc_dev_mtx); - - return (dev != NULL); -} - -/* - * Add a vdev for use by the L2ARC. By this point the spa has already - * validated the vdev and opened it. 
- */ -void -l2arc_add_vdev(spa_t *spa, vdev_t *vd) -{ - l2arc_dev_t *adddev; - - ASSERT(!l2arc_vdev_present(vd)); - - vdev_ashift_optimize(vd); - - /* - * Create a new l2arc device entry. - */ - adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); - adddev->l2ad_spa = spa; - adddev->l2ad_vdev = vd; - adddev->l2ad_start = VDEV_LABEL_START_SIZE; - adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); - adddev->l2ad_hand = adddev->l2ad_start; - adddev->l2ad_first = B_TRUE; - adddev->l2ad_writing = B_FALSE; - - mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); - /* - * This is a list of all ARC buffers that are still valid on the - * device. - */ - list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); - - vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); - zfs_refcount_create(&adddev->l2ad_alloc); - - /* - * Add device to global list - */ - mutex_enter(&l2arc_dev_mtx); - list_insert_head(l2arc_dev_list, adddev); - atomic_inc_64(&l2arc_ndev); - mutex_exit(&l2arc_dev_mtx); -} - -/* - * Remove a vdev from the L2ARC. - */ -void -l2arc_remove_vdev(vdev_t *vd) -{ - l2arc_dev_t *dev, *nextdev, *remdev = NULL; - - /* - * Find the device by vdev - */ - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { - nextdev = list_next(l2arc_dev_list, dev); - if (vd == dev->l2ad_vdev) { - remdev = dev; - break; - } - } - ASSERT3P(remdev, !=, NULL); - - /* - * Remove device from global list - */ - list_remove(l2arc_dev_list, remdev); - l2arc_dev_last = NULL; /* may have been invalidated */ - atomic_dec_64(&l2arc_ndev); - mutex_exit(&l2arc_dev_mtx); - - /* - * Clear all buflists and ARC references. L2ARC device flush. - */ - l2arc_evict(remdev, 0, B_TRUE); - list_destroy(&remdev->l2ad_buflist); - mutex_destroy(&remdev->l2ad_mtx); - zfs_refcount_destroy(&remdev->l2ad_alloc); - kmem_free(remdev, sizeof (l2arc_dev_t)); -} - -void -l2arc_init(void) -{ - l2arc_thread_exit = 0; - l2arc_ndev = 0; - l2arc_writes_sent = 0; - l2arc_writes_done = 0; - - mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); - - l2arc_dev_list = &L2ARC_dev_list; - l2arc_free_on_write = &L2ARC_free_on_write; - list_create(l2arc_dev_list, sizeof (l2arc_dev_t), - offsetof(l2arc_dev_t, l2ad_node)); - list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), - offsetof(l2arc_data_free_t, l2df_list_node)); -} - -void -l2arc_fini(void) -{ - /* - * This is called from dmu_fini(), which is called from spa_fini(); - * Because of this, we can assume that all l2arc devices have - * already been removed when the pools themselves were removed. 
- */ - - l2arc_do_free_on_write(); - - mutex_destroy(&l2arc_feed_thr_lock); - cv_destroy(&l2arc_feed_thr_cv); - mutex_destroy(&l2arc_dev_mtx); - mutex_destroy(&l2arc_free_on_write_mtx); - - list_destroy(l2arc_dev_list); - list_destroy(l2arc_free_on_write); -} - -void -l2arc_start(void) -{ - if (!(spa_mode_global & FWRITE)) - return; - - (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); -} - -void -l2arc_stop(void) -{ - if (!(spa_mode_global & FWRITE)) - return; - - mutex_enter(&l2arc_feed_thr_lock); - cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ - l2arc_thread_exit = 1; - while (l2arc_thread_exit != 0) - cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); - mutex_exit(&l2arc_feed_thr_lock); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include - -/* - * Embedded-data Block Pointers - * - * Normally, block pointers point (via their DVAs) to a block which holds data. - * If the data that we need to store is very small, this is an inefficient - * use of space, because a block must be at minimum 1 sector (typically 512 - * bytes or 4KB). Additionally, reading these small blocks tends to generate - * more random reads. - * - * Embedded-data Block Pointers allow small pieces of data (the "payload", - * up to 112 bytes) to be stored in the block pointer itself, instead of - * being pointed to. The "Pointer" part of this name is a bit of a - * misnomer, as nothing is pointed to. - * - * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to - * be embedded in the block pointer. The logic for this is handled in - * the SPA, by the zio pipeline. Therefore most code outside the zio - * pipeline doesn't need special-cases to handle these block pointers. - * - * See spa.h for details on the exact layout of embedded block pointers. - */ - -void -encode_embedded_bp_compressed(blkptr_t *bp, void *data, - enum zio_compress comp, int uncompressed_size, int compressed_size) -{ - uint64_t *bp64 = (uint64_t *)bp; - uint64_t w = 0; - uint8_t *data8 = data; - - ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE); - ASSERT(uncompressed_size == compressed_size || - comp != ZIO_COMPRESS_OFF); - ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); - ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - - bzero(bp, sizeof (*bp)); - BP_SET_EMBEDDED(bp, B_TRUE); - BP_SET_COMPRESS(bp, comp); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - BPE_SET_LSIZE(bp, uncompressed_size); - BPE_SET_PSIZE(bp, compressed_size); - - /* - * Encode the byte array into the words of the block pointer. - * First byte goes into low bits of first word (little endian). 
- */ - for (int i = 0; i < compressed_size; i++) { - BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]); - if (i % sizeof (w) == sizeof (w) - 1) { - /* we've reached the end of a word */ - ASSERT3P(bp64, <, bp + 1); - *bp64 = w; - bp64++; - if (!BPE_IS_PAYLOADWORD(bp, bp64)) - bp64++; - w = 0; - } - } - /* write last partial word */ - if (bp64 < (uint64_t *)(bp + 1)) - *bp64 = w; -} - -/* - * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be - * more than BPE_PAYLOAD_SIZE bytes). - */ -void -decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) -{ - int psize; - uint8_t *buf8 = buf; - uint64_t w = 0; - const uint64_t *bp64 = (const uint64_t *)bp; - - ASSERT(BP_IS_EMBEDDED(bp)); - - psize = BPE_GET_PSIZE(bp); - - /* - * Decode the words of the block pointer into the byte array. - * Low bits of first word are the first byte (little endian). - */ - for (int i = 0; i < psize; i++) { - if (i % sizeof (w) == 0) { - /* beginning of a word */ - ASSERT3P(bp64, <, bp + 1); - w = *bp64; - bp64++; - if (!BPE_IS_PAYLOADWORD(bp, bp64)) - bp64++; - } - buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); - } -} - -/* - * Fill in the buffer with the (decompressed) payload of the embedded - * blkptr_t. Takes into account compression and byteorder (the payload is - * treated as a stream of bytes). - * Return 0 on success, or ENOSPC if it won't fit in the buffer. - */ -int -decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) -{ - int lsize, psize; - - ASSERT(BP_IS_EMBEDDED(bp)); - - lsize = BPE_GET_LSIZE(bp); - psize = BPE_GET_PSIZE(bp); - - if (lsize > buflen) - return (ENOSPC); - ASSERT3U(lsize, ==, buflen); - - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { - uint8_t dstbuf[BPE_PAYLOAD_SIZE]; - decode_embedded_bp_compressed(bp, dstbuf); - VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), - dstbuf, buf, psize, buflen)); - } else { - ASSERT3U(lsize, ==, psize); - decode_embedded_bp_compressed(bp, buf); - } - - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. 
- */ - -#include -#include - - -void -bplist_create(bplist_t *bpl) -{ - mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&bpl->bpl_list, sizeof (bplist_entry_t), - offsetof(bplist_entry_t, bpe_node)); -} - -void -bplist_destroy(bplist_t *bpl) -{ - list_destroy(&bpl->bpl_list); - mutex_destroy(&bpl->bpl_lock); -} - -void -bplist_append(bplist_t *bpl, const blkptr_t *bp) -{ - bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); - - mutex_enter(&bpl->bpl_lock); - bpe->bpe_blk = *bp; - list_insert_tail(&bpl->bpl_list, bpe); - mutex_exit(&bpl->bpl_lock); -} - -/* - * To aid debugging, we keep the most recently removed entry. This way if - * we are in the callback, we can easily locate the entry. - */ -static bplist_entry_t *bplist_iterate_last_removed; - -void -bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) -{ - bplist_entry_t *bpe; - - mutex_enter(&bpl->bpl_lock); - while (bpe = list_head(&bpl->bpl_list)) { - bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); - mutex_exit(&bpl->bpl_lock); - func(arg, &bpe->bpe_blk, tx); - kmem_free(bpe, sizeof (*bpe)); - mutex_enter(&bpl->bpl_lock); - } - mutex_exit(&bpl->bpl_lock); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c +++ /dev/null @@ -1,606 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017 Datto Inc. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 
- */ -uint64_t -bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(os); - dsl_pool_t *dp = dmu_objset_pool(os); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { - if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { - ASSERT0(dp->dp_empty_bpobj); - dp->dp_empty_bpobj = - bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY(zap_add(os, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, - &dp->dp_empty_bpobj, tx) == 0); - } - spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); - ASSERT(dp->dp_empty_bpobj != 0); - return (dp->dp_empty_bpobj); - } else { - return (bpobj_alloc(os, blocksize, tx)); - } -} - -void -bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_objset_pool(os); - - spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); - if (!spa_feature_is_active(dmu_objset_spa(os), - SPA_FEATURE_EMPTY_BPOBJ)) { - VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_EMPTY_BPOBJ, tx)); - VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); - dp->dp_empty_bpobj = 0; - } -} - -uint64_t -bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) -{ - int size; - - if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) - size = BPOBJ_SIZE_V0; - else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) - size = BPOBJ_SIZE_V1; - else - size = sizeof (bpobj_phys_t); - - return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, - DMU_OT_BPOBJ_HDR, size, tx)); -} - -void -bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) -{ - int64_t i; - bpobj_t bpo; - dmu_object_info_t doi; - int epb; - dmu_buf_t *dbuf = NULL; - - ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); - VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); - - mutex_enter(&bpo.bpo_lock); - - if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) - goto out; - - VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); - epb = doi.doi_data_block_size / sizeof (uint64_t); - - for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { - uint64_t *objarray; - uint64_t offset, blkoff; - - offset = i * sizeof (uint64_t); - blkoff = P2PHASE(i, epb); - - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - VERIFY3U(0, ==, dmu_buf_hold(os, - bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); - } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - objarray = dbuf->db_data; - bpobj_free(os, objarray[blkoff], tx); - } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); - -out: - mutex_exit(&bpo.bpo_lock); - bpobj_close(&bpo); - - VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); -} - -int -bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) -{ - dmu_object_info_t doi; - int err; - - err = dmu_object_info(os, object, &doi); - if (err) - return (err); - - bzero(bpo, sizeof (*bpo)); - mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); - - ASSERT(bpo->bpo_dbuf == NULL); - ASSERT(bpo->bpo_phys == NULL); - ASSERT(object != 0); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); - - err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); - if (err) - return (err); - - bpo->bpo_os = os; - bpo->bpo_object = object; - bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; - bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); - bpo->bpo_havesubobj = 
(doi.doi_bonus_size > BPOBJ_SIZE_V1); - bpo->bpo_phys = bpo->bpo_dbuf->db_data; - return (0); -} - -boolean_t -bpobj_is_open(const bpobj_t *bpo) -{ - return (bpo->bpo_object != 0); -} - -void -bpobj_close(bpobj_t *bpo) -{ - /* Lame workaround for closing a bpobj that was never opened. */ - if (bpo->bpo_object == 0) - return; - - dmu_buf_rele(bpo->bpo_dbuf, bpo); - if (bpo->bpo_cached_dbuf != NULL) - dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); - bpo->bpo_dbuf = NULL; - bpo->bpo_phys = NULL; - bpo->bpo_cached_dbuf = NULL; - bpo->bpo_object = 0; - - mutex_destroy(&bpo->bpo_lock); -} - -boolean_t -bpobj_is_empty(bpobj_t *bpo) -{ - return (bpo->bpo_phys->bpo_num_blkptrs == 0 && - (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); -} - -static int -bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, - boolean_t free) -{ - dmu_object_info_t doi; - int epb; - int64_t i; - int err = 0; - dmu_buf_t *dbuf = NULL; - - ASSERT(bpobj_is_open(bpo)); - mutex_enter(&bpo->bpo_lock); - - if (free) - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - - for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { - blkptr_t *bparray; - blkptr_t *bp; - uint64_t offset, blkoff; - - offset = i * sizeof (blkptr_t); - blkoff = P2PHASE(i, bpo->bpo_epb); - - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, - FTAG, &dbuf, 0); - if (err) - break; - } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - bparray = dbuf->db_data; - bp = &bparray[blkoff]; - err = func(arg, bp, tx); - if (err) - break; - if (free) { - bpo->bpo_phys->bpo_bytes -= - bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); - ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); - } - bpo->bpo_phys->bpo_num_blkptrs--; - ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); - } - } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - if (free) { - VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, - (i + 1) * sizeof (blkptr_t), -1ULL, tx)); - } - if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) - goto out; - - ASSERT(bpo->bpo_havecomp); - err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); - if (err) { - mutex_exit(&bpo->bpo_lock); - return (err); - } - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - epb = doi.doi_data_block_size / sizeof (uint64_t); - - for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { - uint64_t *objarray; - uint64_t offset, blkoff; - bpobj_t sublist; - uint64_t used_before, comp_before, uncomp_before; - uint64_t used_after, comp_after, uncomp_after; - - offset = i * sizeof (uint64_t); - blkoff = P2PHASE(i, epb); - - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); - if (err) - break; - } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - objarray = dbuf->db_data; - err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); - if (err) - break; - if (free) { - err = bpobj_space(&sublist, - &used_before, &comp_before, &uncomp_before); - if (err != 0) { - bpobj_close(&sublist); - break; - } - } - err = bpobj_iterate_impl(&sublist, func, arg, tx, free); - if (free) { - VERIFY3U(0, ==, bpobj_space(&sublist, - 
&used_after, &comp_after, &uncomp_after)); - bpo->bpo_phys->bpo_bytes -= used_before - used_after; - ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - bpo->bpo_phys->bpo_comp -= comp_before - comp_after; - bpo->bpo_phys->bpo_uncomp -= - uncomp_before - uncomp_after; - } - - bpobj_close(&sublist); - if (err) - break; - if (free) { - err = dmu_object_free(bpo->bpo_os, - objarray[blkoff], tx); - if (err) - break; - bpo->bpo_phys->bpo_num_subobjs--; - ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); - } - } - if (dbuf) { - dmu_buf_rele(dbuf, FTAG); - dbuf = NULL; - } - if (free) { - VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, - (i + 1) * sizeof (uint64_t), -1ULL, tx)); - } - -out: - /* If there are no entries, there should be no bytes. */ - if (bpobj_is_empty(bpo)) { - ASSERT0(bpo->bpo_phys->bpo_bytes); - ASSERT0(bpo->bpo_phys->bpo_comp); - ASSERT0(bpo->bpo_phys->bpo_uncomp); - } - - mutex_exit(&bpo->bpo_lock); - return (err); -} - -/* - * Iterate and remove the entries. If func returns nonzero, iteration - * will stop and that entry will not be removed. - */ -int -bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) -{ - return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); -} - -/* - * Iterate the entries. If func returns nonzero, iteration will stop. - */ -int -bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) -{ - return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); -} - -void -bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) -{ - bpobj_t subbpo; - uint64_t used, comp, uncomp, subsubobjs; - - ASSERT(bpobj_is_open(bpo)); - ASSERT(subobj != 0); - ASSERT(bpo->bpo_havesubobj); - ASSERT(bpo->bpo_havecomp); - ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); - - if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { - bpobj_decr_empty(bpo->bpo_os, tx); - return; - } - - VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); - VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - - if (bpobj_is_empty(&subbpo)) { - /* No point in having an empty subobj. */ - bpobj_close(&subbpo); - bpobj_free(bpo->bpo_os, subobj, tx); - return; - } - - mutex_enter(&bpo->bpo_lock); - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - if (bpo->bpo_phys->bpo_subobjs == 0) { - bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, - DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, - DMU_OT_NONE, 0, tx); - } - - dmu_object_info_t doi; - ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - sizeof (subobj), &subobj, tx); - bpo->bpo_phys->bpo_num_subobjs++; - - /* - * If subobj has only one block of subobjs, then move subobj's - * subobjs to bpo's subobj list directly. This reduces - * recursion in bpobj_iterate due to nested subobjs. - */ - subsubobjs = subbpo.bpo_phys->bpo_subobjs; - if (subsubobjs != 0) { - dmu_object_info_t doi; - - VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); - if (doi.doi_max_offset == doi.doi_data_block_size) { - dmu_buf_t *subdb; - uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; - - VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, - 0, FTAG, &subdb, 0)); - /* - * Make sure that we are not asking dmu_write() - * to write more data than we have in our buffer. 
- */ - VERIFY3U(subdb->db_size, >=, - numsubsub * sizeof (subobj)); - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); - dmu_buf_rele(subdb, FTAG); - bpo->bpo_phys->bpo_num_subobjs += numsubsub; - - dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); - subbpo.bpo_phys->bpo_subobjs = 0; - VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, - subsubobjs, tx)); - } - } - bpo->bpo_phys->bpo_bytes += used; - bpo->bpo_phys->bpo_comp += comp; - bpo->bpo_phys->bpo_uncomp += uncomp; - mutex_exit(&bpo->bpo_lock); - - bpobj_close(&subbpo); -} - -void -bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) -{ - blkptr_t stored_bp = *bp; - uint64_t offset; - int blkoff; - blkptr_t *bparray; - - ASSERT(bpobj_is_open(bpo)); - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); - - if (BP_IS_EMBEDDED(bp)) { - /* - * The bpobj will compress better without the payload. - * - * Note that we store EMBEDDED bp's because they have an - * uncompressed size, which must be accounted for. An - * alternative would be to add their size to bpo_uncomp - * without storing the bp, but that would create additional - * complications: bpo_uncomp would be inconsistent with the - * set of BP's stored, and bpobj_iterate() wouldn't visit - * all the space accounted for in the bpobj. - */ - bzero(&stored_bp, sizeof (stored_bp)); - stored_bp.blk_prop = bp->blk_prop; - stored_bp.blk_birth = bp->blk_birth; - } else if (!BP_GET_DEDUP(bp)) { - /* The bpobj will compress better without the checksum */ - bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); - } - - /* We never need the fill count. */ - stored_bp.blk_fill = 0; - - mutex_enter(&bpo->bpo_lock); - - offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); - blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); - - if (bpo->bpo_cached_dbuf == NULL || - offset < bpo->bpo_cached_dbuf->db_offset || - offset >= bpo->bpo_cached_dbuf->db_offset + - bpo->bpo_cached_dbuf->db_size) { - if (bpo->bpo_cached_dbuf) - dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); - VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, - offset, bpo, &bpo->bpo_cached_dbuf, 0)); - } - - dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); - bparray = bpo->bpo_cached_dbuf->db_data; - bparray[blkoff] = stored_bp; - - dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - bpo->bpo_phys->bpo_num_blkptrs++; - bpo->bpo_phys->bpo_bytes += - bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); - if (bpo->bpo_havecomp) { - bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); - bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); - } - mutex_exit(&bpo->bpo_lock); -} - -struct space_range_arg { - spa_t *spa; - uint64_t mintxg; - uint64_t maxtxg; - uint64_t used; - uint64_t comp; - uint64_t uncomp; -}; - -/* ARGSUSED */ -static int -space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - struct space_range_arg *sra = arg; - - if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { - if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) - sra->used += bp_get_dsize_sync(sra->spa, bp); - else - sra->used += bp_get_dsize(sra->spa, bp); - sra->comp += BP_GET_PSIZE(bp); - sra->uncomp += BP_GET_UCSIZE(bp); - } - return (0); -} - -int -bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - ASSERT(bpobj_is_open(bpo)); - mutex_enter(&bpo->bpo_lock); - - *usedp = bpo->bpo_phys->bpo_bytes; - if (bpo->bpo_havecomp) { - *compp = bpo->bpo_phys->bpo_comp; - 
*uncompp = bpo->bpo_phys->bpo_uncomp; - mutex_exit(&bpo->bpo_lock); - return (0); - } else { - mutex_exit(&bpo->bpo_lock); - return (bpobj_space_range(bpo, 0, UINT64_MAX, - usedp, compp, uncompp)); - } -} - -/* - * Return the amount of space in the bpobj which is: - * mintxg < blk_birth <= maxtxg - */ -int -bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - struct space_range_arg sra = { 0 }; - int err; - - ASSERT(bpobj_is_open(bpo)); - - /* - * As an optimization, if they want the whole txg range, just - * get bpo_bytes rather than iterating over the bps. - */ - if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) - return (bpobj_space(bpo, usedp, compp, uncompp)); - - sra.spa = dmu_objset_spa(bpo->bpo_os); - sra.mintxg = mintxg; - sra.maxtxg = maxtxg; - - err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); - *usedp = sra.used; - *compp = sra.comp; - *uncompp = sra.uncomp; - return (err); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * A bptree is a queue of root block pointers from destroyed datasets. When a - * dataset is destroyed its root block pointer is put on the end of the pool's - * bptree queue so the dataset's blocks can be freed asynchronously by - * dsl_scan_sync. This allows the delete operation to finish without traversing - * all the dataset's blocks. - * - * Note that while bt_begin and bt_end are only ever incremented in this code, - * they are effectively reset to 0 every time the entire bptree is freed because - * the bptree's object is destroyed and re-created. 
- */ - -struct bptree_args { - bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ - boolean_t ba_free; /* true if freeing during traversal */ - - bptree_itor_t *ba_func; /* function to call for each blockpointer */ - void *ba_arg; /* caller supplied argument to ba_func */ - dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ -} bptree_args_t; - -uint64_t -bptree_alloc(objset_t *os, dmu_tx_t *tx) -{ - uint64_t obj; - dmu_buf_t *db; - bptree_phys_t *bt; - - obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, - SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, - sizeof (bptree_phys_t), tx); - - /* - * Bonus buffer contents are already initialized to 0, but for - * readability we make it explicit. - */ - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - bt = db->db_data; - bt->bt_begin = 0; - bt->bt_end = 0; - bt->bt_bytes = 0; - bt->bt_comp = 0; - bt->bt_uncomp = 0; - dmu_buf_rele(db, FTAG); - - return (obj); -} - -int -bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) -{ - dmu_buf_t *db; - bptree_phys_t *bt; - - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - ASSERT3U(bt->bt_begin, ==, bt->bt_end); - ASSERT0(bt->bt_bytes); - ASSERT0(bt->bt_comp); - ASSERT0(bt->bt_uncomp); - dmu_buf_rele(db, FTAG); - - return (dmu_object_free(os, obj, tx)); -} - -boolean_t -bptree_is_empty(objset_t *os, uint64_t obj) -{ - dmu_buf_t *db; - bptree_phys_t *bt; - boolean_t rv; - - VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - rv = (bt->bt_begin == bt->bt_end); - dmu_buf_rele(db, FTAG); - return (rv); -} - -void -bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, - uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) -{ - dmu_buf_t *db; - bptree_phys_t *bt; - bptree_entry_phys_t bte = { 0 }; - - /* - * bptree objects are in the pool mos, therefore they can only be - * modified in syncing context. Furthermore, this is only modified - * by the sync thread, so no locking is necessary. - */ - ASSERT(dmu_tx_is_syncing(tx)); - - VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; - - bte.be_birth_txg = birth_txg; - bte.be_bp = *bp; - dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); - - dmu_buf_will_dirty(db, tx); - bt->bt_end++; - bt->bt_bytes += bytes; - bt->bt_comp += comp; - bt->bt_uncomp += uncomp; - dmu_buf_rele(db, FTAG); -} - -/* ARGSUSED */ -static int -bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - int err; - struct bptree_args *ba = arg; - - if (bp == NULL || BP_IS_HOLE(bp)) - return (0); - - err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); - if (err == 0 && ba->ba_free) { - ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); - ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); - ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); - } - return (err); -} - -/* - * If "free" is set: - * - It is assumed that "func" will be freeing the block pointers. - * - If "func" returns nonzero, the bookmark will be remembered and - * iteration will be restarted from this point on next invocation. - * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), - * bptree_iterate will remember the bookmark, continue traversing - * any additional entries, and return 0. - * - * If "free" is not set, traversal will stop and return an error if - * an i/o error is encountered. 
- * - * In either case, if zfs_free_leak_on_eio is set, i/o errors will be - * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to - * traverse_dataset_destroyed()). - */ -int -bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, - void *arg, dmu_tx_t *tx) -{ - boolean_t ioerr = B_FALSE; - int err; - uint64_t i; - dmu_buf_t *db; - struct bptree_args ba; - - ASSERT(!free || dmu_tx_is_syncing(tx)); - - err = dmu_bonus_hold(os, obj, FTAG, &db); - if (err != 0) - return (err); - - if (free) - dmu_buf_will_dirty(db, tx); - - ba.ba_phys = db->db_data; - ba.ba_free = free; - ba.ba_func = func; - ba.ba_arg = arg; - ba.ba_tx = tx; - - err = 0; - for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { - bptree_entry_phys_t bte; - int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; - - err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), - &bte, DMU_READ_NO_PREFETCH); - if (err != 0) - break; - - if (zfs_free_leak_on_eio) - flags |= TRAVERSE_HARD; - zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " - "bookmark %lld/%lld/%lld/%lld", - (longlong_t)i, - (longlong_t)bte.be_birth_txg, - (longlong_t)bte.be_zb.zb_objset, - (longlong_t)bte.be_zb.zb_object, - (longlong_t)bte.be_zb.zb_level, - (longlong_t)bte.be_zb.zb_blkid); - err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, - bte.be_birth_txg, &bte.be_zb, flags, - bptree_visit_cb, &ba); - if (free) { - /* - * The callback has freed the visited block pointers. - * Record our traversal progress on disk, either by - * updating this record's bookmark, or by logically - * removing this record by advancing bt_begin. - */ - if (err != 0) { - /* save bookmark for future resume */ - ASSERT3U(bte.be_zb.zb_objset, ==, - ZB_DESTROYED_OBJSET); - ASSERT0(bte.be_zb.zb_level); - dmu_write(os, obj, i * sizeof (bte), - sizeof (bte), &bte, tx); - if (err == EIO || err == ECKSUM || - err == ENXIO) { - /* - * Skip the rest of this tree and - * continue on to the next entry. - */ - err = 0; - ioerr = B_TRUE; - } else { - break; - } - } else if (ioerr) { - /* - * This entry is finished, but there were - * i/o errors on previous entries, so we - * can't adjust bt_begin. Set this entry's - * be_birth_txg such that it will be - * treated as a no-op in future traversals. - */ - bte.be_birth_txg = UINT64_MAX; - dmu_write(os, obj, i * sizeof (bte), - sizeof (bte), &bte, tx); - } - - if (!ioerr) { - ba.ba_phys->bt_begin++; - (void) dmu_free_range(os, obj, - i * sizeof (bte), sizeof (bte), tx); - } - } else if (err != 0) { - break; - } - } - - ASSERT(!free || err != 0 || ioerr || - ba.ba_phys->bt_begin == ba.ba_phys->bt_end); - - /* if all blocks are free there should be no used space */ - if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { - if (zfs_free_leak_on_eio) { - ba.ba_phys->bt_bytes = 0; - ba.ba_phys->bt_comp = 0; - ba.ba_phys->bt_uncomp = 0; - } - - ASSERT0(ba.ba_phys->bt_bytes); - ASSERT0(ba.ba_phys->bt_comp); - ASSERT0(ba.ba_phys->bt_uncomp); - } - - dmu_buf_rele(db, FTAG); - - return (err); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c +++ /dev/null @@ -1,111 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. 
- * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014 by Delphix. All rights reserved. - */ - -#include -#include - -static inline bqueue_node_t * -obj2node(bqueue_t *q, void *data) -{ - return ((bqueue_node_t *)((char *)data + q->bq_node_offset)); -} - -/* - * Initialize a blocking queue The maximum capacity of the queue is set to - * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, - * and offset should give its offset from the start of the struct. Return 0 on - * success, or -1 on failure. - */ -int -bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) -{ - list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), - node_offset + offsetof(bqueue_node_t, bqn_node)); - cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); - cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); - q->bq_node_offset = node_offset; - q->bq_size = 0; - q->bq_maxsize = size; - return (0); -} - -/* - * Destroy a blocking queue. This function asserts that there are no - * elements in the queue, and no one is blocked on the condition - * variables. - */ -void -bqueue_destroy(bqueue_t *q) -{ - ASSERT0(q->bq_size); - cv_destroy(&q->bq_add_cv); - cv_destroy(&q->bq_pop_cv); - mutex_destroy(&q->bq_lock); - list_destroy(&q->bq_list); -} - -/* - * Add data to q, consuming size units of capacity. If there is insufficient - * capacity to consume size units, block until capacity exists. Asserts size is - * > 0. - */ -void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) -{ - ASSERT3U(item_size, >, 0); - ASSERT3U(item_size, <, q->bq_maxsize); - mutex_enter(&q->bq_lock); - obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { - cv_wait(&q->bq_add_cv, &q->bq_lock); - } - q->bq_size += item_size; - list_insert_tail(&q->bq_list, data); - cv_signal(&q->bq_pop_cv); - mutex_exit(&q->bq_lock); -} -/* - * Take the first element off of q. If there are no elements on the queue, wait - * until one is put there. Return the removed element. - */ -void * -bqueue_dequeue(bqueue_t *q) -{ - void *ret; - uint64_t item_size; - mutex_enter(&q->bq_lock); - while (q->bq_size == 0) { - cv_wait(&q->bq_pop_cv, &q->bq_lock); - } - ret = list_remove_head(&q->bq_list); - item_size = obj2node(q, ret)->bqn_size; - q->bq_size -= item_size; - mutex_exit(&q->bq_lock); - cv_signal(&q->bq_add_cv); - return (ret); -} - -/* - * Returns true if the space used is 0. - */ -boolean_t -bqueue_empty(bqueue_t *q) -{ - return (q->bq_size == 0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2011 Google, Inc. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -#include - -#define HASH_K1 0xb492b66fbe98f273ULL -#define HASH_K2 0x9ae16a3b2f90404fULL - -/* - * Bitwise right rotate. Normally this will compile to a single - * instruction. - */ -static inline uint64_t -rotate(uint64_t val, int shift) -{ - // Avoid shifting by 64: doing so yields an undefined result. - return (shift == 0 ? val : (val >> shift) | (val << (64 - shift))); -} - -static inline uint64_t -cityhash_helper(uint64_t u, uint64_t v, uint64_t mul) -{ - uint64_t a = (u ^ v) * mul; - a ^= (a >> 47); - uint64_t b = (v ^ a) * mul; - b ^= (b >> 47); - b *= mul; - return (b); -} - -uint64_t -cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) -{ - uint64_t mul = HASH_K2 + 64; - uint64_t a = w1 * HASH_K1; - uint64_t b = w2; - uint64_t c = w4 * mul; - uint64_t d = w3 * HASH_K2; - return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d, - a + rotate(b + HASH_K2, 18) + c, mul)); - -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ /dev/null @@ -1,4255 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
- * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -kstat_t *dbuf_ksp; - -typedef struct dbuf_stats { - /* - * Various statistics about the size of the dbuf cache. - */ - kstat_named_t cache_count; - kstat_named_t cache_size_bytes; - kstat_named_t cache_size_bytes_max; - /* - * Statistics regarding the bounds on the dbuf cache size. - */ - kstat_named_t cache_target_bytes; - kstat_named_t cache_lowater_bytes; - kstat_named_t cache_hiwater_bytes; - /* - * Total number of dbuf cache evictions that have occurred. - */ - kstat_named_t cache_total_evicts; - /* - * The distribution of dbuf levels in the dbuf cache and - * the total size of all dbufs at each level. - */ - kstat_named_t cache_levels[DN_MAX_LEVELS]; - kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; - /* - * Statistics about the dbuf hash table. - */ - kstat_named_t hash_hits; - kstat_named_t hash_misses; - kstat_named_t hash_collisions; - kstat_named_t hash_elements; - kstat_named_t hash_elements_max; - /* - * Number of sublists containing more than one dbuf in the dbuf - * hash table. Keep track of the longest hash chain. - */ - kstat_named_t hash_chains; - kstat_named_t hash_chain_max; - /* - * Number of times a dbuf_create() discovers that a dbuf was - * already created and in the dbuf hash table. - */ - kstat_named_t hash_insert_race; - /* - * Statistics about the size of the metadata dbuf cache. - */ - kstat_named_t metadata_cache_count; - kstat_named_t metadata_cache_size_bytes; - kstat_named_t metadata_cache_size_bytes_max; - /* - * For diagnostic purposes, this is incremented whenever we can't add - * something to the metadata cache because it's full, and instead put - * the data in the regular dbuf cache. 
- */ - kstat_named_t metadata_cache_overflow; -} dbuf_stats_t; - -dbuf_stats_t dbuf_stats = { - { "cache_count", KSTAT_DATA_UINT64 }, - { "cache_size_bytes", KSTAT_DATA_UINT64 }, - { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, - { "cache_target_bytes", KSTAT_DATA_UINT64 }, - { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, - { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, - { "cache_total_evicts", KSTAT_DATA_UINT64 }, - { { "cache_levels_N", KSTAT_DATA_UINT64 } }, - { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, - { "hash_hits", KSTAT_DATA_UINT64 }, - { "hash_misses", KSTAT_DATA_UINT64 }, - { "hash_collisions", KSTAT_DATA_UINT64 }, - { "hash_elements", KSTAT_DATA_UINT64 }, - { "hash_elements_max", KSTAT_DATA_UINT64 }, - { "hash_chains", KSTAT_DATA_UINT64 }, - { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "hash_insert_race", KSTAT_DATA_UINT64 }, - { "metadata_cache_count", KSTAT_DATA_UINT64 }, - { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, - { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, - { "metadata_cache_overflow", KSTAT_DATA_UINT64 } -}; - -#define DBUF_STAT_INCR(stat, val) \ - atomic_add_64(&dbuf_stats.stat.value.ui64, (val)); -#define DBUF_STAT_DECR(stat, val) \ - DBUF_STAT_INCR(stat, -(val)); -#define DBUF_STAT_BUMP(stat) \ - DBUF_STAT_INCR(stat, 1); -#define DBUF_STAT_BUMPDOWN(stat) \ - DBUF_STAT_INCR(stat, -1); -#define DBUF_STAT_MAX(stat, v) { \ - uint64_t _m; \ - while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ - (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ - continue; \ -} - -struct dbuf_hold_impl_data { - /* Function arguments */ - dnode_t *dh_dn; - uint8_t dh_level; - uint64_t dh_blkid; - boolean_t dh_fail_sparse; - boolean_t dh_fail_uncached; - void *dh_tag; - dmu_buf_impl_t **dh_dbp; - /* Local variables */ - dmu_buf_impl_t *dh_db; - dmu_buf_impl_t *dh_parent; - blkptr_t *dh_bp; - int dh_err; - dbuf_dirty_record_t *dh_dr; - int dh_depth; -}; - -static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, - dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, - boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp, int depth); -static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); - -static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); - -#ifndef __lint -extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, - dmu_buf_evict_func_t *evict_func_sync, - dmu_buf_evict_func_t *evict_func_async, - dmu_buf_t **clear_on_evict_dbufp); -#endif /* ! __lint */ - -/* - * Global data structures and functions for the dbuf cache. - */ -static kmem_cache_t *dbuf_kmem_cache; -static taskq_t *dbu_evict_taskq; - -static kthread_t *dbuf_cache_evict_thread; -static kmutex_t dbuf_evict_lock; -static kcondvar_t dbuf_evict_cv; -static boolean_t dbuf_evict_thread_exit; - -/* - * There are two dbuf caches; each dbuf can only be in one of them at a time. - * - * 1. Cache of metadata dbufs, to help make read-heavy administrative commands - * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs - * that represent the metadata that describes filesystems/snapshots/ - * bookmarks/properties/etc. We only evict from this cache when we export a - * pool, to short-circuit as much I/O as possible for all administrative - * commands that need the metadata. 
There is no eviction policy for this - * cache, because we try to only include types in it which would occupy a - * very small amount of space per object but create a large impact on the - * performance of these commands. Instead, after it reaches a maximum size - * (which should only happen on very small memory systems with a very large - * number of filesystem objects), we stop taking new dbufs into the - * metadata cache, instead putting them in the normal dbuf cache. - * - * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that - * are not currently held but have been recently released. These dbufs - * are not eligible for arc eviction until they are aged out of the cache. - * Dbufs that are aged out of the cache will be immediately destroyed and - * become eligible for arc eviction. - * - * Dbufs are added to these caches once the last hold is released. If a dbuf is - * later accessed and still exists in the dbuf cache, then it will be removed - * from the cache and later re-added to the head of the cache. - * - * If a given dbuf meets the requirements for the metadata cache, it will go - * there, otherwise it will be considered for the generic LRU dbuf cache. The - * caches and the refcounts tracking their sizes are stored in an array indexed - * by those caches' matching enum values (from dbuf_cached_state_t). - */ -typedef struct dbuf_cache { - multilist_t *cache; - zfs_refcount_t size; -} dbuf_cache_t; -dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; - -/* Size limits for the caches */ -uint64_t dbuf_cache_max_bytes = 0; -uint64_t dbuf_metadata_cache_max_bytes = 0; -/* Set the default sizes of the caches to log2 fraction of arc size */ -int dbuf_cache_shift = 5; -int dbuf_metadata_cache_shift = 6; - -/* - * For diagnostic purposes, this is incremented whenever we can't add - * something to the metadata cache because it's full, and instead put - * the data in the regular dbuf cache. - */ -uint64_t dbuf_metadata_cache_overflow; - -/* - * The LRU dbuf cache uses a three-stage eviction policy: - * - A low water marker designates when the dbuf eviction thread - * should stop evicting from the dbuf cache. - * - When we reach the maximum size (aka mid water mark), we - * signal the eviction thread to run. - * - The high water mark indicates when the eviction thread - * is unable to keep up with the incoming load and eviction must - * happen in the context of the calling thread. - * - * The dbuf cache: - * (max size) - * low water mid water hi water - * +----------------------------------------+----------+----------+ - * | | | | - * | | | | - * | | | | - * | | | | - * +----------------------------------------+----------+----------+ - * stop signal evict - * evicting eviction directly - * thread - * - * The high and low water marks indicate the operating range for the eviction - * thread. The low water mark is, by default, 90% of the total size of the - * cache and the high water mark is at 110% (both of these percentages can be - * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, - * respectively). The eviction thread will try to ensure that the cache remains - * within this range by waking up every second and checking if the cache is - * above the low water mark. The thread can also be woken up by callers adding - * elements into the cache if the cache is larger than the mid water (i.e max - * cache size). 
Once the eviction thread is woken up and eviction is required, - * it will continue evicting buffers until it's able to reduce the cache size - * to the low water mark. If the cache size continues to grow and hits the high - * water mark, then callers adding elments to the cache will begin to evict - * directly from the cache until the cache is no longer above the high water - * mark. - */ - -/* - * The percentage above and below the maximum cache size. - */ -uint_t dbuf_cache_hiwater_pct = 10; -uint_t dbuf_cache_lowater_pct = 10; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN, - &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_max_bytes, CTLFLAG_RWTUN, - &dbuf_metadata_cache_max_bytes, 0, "dbuf metadata cache size in bytes"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN, - &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_shift, CTLFLAG_RDTUN, - &dbuf_metadata_cache_shift, 0, - "dbuf metadata cache size as log2 fraction of ARC"); -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_metadata_cache_overflow, CTLFLAG_RD, - &dbuf_metadata_cache_overflow, 0, "dbuf metadata cache overflow"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN, - &dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN, - &dbuf_cache_lowater_pct, 0, "max percents below the dbuf cache size"); - -/* ARGSUSED */ -static int -dbuf_cons(void *vdb, void *unused, int kmflag) -{ - dmu_buf_impl_t *db = vdb; - bzero(db, sizeof (dmu_buf_impl_t)); - - mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); - multilist_link_init(&db->db_cache_link); - zfs_refcount_create(&db->db_holds); - - return (0); -} - -/* ARGSUSED */ -static void -dbuf_dest(void *vdb, void *unused) -{ - dmu_buf_impl_t *db = vdb; - mutex_destroy(&db->db_mtx); - cv_destroy(&db->db_changed); - ASSERT(!multilist_link_active(&db->db_cache_link)); - zfs_refcount_destroy(&db->db_holds); -} - -/* - * dbuf hash table routines - */ -static dbuf_hash_table_t dbuf_hash_table; - -static uint64_t dbuf_hash_count; - -/* - * We use Cityhash for this. It's fast, and has good hash properties without - * requiring any large static buffers. 
- */ -static uint64_t -dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) -{ - return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); -} - -#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ - ((dbuf)->db.db_object == (obj) && \ - (dbuf)->db_objset == (os) && \ - (dbuf)->db_level == (level) && \ - (dbuf)->db_blkid == (blkid)) - -dmu_buf_impl_t * -dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = dbuf_hash(os, obj, level, blkid); - uint64_t idx = hv & h->hash_table_mask; - dmu_buf_impl_t *db; - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { - if (DBUF_EQUAL(db, os, obj, level, blkid)) { - mutex_enter(&db->db_mtx); - if (db->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (db); - } - mutex_exit(&db->db_mtx); - } - } - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (NULL); -} - -static dmu_buf_impl_t * -dbuf_find_bonus(objset_t *os, uint64_t object) -{ - dnode_t *dn; - dmu_buf_impl_t *db = NULL; - - if (dnode_hold(os, object, FTAG, &dn) == 0) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_bonus != NULL) { - db = dn->dn_bonus; - mutex_enter(&db->db_mtx); - } - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - } - return (db); -} - -/* - * Insert an entry into the hash table. If there is already an element - * equal to elem in the hash table, then the already existing element - * will be returned and the new element will not be inserted. - * Otherwise returns NULL. - */ -static dmu_buf_impl_t * -dbuf_hash_insert(dmu_buf_impl_t *db) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - objset_t *os = db->db_objset; - uint64_t obj = db->db.db_object; - int level = db->db_level; - uint64_t blkid, hv, idx; - dmu_buf_impl_t *dbf; - uint32_t i; - - blkid = db->db_blkid; - hv = dbuf_hash(os, obj, level, blkid); - idx = hv & h->hash_table_mask; - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - for (dbf = h->hash_table[idx], i = 0; dbf != NULL; - dbf = dbf->db_hash_next, i++) { - if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { - mutex_enter(&dbf->db_mtx); - if (dbf->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - return (dbf); - } - mutex_exit(&dbf->db_mtx); - } - } - - if (i > 0) { - DBUF_STAT_BUMP(hash_collisions); - if (i == 1) - DBUF_STAT_BUMP(hash_chains); - - DBUF_STAT_MAX(hash_chain_max, i); - } - - mutex_enter(&db->db_mtx); - db->db_hash_next = h->hash_table[idx]; - h->hash_table[idx] = db; - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_inc_64(&dbuf_hash_count); - DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count); - - return (NULL); -} - -/* - * Remove an entry from the hash table. It must be in the EVICTING state. - */ -static void -dbuf_hash_remove(dmu_buf_impl_t *db) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv, idx; - dmu_buf_impl_t *dbf, **dbp; - - hv = dbuf_hash(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid); - idx = hv & h->hash_table_mask; - - /* - * We mustn't hold db_mtx to maintain lock ordering: - * DBUF_HASH_MUTEX > db_mtx. 
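As a side note to the hash-table lookups above, this is a small sketch of how a 64-bit hash value is folded into a bucket index when the table size is a power of two, which is why the low-order bits of the hash need to be well distributed; the stand-in mixer is purely illustrative (the driver uses cityhash4()).

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for cityhash4(); any decent 64-bit mixer works here. */
static uint64_t
mix64(uint64_t x)
{
	x ^= x >> 33;
	x *= 0xff51afd7ed558ccdULL;
	x ^= x >> 33;
	return (x);
}

int
main(void)
{
	uint64_t nbuckets = 1ULL << 16;		/* must be a power of two */
	uint64_t mask = nbuckets - 1;		/* analogue of hash_table_mask */
	uint64_t hv = mix64(12345) ^ mix64(6789); /* pretend (obj, blkid) hash */

	/* Masking keeps only the low-order bits, so they must be well mixed. */
	printf("bucket index = %ju\n", (uintmax_t)(hv & mask));
	return (0);
}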
- */ - ASSERT(zfs_refcount_is_zero(&db->db_holds)); - ASSERT(db->db_state == DB_EVICTING); - ASSERT(!MUTEX_HELD(&db->db_mtx)); - - mutex_enter(DBUF_HASH_MUTEX(h, idx)); - dbp = &h->hash_table[idx]; - while ((dbf = *dbp) != db) { - dbp = &dbf->db_hash_next; - ASSERT(dbf != NULL); - } - *dbp = db->db_hash_next; - db->db_hash_next = NULL; - if (h->hash_table[idx] && - h->hash_table[idx]->db_hash_next == NULL) - DBUF_STAT_BUMPDOWN(hash_chains); - mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_dec_64(&dbuf_hash_count); -} - -typedef enum { - DBVU_EVICTING, - DBVU_NOT_EVICTING -} dbvu_verify_type_t; - -static void -dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) -{ -#ifdef ZFS_DEBUG - int64_t holds; - - if (db->db_user == NULL) - return; - - /* Only data blocks support the attachment of user data. */ - ASSERT(db->db_level == 0); - - /* Clients must resolve a dbuf before attaching user data. */ - ASSERT(db->db.db_data != NULL); - ASSERT3U(db->db_state, ==, DB_CACHED); - - holds = zfs_refcount_count(&db->db_holds); - if (verify_type == DBVU_EVICTING) { - /* - * Immediate eviction occurs when holds == dirtycnt. - * For normal eviction buffers, holds is zero on - * eviction, except when dbuf_fix_old_data() calls - * dbuf_clear_data(). However, the hold count can grow - * during eviction even though db_mtx is held (see - * dmu_bonus_hold() for an example), so we can only - * test the generic invariant that holds >= dirtycnt. - */ - ASSERT3U(holds, >=, db->db_dirtycnt); - } else { - if (db->db_user_immediate_evict == TRUE) - ASSERT3U(holds, >=, db->db_dirtycnt); - else - ASSERT3U(holds, >, 0); - } -#endif -} - -static void -dbuf_evict_user(dmu_buf_impl_t *db) -{ - dmu_buf_user_t *dbu = db->db_user; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (dbu == NULL) - return; - - dbuf_verify_user(db, DBVU_EVICTING); - db->db_user = NULL; - -#ifdef ZFS_DEBUG - if (dbu->dbu_clear_on_evict_dbufp != NULL) - *dbu->dbu_clear_on_evict_dbufp = NULL; -#endif - - /* - * There are two eviction callbacks - one that we call synchronously - * and one that we invoke via a taskq. The async one is useful for - * avoiding lock order reversals and limiting stack depth. - * - * Note that if we have a sync callback but no async callback, - * it's likely that the sync callback will free the structure - * containing the dbu. In that case we need to take care to not - * dereference dbu after calling the sync evict func. - */ - boolean_t has_async = (dbu->dbu_evict_func_async != NULL); - - if (dbu->dbu_evict_func_sync != NULL) - dbu->dbu_evict_func_sync(dbu); - - if (has_async) { - taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, - dbu, 0, &dbu->dbu_tqent); - } -} - -boolean_t -dbuf_is_metadata(dmu_buf_impl_t *db) -{ - if (db->db_level > 0) { - return (B_TRUE); - } else { - boolean_t is_metadata; - - DB_DNODE_ENTER(db); - is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); - DB_DNODE_EXIT(db); - - return (is_metadata); - } -} - -/* - * This returns whether this dbuf should be stored in the metadata cache, which - * is based on whether it's from one of the dnode types that store data related - * to traversing dataset hierarchies. 
- */ -static boolean_t -dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) -{ - DB_DNODE_ENTER(db); - dmu_object_type_t type = DB_DNODE(db)->dn_type; - DB_DNODE_EXIT(db); - - /* Check if this dbuf is one of the types we care about */ - if (DMU_OT_IS_METADATA_CACHED(type)) { - /* If we hit this, then we set something up wrong in dmu_ot */ - ASSERT(DMU_OT_IS_METADATA(type)); - - /* - * Sanity check for small-memory systems: don't allocate too - * much memory for this purpose. - */ - if (zfs_refcount_count( - &dbuf_caches[DB_DBUF_METADATA_CACHE].size) > - dbuf_metadata_cache_max_bytes) { - dbuf_metadata_cache_overflow++; - DTRACE_PROBE1(dbuf__metadata__cache__overflow, - dmu_buf_impl_t *, db); - return (B_FALSE); - } - - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * This function *must* return indices evenly distributed between all - * sublists of the multilist. This is needed due to how the dbuf eviction - * code is laid out; dbuf_evict_thread() assumes dbufs are evenly - * distributed between all sublists and uses this assumption when - * deciding which sublist to evict from and how much to evict from it. - */ -unsigned int -dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) -{ - dmu_buf_impl_t *db = obj; - - /* - * The assumption here, is the hash value for a given - * dmu_buf_impl_t will remain constant throughout it's lifetime - * (i.e. it's objset, object, level and blkid fields don't change). - * Thus, we don't need to store the dbuf's sublist index - * on insertion, as this index can be recalculated on removal. - * - * Also, the low order bits of the hash value are thought to be - * distributed evenly. Otherwise, in the case that the multilist - * has a power of two number of sublists, each sublists' usage - * would not be evenly distributed. - */ - return (dbuf_hash(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid) % - multilist_get_num_sublists(ml)); -} - -static inline unsigned long -dbuf_cache_target_bytes(void) -{ - return MIN(dbuf_cache_max_bytes, - arc_max_bytes() >> dbuf_cache_shift); -} - -static inline uint64_t -dbuf_cache_hiwater_bytes(void) -{ - uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); - return (dbuf_cache_target + - (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100); -} - -static inline uint64_t -dbuf_cache_lowater_bytes(void) -{ - uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); - return (dbuf_cache_target - - (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); -} - -static inline boolean_t -dbuf_cache_above_lowater(void) -{ - return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > - dbuf_cache_lowater_bytes()); -} - -/* - * Evict the oldest eligible dbuf from the dbuf cache. 
- */ -static void -dbuf_evict_one(void) -{ - int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( - dbuf_caches[DB_DBUF_CACHE].cache, idx); - - ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); - - dmu_buf_impl_t *db = multilist_sublist_tail(mls); - while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { - db = multilist_sublist_prev(mls, db); - } - - DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, - multilist_sublist_t *, mls); - - if (db != NULL) { - multilist_sublist_remove(mls, db); - multilist_sublist_unlock(mls); - (void) zfs_refcount_remove_many( - &dbuf_caches[DB_DBUF_CACHE].size, - db->db.db_size, db); - DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); - ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); - db->db_caching_status = DB_NO_CACHE; - dbuf_destroy(db); - DBUF_STAT_BUMP(cache_total_evicts); - } else { - multilist_sublist_unlock(mls); - } -} - -/* - * The dbuf evict thread is responsible for aging out dbufs from the - * cache. Once the cache has reached it's maximum size, dbufs are removed - * and destroyed. The eviction thread will continue running until the size - * of the dbuf cache is at or below the maximum size. Once the dbuf is aged - * out of the cache it is destroyed and becomes eligible for arc eviction. - */ -/* ARGSUSED */ -static void -dbuf_evict_thread(void *unused __unused) -{ - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); - - mutex_enter(&dbuf_evict_lock); - while (!dbuf_evict_thread_exit) { - while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_hires(&dbuf_evict_cv, - &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); - CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); -#ifdef __FreeBSD__ - if (dbuf_ksp != NULL) - dbuf_ksp->ks_update(dbuf_ksp, KSTAT_READ); -#endif - } - mutex_exit(&dbuf_evict_lock); - - /* - * Keep evicting as long as we're above the low water mark - * for the cache. We do this without holding the locks to - * minimize lock contention. - */ - while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { - dbuf_evict_one(); - } - - mutex_enter(&dbuf_evict_lock); - } - - dbuf_evict_thread_exit = B_FALSE; - cv_broadcast(&dbuf_evict_cv); - CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ - thread_exit(); -} - -/* - * Wake up the dbuf eviction thread if the dbuf cache is at its max size. - * If the dbuf cache is at its high water mark, then evict a dbuf from the - * dbuf cache using the callers context. - */ -static void -dbuf_evict_notify(uint64_t size) -{ - /* - * We check if we should evict without holding the dbuf_evict_lock, - * because it's OK to occasionally make the wrong decision here, - * and grabbing the lock results in massive lock contention. 
- */ - if (size > dbuf_cache_max_bytes) { - if (size > dbuf_cache_hiwater_bytes()) - dbuf_evict_one(); - cv_signal(&dbuf_evict_cv); - } -} - -static int -dbuf_kstat_update(kstat_t *ksp, int rw) -{ - dbuf_stats_t *ds = ksp->ks_data; - - if (rw == KSTAT_WRITE) { - return (SET_ERROR(EACCES)); - } else { - ds->metadata_cache_size_bytes.value.ui64 = - zfs_refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size); - ds->cache_size_bytes.value.ui64 = - zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); - ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); - ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); - ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); - ds->hash_elements.value.ui64 = dbuf_hash_count; - } - - return (0); -} - -void -dbuf_init(void) -{ - uint64_t hsize = 1ULL << 16; - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - /* - * The hash table is big enough to fill all of physical memory - * with an average 4K block size. The table will take up - * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). - */ - while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) - hsize <<= 1; - -retry: - h->hash_table_mask = hsize - 1; - h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); - if (h->hash_table == NULL) { - /* XXX - we should really return an error instead of assert */ - ASSERT(hsize > (1ULL << 10)); - hsize >>= 1; - goto retry; - } - - dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", - sizeof (dmu_buf_impl_t), - 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); - - dbuf_stats_init(h); - /* - * Setup the parameters for the dbuf caches. We set the sizes of the - * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) - * of the size of the ARC, respectively. If the values are set in - * /etc/system and they're not greater than the size of the ARC, then - * we honor that value. - */ - if (dbuf_cache_max_bytes == 0 || - dbuf_cache_max_bytes >= arc_max_bytes()) { - dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; - } - if (dbuf_metadata_cache_max_bytes == 0 || - dbuf_metadata_cache_max_bytes >= arc_max_bytes()) { - dbuf_metadata_cache_max_bytes = - arc_max_bytes() >> dbuf_metadata_cache_shift; - } - - /* - * All entries are queued via taskq_dispatch_ent(), so min/maxalloc - * configuration is not required. 
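For the hash-table sizing comment above ("2MB/GB with 8-byte pointers"), here is a short sketch reproducing that arithmetic for an assumed 16 GiB of physical memory; the figures are examples only.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t physmem_bytes = 16ULL << 30;	/* assume 16 GiB of RAM */
	uint64_t hsize = 1ULL << 16;

	/*
	 * Same growth rule as dbuf_init(): roughly one bucket per 4K of
	 * memory, rounded up to a power of two.
	 */
	while (hsize * 4096 < physmem_bytes)
		hsize <<= 1;

	/* With 8-byte pointers this works out to about 2 MiB per GiB of RAM. */
	uint64_t table_bytes = hsize * sizeof (void *);
	printf("buckets=%ju table=%ju MiB (%ju MiB per GiB)\n",
	    (uintmax_t)hsize, (uintmax_t)(table_bytes >> 20),
	    (uintmax_t)(table_bytes >> 20) / (physmem_bytes >> 30));
	return (0);
}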
- */ - dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); - - for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { - dbuf_caches[dcs].cache = - multilist_create(sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_cache_link), - dbuf_cache_multilist_index_func); - zfs_refcount_create(&dbuf_caches[dcs].size); - } - - dbuf_evict_thread_exit = B_FALSE; - mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); - dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, - NULL, 0, &p0, TS_RUN, minclsyspri); - - dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", - KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (dbuf_ksp != NULL) { - for (i = 0; i < DN_MAX_LEVELS; i++) { - snprintf(dbuf_stats.cache_levels[i].name, - KSTAT_STRLEN, "cache_level_%d", i); - dbuf_stats.cache_levels[i].data_type = - KSTAT_DATA_UINT64; - snprintf(dbuf_stats.cache_levels_bytes[i].name, - KSTAT_STRLEN, "cache_level_%d_bytes", i); - dbuf_stats.cache_levels_bytes[i].data_type = - KSTAT_DATA_UINT64; - } - dbuf_ksp->ks_data = &dbuf_stats; - dbuf_ksp->ks_update = dbuf_kstat_update; - kstat_install(dbuf_ksp); - } -} - -void -dbuf_fini(void) -{ - dbuf_hash_table_t *h = &dbuf_hash_table; - int i; - - dbuf_stats_destroy(); - - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_destroy(&h->hash_mutexes[i]); - kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); - kmem_cache_destroy(dbuf_kmem_cache); - taskq_destroy(dbu_evict_taskq); - - mutex_enter(&dbuf_evict_lock); - dbuf_evict_thread_exit = B_TRUE; - while (dbuf_evict_thread_exit) { - cv_signal(&dbuf_evict_cv); - cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); - } - mutex_exit(&dbuf_evict_lock); - - mutex_destroy(&dbuf_evict_lock); - cv_destroy(&dbuf_evict_cv); - - for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { - zfs_refcount_destroy(&dbuf_caches[dcs].size); - multilist_destroy(dbuf_caches[dcs].cache); - } - - if (dbuf_ksp != NULL) { - kstat_delete(dbuf_ksp); - dbuf_ksp = NULL; - } -} - -/* - * Other stuff. - */ - -#ifdef ZFS_DEBUG -static void -dbuf_verify(dmu_buf_impl_t *db) -{ - dnode_t *dn; - dbuf_dirty_record_t *dr; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) - return; - - ASSERT(db->db_objset != NULL); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn == NULL) { - ASSERT(db->db_parent == NULL); - ASSERT(db->db_blkptr == NULL); - } else { - ASSERT3U(db->db.db_object, ==, dn->dn_object); - ASSERT3P(db->db_objset, ==, dn->dn_objset); - ASSERT3U(db->db_level, <, dn->dn_nlevels); - ASSERT(db->db_blkid == DMU_BONUS_BLKID || - db->db_blkid == DMU_SPILL_BLKID || - !avl_is_empty(&dn->dn_dbufs)); - } - if (db->db_blkid == DMU_BONUS_BLKID) { - ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); - } else if (db->db_blkid == DMU_SPILL_BLKID) { - ASSERT(dn != NULL); - ASSERT0(db->db.db_offset); - } else { - ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); - } - - for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); - - for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) - ASSERT(dr->dr_dbuf == db); - - /* - * We can't assert that db_size matches dn_datablksz because it - * can be momentarily different when another thread is doing - * dnode_set_blksz(). 
- */ - if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { - dr = db->db_data_pending; - /* - * It should only be modified in syncing context, so - * make sure we only have one copy of the data. - */ - ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); - } - - /* verify db->db_blkptr */ - if (db->db_blkptr) { - if (db->db_parent == dn->dn_dbuf) { - /* db is pointed to by the dnode */ - /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ - if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) - ASSERT(db->db_parent == NULL); - else - ASSERT(db->db_parent != NULL); - if (db->db_blkid != DMU_SPILL_BLKID) - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - } else { - /* db is pointed to by an indirect block */ - int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; - ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); - ASSERT3U(db->db_parent->db.db_object, ==, - db->db.db_object); - /* - * dnode_grow_indblksz() can make this fail if we don't - * have the struct_rwlock. XXX indblksz no longer - * grows. safe to do this now? - */ - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - ASSERT3P(db->db_blkptr, ==, - ((blkptr_t *)db->db_parent->db.db_data + - db->db_blkid % epb)); - } - } - } - if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && - (db->db_buf == NULL || db->db_buf->b_data) && - db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && - db->db_state != DB_FILL && !dn->dn_free_txg) { - /* - * If the blkptr isn't set but they have nonzero data, - * it had better be dirty, otherwise we'll lose that - * data when we evict this buffer. - * - * There is an exception to this rule for indirect blocks; in - * this case, if the indirect block is a hole, we fill in a few - * fields on each of the child blocks (importantly, birth time) - * to prevent hole birth times from being lost when you - * partially fill in a hole. - */ - if (db->db_dirtycnt == 0) { - if (db->db_level == 0) { - uint64_t *buf = db->db.db_data; - int i; - - for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); - } - } else { - blkptr_t *bps = db->db.db_data; - ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, - db->db.db_size); - /* - * We want to verify that all the blkptrs in the - * indirect block are holes, but we may have - * automatically set up a few fields for them. - * We iterate through each blkptr and verify - * they only have those fields set. - */ - for (int i = 0; - i < db->db.db_size / sizeof (blkptr_t); - i++) { - blkptr_t *bp = &bps[i]; - ASSERT(ZIO_CHECKSUM_IS_ZERO( - &bp->blk_cksum)); - ASSERT( - DVA_IS_EMPTY(&bp->blk_dva[0]) && - DVA_IS_EMPTY(&bp->blk_dva[1]) && - DVA_IS_EMPTY(&bp->blk_dva[2])); - ASSERT0(bp->blk_fill); - ASSERT0(bp->blk_pad[0]); - ASSERT0(bp->blk_pad[1]); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(BP_IS_HOLE(bp)); - ASSERT0(bp->blk_phys_birth); - } - } - } - } - DB_DNODE_EXIT(db); -} -#endif - -static void -dbuf_clear_data(dmu_buf_impl_t *db) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - dbuf_evict_user(db); - ASSERT3P(db->db_buf, ==, NULL); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; -} - -static void -dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(buf != NULL); - - db->db_buf = buf; - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; -} - -/* - * Loan out an arc_buf for read. Return the loaned arc_buf. 
- */ -arc_buf_t * -dbuf_loan_arcbuf(dmu_buf_impl_t *db) -{ - arc_buf_t *abuf; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - mutex_enter(&db->db_mtx); - if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) { - int blksz = db->db.db_size; - spa_t *spa = db->db_objset->os_spa; - - mutex_exit(&db->db_mtx); - abuf = arc_loan_buf(spa, B_FALSE, blksz); - bcopy(db->db.db_data, abuf->b_data, blksz); - } else { - abuf = db->db_buf; - arc_loan_inuse_buf(abuf, db); - db->db_buf = NULL; - dbuf_clear_data(db); - mutex_exit(&db->db_mtx); - } - return (abuf); -} - -/* - * Calculate which level n block references the data at the level 0 offset - * provided. - */ -uint64_t -dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) -{ - if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { - /* - * The level n blkid is equal to the level 0 blkid divided by - * the number of level 0s in a level n block. - * - * The level 0 blkid is offset >> datablkshift = - * offset / 2^datablkshift. - * - * The number of level 0s in a level n is the number of block - * pointers in an indirect block, raised to the power of level. - * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = - * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). - * - * Thus, the level n blkid is: offset / - * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) - * = offset / 2^(datablkshift + level * - * (indblkshift - SPA_BLKPTRSHIFT)) - * = offset >> (datablkshift + level * - * (indblkshift - SPA_BLKPTRSHIFT)) - */ - return (offset >> (dn->dn_datablkshift + level * - (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); - } else { - ASSERT3U(offset, <, dn->dn_datablksz); - return (0); - } -} - -static void -dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - - mutex_enter(&db->db_mtx); - ASSERT3U(db->db_state, ==, DB_READ); - /* - * All reads are synchronous, so we must have a hold on the dbuf - */ - ASSERT(zfs_refcount_count(&db->db_holds) > 0); - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - if (buf == NULL) { - /* i/o error */ - ASSERT(zio == NULL || zio->io_error != 0); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); - db->db_state = DB_UNCACHED; - } else if (db->db_level == 0 && db->db_freed_in_flight) { - /* freed in flight */ - ASSERT(zio == NULL || zio->io_error == 0); - if (buf == NULL) { - buf = arc_alloc_buf(db->db_objset->os_spa, - db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); - } - arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); - arc_buf_freeze(buf); - db->db_freed_in_flight = FALSE; - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; - } else { - /* success */ - ASSERT(zio == NULL || zio->io_error == 0); - dbuf_set_data(db, buf); - db->db_state = DB_CACHED; - } - cv_broadcast(&db->db_changed); - dbuf_rele_and_unlock(db, NULL, B_FALSE); -} - -static void -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) -{ - dnode_t *dn; - zbookmark_phys_t zb; - arc_flags_t aflags = ARC_FLAG_NOWAIT; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. */ - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_UNCACHED); - ASSERT(db->db_buf == NULL); - - if (db->db_blkid == DMU_BONUS_BLKID) { - /* - * The bonus length stored in the dnode may be less than - * the maximum available space in the bonus buffer. 
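As a worked instance of the dbuf_whichblock() shift derivation above, assume 128K data blocks and 128K indirect blocks, so datablkshift = indblkshift = 17 and epbs = indblkshift - SPA_BLKPTRSHIFT = 10; the concrete offset and level below are illustrative only.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int datablkshift = 17;			/* 128K data blocks (assumed) */
	int indblkshift = 17;			/* 128K indirect blocks (assumed) */
	int blkptrshift = 7;			/* sizeof (blkptr_t) == 128 */
	int epbs = indblkshift - blkptrshift;	/* 1024 block pointers per indirect */

	uint64_t offset = 1ULL << 30;		/* byte offset 1 GiB into the object */
	int level = 1;

	/* level-n blkid = offset >> (datablkshift + level * epbs) */
	uint64_t l0 = offset >> datablkshift;
	uint64_t l1 = offset >> (datablkshift + level * epbs);

	/* 8192 level-0 blocks, 1024 of them per level-1 block, so blkid 8. */
	printf("level-0 blkid = %ju, level-1 blkid = %ju\n",
	    (uintmax_t)l0, (uintmax_t)l1);
	return (0);
}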
- */ - int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); - int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - - ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = zio_buf_alloc(max_bonuslen); - arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); - if (bonuslen < max_bonuslen) - bzero(db->db.db_data, max_bonuslen); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); - DB_DNODE_EXIT(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return; - } - - /* - * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() - * processes the delete record and clears the bp while we are waiting - * for the dn_mtx (resulting in a "no" from block_freed). - */ - if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || - (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || - BP_IS_HOLE(db->db_blkptr)))) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, - db->db.db_size)); - bzero(db->db.db_data, db->db.db_size); - - if (db->db_blkptr != NULL && db->db_level > 0 && - BP_IS_HOLE(db->db_blkptr) && - db->db_blkptr->blk_birth != 0) { - blkptr_t *bps = db->db.db_data; - for (int i = 0; i < ((1 << - DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); - i++) { - blkptr_t *bp = &bps[i]; - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - 1 << dn->dn_indblkshift); - BP_SET_LSIZE(bp, - BP_GET_LEVEL(db->db_blkptr) == 1 ? - dn->dn_datablksz : - BP_GET_LSIZE(db->db_blkptr)); - BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); - BP_SET_LEVEL(bp, - BP_GET_LEVEL(db->db_blkptr) - 1); - BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); - } - } - DB_DNODE_EXIT(db); - db->db_state = DB_CACHED; - mutex_exit(&db->db_mtx); - return; - } - - DB_DNODE_EXIT(db); - - db->db_state = DB_READ; - mutex_exit(&db->db_mtx); - - if (DBUF_IS_L2CACHEABLE(db)) - aflags |= ARC_FLAG_L2CACHE; - - SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? - db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, - db->db.db_object, db->db_level, db->db_blkid); - - dbuf_add_ref(db, NULL); - - (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, - dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, - (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, - &aflags, &zb); -} - -/* - * This is our just-in-time copy function. It makes a copy of buffers that - * have been modified in a previous transaction group before we access them in - * the current active group. - * - * This function is used in three places: when we are dirtying a buffer for the - * first time in a txg, when we are freeing a range in a dnode that includes - * this buffer, and when we are accessing a buffer which was received compressed - * and later referenced in a WRITE_BYREF record. - * - * Note that when we are called from dbuf_free_range() we do not put a hold on - * the buffer, we just traverse the active dbuf list for the dnode. - */ -static void -dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) -{ - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - - if (dr == NULL || - (dr->dt.dl.dr_data != - ((db->db_blkid == DMU_BONUS_BLKID) ? 
db->db.db_data : db->db_buf))) - return; - - /* - * If the last dirty record for this dbuf has not yet synced - * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, - * or (if there a no active holders) - * just null out the current db_data pointer. - */ - ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DMU_BONUS_BLKID) { - /* Note that the data bufs here are zio_bufs */ - dnode_t *dn = DB_DNODE(db); - int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); - arc_space_consume(bonuslen, ARC_SPACE_BONUS); - bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); - } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = arc_buf_size(db->db_buf); - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa = db->db_objset->os_spa; - enum zio_compress compress_type = - arc_get_compression(db->db_buf); - - if (compress_type == ZIO_COMPRESS_OFF) { - dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, - size, arc_buf_lsize(db->db_buf), compress_type); - } - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); - } else { - db->db_buf = NULL; - dbuf_clear_data(db); - } -} - -int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) -{ - int err = 0; - boolean_t prefetch; - dnode_t *dn; - - /* - * We don't have to hold the mutex to check db_state because it - * can't be freed while we have a hold on the buffer. - */ - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - - if (db->db_state == DB_NOFILL) - return (SET_ERROR(EIO)); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && - DBUF_IS_CACHEABLE(db); - - mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { - /* - * If the arc buf is compressed, we need to decompress it to - * read the data. This could happen during the "zfs receive" of - * a stream which is compressed and deduplicated. - */ - if (db->db_buf != NULL && - arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { - dbuf_fix_old_data(db, - spa_syncing_txg(dmu_objset_spa(db->db_objset))); - err = arc_decompress(db->db_buf); - dbuf_set_data(db, db->db_buf); - } - mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_hits); - } else if (db->db_state == DB_UNCACHED) { - spa_t *spa = dn->dn_objset->os_spa; - boolean_t need_wait = B_FALSE; - - if (zio == NULL && - db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - need_wait = B_TRUE; - } - dbuf_read_impl(db, zio, flags); - - /* dbuf_read_impl has dropped db_mtx for us */ - - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - if (need_wait) - err = zio_wait(zio); - } else { - /* - * Another reader came in while the dbuf was in flight - * between UNCACHED and CACHED. Either a writer will finish - * writing the buffer (sending the dbuf to CACHED) or the - * first reader's request will reach the read_done callback - * and send the dbuf to CACHED. 
Otherwise, a failure - * occurred and the dbuf went to UNCACHED. - */ - mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); - - /* Skip the wait per the caller's request. */ - mutex_enter(&db->db_mtx); - if ((flags & DB_RF_NEVERWAIT) == 0) { - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { - ASSERT(db->db_state == DB_READ || - (flags & DB_RF_HAVESTRUCT) == 0); - DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, zio); - cv_wait(&db->db_changed, &db->db_mtx); - } - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - } - mutex_exit(&db->db_mtx); - } - - return (err); -} - -static void -dbuf_noread(dmu_buf_impl_t *db) -{ - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa = db->db_objset->os_spa; - - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size)); - db->db_state = DB_FILL; - } else if (db->db_state == DB_NOFILL) { - dbuf_clear_data(db); - } else { - ASSERT3U(db->db_state, ==, DB_CACHED); - } - mutex_exit(&db->db_mtx); -} - -void -dbuf_unoverride(dbuf_dirty_record_t *dr) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - blkptr_t *bp = &dr->dt.dl.dr_overridden_by; - uint64_t txg = dr->dr_txg; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - /* - * This assert is valid because dmu_sync() expects to be called by - * a zilog's get_data while holding a range lock. This call only - * comes from dbuf_dirty() callers who must also hold a range lock. - */ - ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); - ASSERT(db->db_level == 0); - - if (db->db_blkid == DMU_BONUS_BLKID || - dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) - return; - - ASSERT(db->db_data_pending != dr); - - /* free this block */ - if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) - zio_free(db->db_objset->os_spa, txg, bp); - - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - dr->dt.dl.dr_nopwrite = B_FALSE; - - /* - * Release the already-written buffer, so we leave it in - * a consistent dirty state. Note that all callers are - * modifying the buffer, so they will immediately do - * another (redundant) arc_release(). Therefore, leave - * the buf thawed to save the effort of freezing & - * immediately re-thawing it. - */ - arc_release(dr->dt.dl.dr_data, db); -} - -/* - * Evict (if its unreferenced) or clear (if its referenced) any level-0 - * data blocks in the free range, so that any future readers will find - * empty blocks. 
- */ -void -dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, - dmu_tx_t *tx) -{ - dmu_buf_impl_t db_search; - dmu_buf_impl_t *db, *db_next; - uint64_t txg = tx->tx_txg; - avl_index_t where; - - if (end_blkid > dn->dn_maxblkid && - !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) - end_blkid = dn->dn_maxblkid; - dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); - - db_search.db_level = 0; - db_search.db_blkid = start_blkid; - db_search.db_state = DB_SEARCH; - - mutex_enter(&dn->dn_dbufs_mtx); - db = avl_find(&dn->dn_dbufs, &db_search, &where); - ASSERT3P(db, ==, NULL); - - db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); - - for (; db != NULL; db = db_next) { - db_next = AVL_NEXT(&dn->dn_dbufs, db); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - - if (db->db_level != 0 || db->db_blkid > end_blkid) { - break; - } - ASSERT3U(db->db_blkid, >=, start_blkid); - - /* found a level 0 buffer in the range */ - mutex_enter(&db->db_mtx); - if (dbuf_undirty(db, tx)) { - /* mutex has been dropped and dbuf destroyed */ - continue; - } - - if (db->db_state == DB_UNCACHED || - db->db_state == DB_NOFILL || - db->db_state == DB_EVICTING) { - ASSERT(db->db.db_data == NULL); - mutex_exit(&db->db_mtx); - continue; - } - if (db->db_state == DB_READ || db->db_state == DB_FILL) { - /* will be handled in dbuf_read_done or dbuf_rele */ - db->db_freed_in_flight = TRUE; - mutex_exit(&db->db_mtx); - continue; - } - if (zfs_refcount_count(&db->db_holds) == 0) { - ASSERT(db->db_buf); - dbuf_destroy(db); - continue; - } - /* The dbuf is referenced */ - - if (db->db_last_dirty != NULL) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - - if (dr->dr_txg == txg) { - /* - * This buffer is "in-use", re-adjust the file - * size to reflect that this buffer may - * contain new data when we sync. - */ - if (db->db_blkid != DMU_SPILL_BLKID && - db->db_blkid > dn->dn_maxblkid) - dn->dn_maxblkid = db->db_blkid; - dbuf_unoverride(dr); - } else { - /* - * This dbuf is not dirty in the open context. - * Either uncache it (if its not referenced in - * the open context) or reset its contents to - * empty. - */ - dbuf_fix_old_data(db, txg); - } - } - /* clear the contents if its cached */ - if (db->db_state == DB_CACHED) { - ASSERT(db->db.db_data != NULL); - arc_release(db->db_buf, db); - bzero(db->db.db_data, db->db.db_size); - arc_buf_freeze(db->db_buf); - } - - mutex_exit(&db->db_mtx); - } - mutex_exit(&dn->dn_dbufs_mtx); -} - -void -dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) -{ - arc_buf_t *buf, *obuf; - int osize = db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dnode_t *dn; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - /* - * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held - * is OK, because there can be no other references to the db - * when we are changing its size, so no concurrent DB_FILL can - * be happening. 
- */ - /* - * XXX we should be doing a dbuf_read, checking the return - * value and returning that up to our callers - */ - dmu_buf_will_dirty(&db->db, tx); - - /* create the data buffer for the new block */ - buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); - - /* copy old block data to the new block */ - obuf = db->db_buf; - bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); - /* zero the remainder */ - if (size > osize) - bzero((uint8_t *)buf->b_data + osize, size - osize); - - mutex_enter(&db->db_mtx); - dbuf_set_data(db, buf); - arc_buf_destroy(obuf, db); - db->db.db_size = size; - - if (db->db_level == 0) { - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - db->db_last_dirty->dt.dl.dr_data = buf; - } - mutex_exit(&db->db_mtx); - - dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); - DB_DNODE_EXIT(db); -} - -void -dbuf_release_bp(dmu_buf_impl_t *db) -{ - objset_t *os = db->db_objset; - - ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); - ASSERT(arc_released(os->os_phys_buf) || - list_link_active(&os->os_dsl_dataset->ds_synced_link)); - ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); - - (void) arc_release(db->db_buf, db); -} - -/* - * We already have a dirty record for this TXG, and we are being - * dirtied again. - */ -static void -dbuf_redirty(dbuf_dirty_record_t *dr) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this buffer has already been written out, - * we now need to reset its state. - */ - dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) { - /* Already released on initial dirty, so just thaw. */ - ASSERT(arc_released(db->db_buf)); - arc_buf_thaw(db->db_buf); - } - } -} - -dbuf_dirty_record_t * -dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dnode_t *dn; - objset_t *os; - dbuf_dirty_record_t **drp, *dr; - int drop_struct_lock = FALSE; - int txgoff = tx->tx_txg & TXG_MASK; - - ASSERT(tx->tx_txg != 0); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - DMU_TX_DIRTY_BUF(tx, db); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - /* - * Shouldn't dirty a regular buffer in syncing context. Private - * objects may be dirtied in syncing context, but only if they - * were already pre-dirtied in open context. - */ -#ifdef DEBUG - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - RW_READER, FTAG); - } - ASSERT(!dmu_tx_is_syncing(tx) || - BP_IS_HOLE(dn->dn_objset->os_rootbp) || - DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - dn->dn_objset->os_dsl_dataset == NULL); - if (dn->dn_objset->os_dsl_dataset != NULL) - rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); -#endif - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - - mutex_enter(&db->db_mtx); - /* - * XXX make this true for indirects too? The problem is that - * transactions created with dmu_tx_create_assigned() from - * syncing context don't bother holding ahead. 
- */ - ASSERT(db->db_level != 0 || - db->db_state == DB_CACHED || db->db_state == DB_FILL || - db->db_state == DB_NOFILL); - - mutex_enter(&dn->dn_mtx); - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. - */ - if (dn->dn_dirtyctx == DN_UNDIRTIED) { - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - RW_READER, FTAG); - } - if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? - DN_DIRTY_SYNC : DN_DIRTY_OPEN); - ASSERT(dn->dn_dirtyctx_firstset == NULL); - dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); - } - if (dn->dn_objset->os_dsl_dataset != NULL) { - rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, - FTAG); - } - } - - if (tx->tx_txg > dn->dn_dirty_txg) - dn->dn_dirty_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - - if (db->db_blkid == DMU_SPILL_BLKID) - dn->dn_have_spill = B_TRUE; - - /* - * If this buffer is already dirty, we're done. - */ - drp = &db->db_last_dirty; - ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || - db->db.db_object == DMU_META_DNODE_OBJECT); - while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) - drp = &dr->dr_next; - if (dr && dr->dr_txg == tx->tx_txg) { - DB_DNODE_EXIT(db); - - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return (dr); - } - - /* - * Only valid if not already dirty. - */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - - ASSERT3U(dn->dn_nlevels, >, db->db_level); - - /* - * We should only be dirtying in syncing context if it's the - * mos or we're initializing the os or it's a special object. - * However, we are allowed to dirty in syncing context provided - * we already dirtied it in open context. Hence we must make - * this assertion only if we're not already dirty. - */ - os = dn->dn_objset; - VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); -#ifdef DEBUG - if (dn->dn_objset->os_dsl_dataset != NULL) - rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); - ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || - os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); - if (dn->dn_objset->os_dsl_dataset != NULL) - rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); -#endif - ASSERT(db->db.db_size != 0); - - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - - if (db->db_blkid != DMU_BONUS_BLKID) { - dmu_objset_willuse_space(os, db->db.db_size, tx); - } - - /* - * If this buffer is dirty in an old transaction group we need - * to make a copy of it so that the changes we make in this - * transaction group won't leak out when we sync the older txg. - */ - dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); - list_link_init(&dr->dr_dirty_node); - if (db->db_level == 0) { - void *data_old = db->db_buf; - - if (db->db_state != DB_NOFILL) { - if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; - } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { - /* - * Release the data buffer from the cache so - * that we can modify it without impacting - * possible other users of this cached data - * block. Note that indirect blocks and - * private objects are not released until the - * syncing state (since they are only modified - * then). 
- */ - arc_release(db->db_buf, db); - dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db_buf; - } - ASSERT(data_old != NULL); - } - dr->dt.dl.dr_data = data_old; - } else { - mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&dr->dt.di.dr_children, - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - } - if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) - dr->dr_accounted = db->db.db_size; - dr->dr_dbuf = db; - dr->dr_txg = tx->tx_txg; - dr->dr_next = *drp; - *drp = dr; - - /* - * We could have been freed_in_flight between the dbuf_noread - * and dbuf_dirty. We win, as though the dbuf_noread() had - * happened after the free. - */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - db->db_blkid != DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - if (dn->dn_free_ranges[txgoff] != NULL) { - range_tree_clear(dn->dn_free_ranges[txgoff], - db->db_blkid, 1); - } - mutex_exit(&dn->dn_mtx); - db->db_freed_in_flight = FALSE; - } - - /* - * This buffer is now part of this txg - */ - dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); - db->db_dirtycnt += 1; - ASSERT3U(db->db_dirtycnt, <=, 3); - - mutex_exit(&db->db_mtx); - - if (db->db_blkid == DMU_BONUS_BLKID || - db->db_blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - dnode_setdirty(dn, tx); - DB_DNODE_EXIT(db); - return (dr); - } - - /* - * The dn_struct_rwlock prevents db_blkptr from changing - * due to a write from syncing context completing - * while we are running, so we want to acquire it before - * looking at db_blkptr. - */ - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - - /* - * We need to hold the dn_struct_rwlock to make this assertion, - * because it protects dn_phys / dn_next_nlevels from changing. - */ - ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || - dn->dn_phys->dn_nlevels > db->db_level || - dn->dn_next_nlevels[txgoff] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || - dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); - - /* - * If we are overwriting a dedup BP, then unless it is snapshotted, - * when we get to syncing context we will need to decrement its - * refcount in the DDT. Prefetch the relevant DDT block so that - * syncing context won't have to wait for the i/o. - */ - ddt_prefetch(os->os_spa, db->db_blkptr); - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); - ASSERT(dn->dn_maxblkid >= db->db_blkid); - } - - if (db->db_level+1 < dn->dn_nlevels) { - dmu_buf_impl_t *parent = db->db_parent; - dbuf_dirty_record_t *di; - int parent_held = FALSE; - - if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - parent = dbuf_hold_level(dn, db->db_level+1, - db->db_blkid >> epbs, FTAG); - ASSERT(parent != NULL); - parent_held = TRUE; - } - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - ASSERT3U(db->db_level+1, ==, parent->db_level); - di = dbuf_dirty(parent, tx); - if (parent_held) - dbuf_rele(parent, FTAG); - - mutex_enter(&db->db_mtx); - /* - * Since we've dropped the mutex, it's possible that - * dbuf_undirty() might have changed this out from under us. 
- */ - if (db->db_last_dirty == dr || - dn->dn_object == DMU_META_DNODE_OBJECT) { - mutex_enter(&di->dt.di.dr_mtx); - ASSERT3U(di->dr_txg, ==, tx->tx_txg); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&di->dt.di.dr_children, dr); - mutex_exit(&di->dt.di.dr_mtx); - dr->dr_parent = di; - } - mutex_exit(&db->db_mtx); - } else { - ASSERT(db->db_level+1 == dn->dn_nlevels); - ASSERT(db->db_blkid < dn->dn_nblkptr); - ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); - mutex_enter(&dn->dn_mtx); - ASSERT(!list_link_active(&dr->dr_dirty_node)); - list_insert_tail(&dn->dn_dirty_records[txgoff], dr); - mutex_exit(&dn->dn_mtx); - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); - } - - dnode_setdirty(dn, tx); - DB_DNODE_EXIT(db); - return (dr); -} - -/* - * Undirty a buffer in the transaction group referenced by the given - * transaction. Return whether this evicted the dbuf. - */ -static boolean_t -dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dnode_t *dn; - uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr, **drp; - - ASSERT(txg != 0); - - /* - * Due to our use of dn_nlevels below, this can only be called - * in open context, unless we are operating on the MOS. - * From syncing context, dn_nlevels may be different from the - * dn_nlevels used when dbuf was dirtied. - */ - ASSERT(db->db_objset == - dmu_objset_pool(db->db_objset)->dp_meta_objset || - txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT0(db->db_level); - ASSERT(MUTEX_HELD(&db->db_mtx)); - - /* - * If this buffer is not dirty, we're done. - */ - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) - if (dr->dr_txg <= txg) - break; - if (dr == NULL || dr->dr_txg < txg) - return (B_FALSE); - ASSERT(dr->dr_txg == txg); - ASSERT(dr->dr_dbuf == db); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - - ASSERT(db->db.db_size != 0); - - dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), - dr->dr_accounted, txg); - - *drp = dr->dr_next; - - /* - * Note that there are three places in dbuf_dirty() - * where this dirty record may be put on a list. - * Make sure to do a list_remove corresponding to - * every one of those list_insert calls. 
- */ - if (dr->dr_parent) { - mutex_enter(&dr->dr_parent->dt.di.dr_mtx); - list_remove(&dr->dr_parent->dt.di.dr_children, dr); - mutex_exit(&dr->dr_parent->dt.di.dr_mtx); - } else if (db->db_blkid == DMU_SPILL_BLKID || - db->db_level + 1 == dn->dn_nlevels) { - ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); - mutex_enter(&dn->dn_mtx); - list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); - mutex_exit(&dn->dn_mtx); - } - DB_DNODE_EXIT(db); - - if (db->db_state != DB_NOFILL) { - dbuf_unoverride(dr); - - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) - arc_buf_destroy(dr->dt.dl.dr_data, db); - } - - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - - if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); - dbuf_destroy(db); - return (B_TRUE); - } - - return (B_FALSE); -} - -void -dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; - - ASSERT(tx->tx_txg != 0); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - - /* - * Quick check for dirtyness. For already dirty blocks, this - * reduces runtime of this function by >90%, and overall performance - * by 50% for some workloads (e.g. file deletion with indirect blocks - * cached). - */ - mutex_enter(&db->db_mtx); - dbuf_dirty_record_t *dr; - for (dr = db->db_last_dirty; - dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { - /* - * It's possible that it is already dirty but not cached, - * because there are some calls to dbuf_dirty() that don't - * go through dmu_buf_will_dirty(). - */ - if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { - /* This dbuf is already dirty and cached. */ - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return; - } - } - mutex_exit(&db->db_mtx); - - DB_DNODE_ENTER(db); - if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; - DB_DNODE_EXIT(db); - (void) dbuf_read(db, NULL, rf); - (void) dbuf_dirty(db, tx); -} - -void -dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_state = DB_NOFILL; - - dmu_buf_will_fill(db_fake, tx); -} - -void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(tx->tx_txg != 0); - ASSERT(db->db_level == 0); - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - - ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || - dmu_tx_private_ok(tx)); - - dbuf_noread(db); - (void) dbuf_dirty(db, tx); -} - -#pragma weak dmu_buf_fill_done = dbuf_fill_done -/* ARGSUSED */ -void -dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - mutex_enter(&db->db_mtx); - DBUF_VERIFY(db); - - if (db->db_state == DB_FILL) { - if (db->db_level == 0 && db->db_freed_in_flight) { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - /* we were freed while filling */ - /* XXX dbuf_undirty? 
*/ - bzero(db->db.db_data, db->db.db_size); - db->db_freed_in_flight = FALSE; - } - db->db_state = DB_CACHED; - cv_broadcast(&db->db_changed); - } - mutex_exit(&db->db_mtx); -} - -void -dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, - bp_embedded_type_t etype, enum zio_compress comp, - int uncompressed_size, int compressed_size, int byteorder, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - struct dirty_leaf *dl; - dmu_object_type_t type; - - if (etype == BP_EMBEDDED_TYPE_DATA) { - ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), - SPA_FEATURE_EMBEDDED_DATA)); - } - - DB_DNODE_ENTER(db); - type = DB_DNODE(db)->dn_type; - DB_DNODE_EXIT(db); - - ASSERT0(db->db_level); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - - dmu_buf_will_not_fill(dbuf, tx); - - ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); - dl = &db->db_last_dirty->dt.dl; - encode_embedded_bp_compressed(&dl->dr_overridden_by, - data, comp, uncompressed_size, compressed_size); - BPE_SET_ETYPE(&dl->dr_overridden_by, etype); - BP_SET_TYPE(&dl->dr_overridden_by, type); - BP_SET_LEVEL(&dl->dr_overridden_by, 0); - BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); - - dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; -} - -/* - * Directly assign a provided arc buf to a given dbuf if it's not referenced - * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. - */ -void -dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) -{ - ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_level == 0); - ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); - ASSERT(buf != NULL); - ASSERT(arc_buf_lsize(buf) == db->db.db_size); - ASSERT(tx->tx_txg != 0); - - arc_return_buf(buf, db); - ASSERT(arc_released(buf)); - - mutex_enter(&db->db_mtx); - - while (db->db_state == DB_READ || db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); - - if (db->db_state == DB_CACHED && - zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - bcopy(buf->b_data, db->db.db_data, db->db.db_size); - arc_buf_destroy(buf, db); - xuio_stat_wbuf_copied(); - return; - } - - xuio_stat_wbuf_nocopy(); - if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = db->db_last_dirty; - - ASSERT(db->db_buf != NULL); - if (dr != NULL && dr->dr_txg == tx->tx_txg) { - ASSERT(dr->dt.dl.dr_data == db->db_buf); - if (!arc_released(db->db_buf)) { - ASSERT(dr->dt.dl.dr_override_state == - DR_OVERRIDDEN); - arc_release(db->db_buf, db); - } - dr->dt.dl.dr_data = buf; - arc_buf_destroy(db->db_buf, db); - } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { - arc_release(db->db_buf, db); - arc_buf_destroy(db->db_buf, db); - } - db->db_buf = NULL; - } - ASSERT(db->db_buf == NULL); - dbuf_set_data(db, buf); - db->db_state = DB_FILL; - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - dmu_buf_fill_done(&db->db, tx); -} - -void -dbuf_destroy(dmu_buf_impl_t *db) -{ - dnode_t *dn; - dmu_buf_impl_t *parent = db->db_parent; - dmu_buf_impl_t *dndb; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(zfs_refcount_is_zero(&db->db_holds)); - - if (db->db_buf != NULL) { - arc_buf_destroy(db->db_buf, db); - db->db_buf = NULL; - } - - if (db->db_blkid == DMU_BONUS_BLKID) { - int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - if (db->db.db_data != 
NULL) { - zio_buf_free(db->db.db_data, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - db->db_state = DB_UNCACHED; - } - } - - dbuf_clear_data(db); - - if (multilist_link_active(&db->db_cache_link)) { - ASSERT(db->db_caching_status == DB_DBUF_CACHE || - db->db_caching_status == DB_DBUF_METADATA_CACHE); - - multilist_remove(dbuf_caches[db->db_caching_status].cache, db); - (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); - - if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMPDOWN(metadata_cache_count); - } else { - DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); - } - db->db_caching_status = DB_NO_CACHE; - } - - ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - ASSERT(db->db_data_pending == NULL); - - db->db_state = DB_EVICTING; - db->db_blkptr = NULL; - - /* - * Now that db_state is DB_EVICTING, nobody else can find this via - * the hash table. We can now drop db_mtx, which allows us to - * acquire the dn_dbufs_mtx. - */ - mutex_exit(&db->db_mtx); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dndb = dn->dn_dbuf; - if (db->db_blkid != DMU_BONUS_BLKID) { - boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); - if (needlock) - mutex_enter(&dn->dn_dbufs_mtx); - avl_remove(&dn->dn_dbufs, db); - membar_producer(); - DB_DNODE_EXIT(db); - if (needlock) - mutex_exit(&dn->dn_dbufs_mtx); - /* - * Decrementing the dbuf count means that the hold corresponding - * to the removed dbuf is no longer discounted in dnode_move(), - * so the dnode cannot be moved until after we release the hold. - * The membar_producer() ensures visibility of the decremented - * value in dnode_move(), since DB_DNODE_EXIT doesn't actually - * release any lock. - */ - mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, db, B_TRUE); - db->db_dnode_handle = NULL; - - dbuf_hash_remove(db); - } else { - DB_DNODE_EXIT(db); - } - - ASSERT(zfs_refcount_is_zero(&db->db_holds)); - - db->db_parent = NULL; - - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); - ASSERT(!multilist_link_active(&db->db_cache_link)); - - kmem_cache_free(dbuf_kmem_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - - /* - * If this dbuf is referenced from an indirect dbuf, - * decrement the ref count on the indirect dbuf. - */ - if (parent && parent != dndb) { - mutex_enter(&parent->db_mtx); - dbuf_rele_and_unlock(parent, db, B_TRUE); - } -} - -/* - * Note: While bpp will always be updated if the function returns success, - * parentp will not be updated if the dnode does not have dn_dbuf filled in; - * this happens when the dnode is the meta-dnode, or a userused or groupused - * object. 
- */ -__attribute__((always_inline)) -static inline int -dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, - dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh) -{ - *parentp = NULL; - *bpp = NULL; - - ASSERT(blkid != DMU_BONUS_BLKID); - - if (blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - if (dn->dn_have_spill && - (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) - *bpp = DN_SPILL_BLKPTR(dn->dn_phys); - else - *bpp = NULL; - dbuf_add_ref(dn->dn_dbuf, NULL); - *parentp = dn->dn_dbuf; - mutex_exit(&dn->dn_mtx); - return (0); - } - - int nlevels = - (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - ASSERT3U(level * epbs, <, 64); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - /* - * This assertion shouldn't trip as long as the max indirect block size - * is less than 1M. The reason for this is that up to that point, - * the number of levels required to address an entire object with blocks - * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In - * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 - * (i.e. we can address the entire object), objects will all use at most - * N-1 levels and the assertion won't overflow. However, once epbs is - * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be - * enough to address an entire object, so objects will have 5 levels, - * but then this assertion will overflow. - * - * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we - * need to redo this logic to handle overflows. - */ - ASSERT(level >= nlevels || - ((nlevels - level - 1) * epbs) + - highbit64(dn->dn_phys->dn_nblkptr) <= 64); - if (level >= nlevels || - blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << - ((nlevels - level - 1) * epbs)) || - (fail_sparse && - blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { - /* the buffer has no parent yet */ - return (SET_ERROR(ENOENT)); - } else if (level < nlevels-1) { - /* this block is referenced from an indirect block */ - int err; - if (dh == NULL) { - err = dbuf_hold_impl(dn, level+1, - blkid >> epbs, fail_sparse, FALSE, NULL, parentp); - } else { - __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, - blkid >> epbs, fail_sparse, FALSE, NULL, - parentp, dh->dh_depth + 1); - err = __dbuf_hold_impl(dh + 1); - } - if (err) - return (err); - err = dbuf_read(*parentp, NULL, - (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); - if (err) { - dbuf_rele(*parentp, NULL); - *parentp = NULL; - return (err); - } - *bpp = ((blkptr_t *)(*parentp)->db.db_data) + - (blkid & ((1ULL << epbs) - 1)); - if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) - ASSERT(BP_IS_HOLE(*bpp)); - return (0); - } else { - /* the block is referenced from the dnode */ - ASSERT3U(level, ==, nlevels-1); - ASSERT(dn->dn_phys->dn_nblkptr == 0 || - blkid < dn->dn_phys->dn_nblkptr); - if (dn->dn_dbuf) { - dbuf_add_ref(dn->dn_dbuf, NULL); - *parentp = dn->dn_dbuf; - } - *bpp = &dn->dn_phys->dn_blkptr[blkid]; - return (0); - } -} - -static dmu_buf_impl_t * -dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - dmu_buf_impl_t *parent, blkptr_t *blkptr) -{ - objset_t *os = dn->dn_objset; - dmu_buf_impl_t *db, *odb; - - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT(dn->dn_type != DMU_OT_NONE); - - db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); - - db->db_objset = os; - db->db.db_object = dn->dn_object; - db->db_level = level; - db->db_blkid = blkid; - db->db_last_dirty = NULL; - 
db->db_dirtycnt = 0; - db->db_dnode_handle = dn->dn_handle; - db->db_parent = parent; - db->db_blkptr = blkptr; - - db->db_user = NULL; - db->db_user_immediate_evict = FALSE; - db->db_freed_in_flight = FALSE; - db->db_pending_evict = FALSE; - - if (blkid == DMU_BONUS_BLKID) { - ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - - (dn->dn_nblkptr-1) * sizeof (blkptr_t); - ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - db->db.db_offset = DMU_BONUS_BLKID; - db->db_state = DB_UNCACHED; - db->db_caching_status = DB_NO_CACHE; - /* the bonus dbuf is not placed in the hash table */ - arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - return (db); - } else if (blkid == DMU_SPILL_BLKID) { - db->db.db_size = (blkptr != NULL) ? - BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; - db->db.db_offset = 0; - } else { - int blocksize = - db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; - db->db.db_size = blocksize; - db->db.db_offset = db->db_blkid * blocksize; - } - - /* - * Hold the dn_dbufs_mtx while we get the new dbuf - * in the hash table *and* added to the dbufs list. - * This prevents a possible deadlock with someone - * trying to look up this dbuf before its added to the - * dn_dbufs list. - */ - mutex_enter(&dn->dn_dbufs_mtx); - db->db_state = DB_EVICTING; - if ((odb = dbuf_hash_insert(db)) != NULL) { - /* someone else inserted it first */ - kmem_cache_free(dbuf_kmem_cache, db); - mutex_exit(&dn->dn_dbufs_mtx); - DBUF_STAT_BUMP(hash_insert_race); - return (odb); - } - avl_add(&dn->dn_dbufs, db); - - db->db_state = DB_UNCACHED; - db->db_caching_status = DB_NO_CACHE; - mutex_exit(&dn->dn_dbufs_mtx); - arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - - if (parent && parent != dn->dn_dbuf) - dbuf_add_ref(parent, db); - - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - zfs_refcount_count(&dn->dn_holds) > 0); - (void) zfs_refcount_add(&dn->dn_holds, db); - - dprintf_dbuf(db, "db=%p\n", db); - - return (db); -} - -typedef struct dbuf_prefetch_arg { - spa_t *dpa_spa; /* The spa to issue the prefetch in. */ - zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ - int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ - int dpa_curlevel; /* The current level that we're reading */ - dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ - zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ - zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ - arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ -} dbuf_prefetch_arg_t; - -/* - * Actually issue the prefetch read for the block given. - */ -static void -dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) -{ - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return; - - arc_flags_t aflags = - dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - - ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); - ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); - ASSERT(dpa->dpa_zio != NULL); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, - dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &dpa->dpa_zb); -} - -/* - * Called when an indirect block above our prefetch target is read in. This - * will either read in the next indirect block down the tree or issue the actual - * prefetch if the next block down is our target. 
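dbuf_prefetch_indirect_done() applies the same shift from the other direction: starting at the lowest indirect already in memory, the blkid to read at each lower level is derived as zb_blkid >> (epbs * (curlevel - zb_level)) until the target level is reached. A small stand-alone walk-through of that descent (illustrative numbers only):

/* Walk from a cached ancestor level down to the target block. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int epbs = 10;                 /* 1024 block pointers per indirect block */
	int target_level = 0;
	uint64_t target_blkid = 3000000;
	int curlevel = 3;              /* lowest ancestor assumed to be cached */

	while (curlevel > target_level) {
		curlevel--;
		uint64_t nextblkid = target_blkid >>
		    (epbs * (curlevel - target_level));
		uint64_t slot = nextblkid & ((1ULL << epbs) - 1);
		printf("read level %d blkid %llu (slot %llu in its parent)\n",
		    curlevel, (unsigned long long)nextblkid,
		    (unsigned long long)slot);
	}
	return (0);
}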
- */ -static void -dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, - const blkptr_t *iobp, arc_buf_t *abuf, void *private) -{ - dbuf_prefetch_arg_t *dpa = private; - - ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); - ASSERT3S(dpa->dpa_curlevel, >, 0); - - if (abuf == NULL) { - ASSERT(zio == NULL || zio->io_error != 0); - kmem_free(dpa, sizeof (*dpa)); - return; - } - ASSERT(zio == NULL || zio->io_error == 0); - - /* - * The dpa_dnode is only valid if we are called with a NULL - * zio. This indicates that the arc_read() returned without - * first calling zio_read() to issue a physical read. Once - * a physical read is made the dpa_dnode must be invalidated - * as the locks guarding it may have been dropped. If the - * dpa_dnode is still valid, then we want to add it to the dbuf - * cache. To do so, we must hold the dbuf associated with the block - * we just prefetched, read its contents so that we associate it - * with an arc_buf_t, and then release it. - */ - if (zio != NULL) { - ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - if (zio->io_flags & ZIO_FLAG_RAW) { - ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); - } else { - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); - } - ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); - - dpa->dpa_dnode = NULL; - } else if (dpa->dpa_dnode != NULL) { - uint64_t curblkid = dpa->dpa_zb.zb_blkid >> - (dpa->dpa_epbs * (dpa->dpa_curlevel - - dpa->dpa_zb.zb_level)); - dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, - dpa->dpa_curlevel, curblkid, FTAG); - (void) dbuf_read(db, NULL, - DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); - dbuf_rele(db, FTAG); - } - - if (abuf == NULL) { - kmem_free(dpa, sizeof(*dpa)); - return; - } - - dpa->dpa_curlevel--; - - uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> - (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); - blkptr_t *bp = ((blkptr_t *)abuf->b_data) + - P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - if (BP_IS_HOLE(bp)) { - kmem_free(dpa, sizeof (*dpa)); - } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { - ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); - dbuf_issue_final_prefetch(dpa, bp); - kmem_free(dpa, sizeof (*dpa)); - } else { - arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; - zbookmark_phys_t zb; - - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) - iter_aflags |= ARC_FLAG_L2CACHE; - - ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); - - SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, - dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); - - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &iter_aflags, &zb); - } - - arc_buf_destroy(abuf, private); -} - -/* - * Issue prefetch reads for the given block on the given level. If the indirect - * blocks above that block are not in memory, we will read them in - * asynchronously. As a result, this call never blocks waiting for a read to - * complete. - */ -void -dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, - arc_flags_t aflags) -{ - blkptr_t bp; - int epbs, nlevels, curlevel; - uint64_t curblkid; - - ASSERT(blkid != DMU_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - - if (blkid > dn->dn_maxblkid) - return; - - if (dnode_block_freed(dn, blkid)) - return; - - /* - * This dnode hasn't been written to disk yet, so there's nothing to - * prefetch. 
- */ - nlevels = dn->dn_phys->dn_nlevels; - if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) - return; - - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) - return; - - dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, - level, blkid); - if (db != NULL) { - mutex_exit(&db->db_mtx); - /* - * This dbuf already exists. It is either CACHED, or - * (we assume) about to be read or filled. - */ - return; - } - - /* - * Find the closest ancestor (indirect block) of the target block - * that is present in the cache. In this indirect block, we will - * find the bp that is at curlevel, curblkid. - */ - curlevel = level; - curblkid = blkid; - while (curlevel < nlevels - 1) { - int parent_level = curlevel + 1; - uint64_t parent_blkid = curblkid >> epbs; - dmu_buf_impl_t *db; - - if (dbuf_hold_impl(dn, parent_level, parent_blkid, - FALSE, TRUE, FTAG, &db) == 0) { - blkptr_t *bpp = db->db_buf->b_data; - bp = bpp[P2PHASE(curblkid, 1 << epbs)]; - dbuf_rele(db, FTAG); - break; - } - - curlevel = parent_level; - curblkid = parent_blkid; - } - - if (curlevel == nlevels - 1) { - /* No cached indirect blocks found. */ - ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); - bp = dn->dn_phys->dn_blkptr[curblkid]; - } - if (BP_IS_HOLE(&bp)) - return; - - ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); - - zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, - ZIO_FLAG_CANFAIL); - - dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, level, blkid); - dpa->dpa_curlevel = curlevel; - dpa->dpa_prio = prio; - dpa->dpa_aflags = aflags; - dpa->dpa_spa = dn->dn_objset->os_spa; - dpa->dpa_dnode = dn; - dpa->dpa_epbs = epbs; - dpa->dpa_zio = pio; - - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) - dpa->dpa_aflags |= ARC_FLAG_L2CACHE; - - /* - * If we have the indirect just above us, no need to do the asynchronous - * prefetch chain; we'll just run the last step ourselves. If we're at - * a higher level, though, we want to issue the prefetches for all the - * indirect blocks asynchronously, so we can go on with whatever we were - * doing. - */ - if (curlevel == level) { - ASSERT3U(curblkid, ==, blkid); - dbuf_issue_final_prefetch(dpa, &bp); - kmem_free(dpa, sizeof (*dpa)); - } else { - arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; - zbookmark_phys_t zb; - - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) - iter_aflags |= ARC_FLAG_L2CACHE; - - SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, curlevel, curblkid); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - &bp, dbuf_prefetch_indirect_done, dpa, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &iter_aflags, &zb); - } - /* - * We use pio here instead of dpa_zio since it's possible that - * dpa may have already been freed. - */ - zio_nowait(pio); -} - -#define DBUF_HOLD_IMPL_MAX_DEPTH 20 - -/* - * Helper function for __dbuf_hold_impl() to copy a buffer. Handles - * the case of encrypted, compressed and uncompressed buffers by - * allocating the new buffer, respectively, with arc_alloc_raw_buf(), - * arc_alloc_compressed_buf() or arc_alloc_buf().* - * - * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl(). 
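DBUF_HOLD_IMPL_MAX_DEPTH, defined just above, caps the recursion in the hold path below: the per-level arguments and locals live in a single allocation of that many frames rather than in ever-deeper stack frames, and each level simply recurses with dh + 1. A generic user-space sketch of that pattern (the names and the toy computation are invented, nothing here is ZFS code):

/* Recursion whose per-call state lives in a preallocated frame array. */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define MAX_DEPTH 20

struct frame {
	unsigned long arg;      /* "argument" for this level */
	unsigned long result;   /* "local" that survives the call */
	int depth;
};

/* Each level works on f[0] and recurses with f + 1. */
static void
walk(struct frame *f)
{
	assert(f->depth < MAX_DEPTH);
	if (f->arg == 0) {
		f->result = 1;
		return;
	}
	f[1].arg = f->arg >> 1;        /* set up the child frame */
	f[1].depth = f->depth + 1;
	walk(f + 1);
	f->result = f[1].result + 1;   /* consume the child's "local" */
}

int
main(void)
{
	struct frame *frames = calloc(MAX_DEPTH, sizeof (*frames));

	frames[0].arg = 1000;
	frames[0].depth = 0;
	walk(frames);
	printf("levels used: %lu\n", frames[0].result);
	free(frames);
	return (0);
}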
- */ -noinline static void -dbuf_hold_copy(struct dbuf_hold_impl_data *dh) -{ - dnode_t *dn = dh->dh_dn; - dmu_buf_impl_t *db = dh->dh_db; - dbuf_dirty_record_t *dr = dh->dh_dr; - arc_buf_t *data = dr->dt.dl.dr_data; - - enum zio_compress compress_type = arc_get_compression(data); - - if (compress_type != ZIO_COMPRESS_OFF) { - dbuf_set_data(db, arc_alloc_compressed_buf( - dn->dn_objset->os_spa, db, arc_buf_size(data), - arc_buf_lsize(data), compress_type)); - } else { - dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, - DBUF_GET_BUFC_TYPE(db), db->db.db_size)); - } - - bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); -} - -/* - * Returns with db_holds incremented, and db_mtx not held. - * Note: dn_struct_rwlock must be held. - */ -static int -__dbuf_hold_impl(struct dbuf_hold_impl_data *dh) -{ - ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH); - dh->dh_parent = NULL; - - ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); - ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); - - *(dh->dh_dbp) = NULL; - - /* dbuf_find() returns with db_mtx held */ - dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, - dh->dh_level, dh->dh_blkid); - - if (dh->dh_db == NULL) { - dh->dh_bp = NULL; - - if (dh->dh_fail_uncached) - return (SET_ERROR(ENOENT)); - - ASSERT3P(dh->dh_parent, ==, NULL); - dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh); - if (dh->dh_fail_sparse) { - if (dh->dh_err == 0 && - dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) - dh->dh_err = SET_ERROR(ENOENT); - if (dh->dh_err) { - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); - return (dh->dh_err); - } - } - if (dh->dh_err && dh->dh_err != ENOENT) - return (dh->dh_err); - dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_parent, dh->dh_bp); - } - - if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { - mutex_exit(&dh->dh_db->db_mtx); - return (SET_ERROR(ENOENT)); - } - - if (dh->dh_db->db_buf != NULL) { - arc_buf_access(dh->dh_db->db_buf); - ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); - } - - ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); - - /* - * If this buffer is currently syncing out, and we are are - * still referencing it from db_data, we need to make a copy - * of it in case we decide we want to dirty it again in this txg. 
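That copy-on-hold is the usual "never scribble on a buffer that is in flight" rule: if the memory currently being written out is the same memory the open transaction would modify, duplicate it first and let the writer dirty the copy. A minimal generic sketch of the rule (hypothetical names, nothing ZFS-specific):

/* Duplicate a buffer that is still being flushed before dirtying it again. */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

struct buf {
	char *data;          /* what new writers see */
	char *pending;       /* what the in-flight write is using, or NULL */
	size_t size;
};

static void
make_writable(struct buf *b)
{
	if (b->pending != NULL && b->pending == b->data) {
		/* The flush still points at this memory: give writers a copy. */
		char *copy = malloc(b->size);
		memcpy(copy, b->data, b->size);
		b->data = copy;
	}
}

int
main(void)
{
	char original[8] = "v1";
	struct buf b = { original, original, sizeof (original) };

	make_writable(&b);
	snprintf(b.data, b.size, "v2");      /* new dirty contents */
	printf("in flight: %s, current: %s\n", b.pending, b.data);
	if (b.data != b.pending)
		free(b.data);
	return (0);
}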
- */ - if (dh->dh_db->db_level == 0 && - dh->dh_db->db_blkid != DMU_BONUS_BLKID && - dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && - dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { - dh->dh_dr = dh->dh_db->db_data_pending; - if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) - dbuf_hold_copy(dh); - } - - if (multilist_link_active(&dh->dh_db->db_cache_link)) { - ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds)); - ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE || - dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE); - - multilist_remove( - dbuf_caches[dh->dh_db->db_caching_status].cache, - dh->dh_db); - (void) zfs_refcount_remove_many( - &dbuf_caches[dh->dh_db->db_caching_status].size, - dh->dh_db->db.db_size, dh->dh_db); - - if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMPDOWN(metadata_cache_count); - } else { - DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]); - DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], - dh->dh_db->db.db_size); - } - dh->dh_db->db_caching_status = DB_NO_CACHE; - } - (void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag); - DBUF_VERIFY(dh->dh_db); - mutex_exit(&dh->dh_db->db_mtx); - - /* NOTE: we can't rele the parent until after we drop the db_mtx */ - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); - - ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); - ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); - ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); - *(dh->dh_dbp) = dh->dh_db; - - return (0); -} - -/* - * The following code preserves the recursive function dbuf_hold_impl() - * but moves the local variables AND function arguments to the heap to - * minimize the stack frame size. Enough space is initially allocated - * on the stack for 20 levels of recursion. - */ -int -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp) -{ - struct dbuf_hold_impl_data *dh; - int error; - - dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); - __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, - fail_uncached, tag, dbp, 0); - - error = __dbuf_hold_impl(dh); - - kmem_free(dh, sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH); - - return (error); -} - -static void -__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, - dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp, int depth) -{ - dh->dh_dn = dn; - dh->dh_level = level; - dh->dh_blkid = blkid; - - dh->dh_fail_sparse = fail_sparse; - dh->dh_fail_uncached = fail_uncached; - - dh->dh_tag = tag; - dh->dh_dbp = dbp; - - dh->dh_db = NULL; - dh->dh_parent = NULL; - dh->dh_bp = NULL; - dh->dh_err = 0; - dh->dh_dr = NULL; - - dh->dh_depth = depth; -} - -dmu_buf_impl_t * -dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) -{ - return (dbuf_hold_level(dn, 0, blkid, tag)); -} - -dmu_buf_impl_t * -dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) -{ - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); - return (err ? 
NULL : db); -} - -void -dbuf_create_bonus(dnode_t *dn) -{ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - ASSERT(dn->dn_bonus == NULL); - dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); -} - -int -dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - - if (db->db_blkid != DMU_SPILL_BLKID) - return (SET_ERROR(ENOTSUP)); - if (blksz == 0) - blksz = SPA_MINBLOCKSIZE; - ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); - blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dbuf_new_size(db, blksz, tx); - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); - - return (0); -} - -void -dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) -{ - dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); -} - -#pragma weak dmu_buf_add_ref = dbuf_add_ref -void -dbuf_add_ref(dmu_buf_impl_t *db, void *tag) -{ - int64_t holds = zfs_refcount_add(&db->db_holds, tag); - ASSERT3S(holds, >, 1); -} - -#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref -boolean_t -dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, - void *tag) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dmu_buf_impl_t *found_db; - boolean_t result = B_FALSE; - - if (db->db_blkid == DMU_BONUS_BLKID) - found_db = dbuf_find_bonus(os, obj); - else - found_db = dbuf_find(os, obj, 0, blkid); - - if (found_db != NULL) { - if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { - (void) zfs_refcount_add(&db->db_holds, tag); - result = B_TRUE; - } - mutex_exit(&db->db_mtx); - } - return (result); -} - -/* - * If you call dbuf_rele() you had better not be referencing the dnode handle - * unless you have some other direct or indirect hold on the dnode. (An indirect - * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) - * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the - * dnode's parent dbuf evicting its dnode handles. - */ -void -dbuf_rele(dmu_buf_impl_t *db, void *tag) -{ - mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, tag, B_FALSE); -} - -void -dmu_buf_rele(dmu_buf_t *db, void *tag) -{ - dbuf_rele((dmu_buf_impl_t *)db, tag); -} - -/* - * dbuf_rele() for an already-locked dbuf. This is necessary to allow - * db_dirtycnt and db_holds to be updated atomically. The 'evicting' - * argument should be set if we are already in the dbuf-evicting code - * path, in which case we don't want to recursively evict. This allows us to - * avoid deeply nested stacks that would have a call flow similar to this: - * - * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() - * ^ | - * | | - * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ - * - */ -void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) -{ - int64_t holds; - uint64_t size; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - DBUF_VERIFY(db); - - /* - * Remove the reference to the dbuf before removing its hold on the - * dnode so we can guarantee in dnode_move() that a referenced bonus - * buffer has a corresponding dnode hold. - */ - holds = zfs_refcount_remove(&db->db_holds, tag); - ASSERT(holds >= 0); - - /* - * We can't freeze indirects if there is a possibility that they - * may be modified in the current syncing context. - */ - if (db->db_buf != NULL && - holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) { - arc_buf_freeze(db->db_buf); - } - - if (holds == db->db_dirtycnt && - db->db_level == 0 && db->db_user_immediate_evict) - dbuf_evict_user(db); - - if (holds == 0) { - if (db->db_blkid == DMU_BONUS_BLKID) { - dnode_t *dn; - boolean_t evict_dbuf = db->db_pending_evict; - - /* - * If the dnode moves here, we cannot cross this - * barrier until the move completes. - */ - DB_DNODE_ENTER(db); - - dn = DB_DNODE(db); - atomic_dec_32(&dn->dn_dbufs_count); - - /* - * Decrementing the dbuf count means that the bonus - * buffer's dnode hold is no longer discounted in - * dnode_move(). The dnode cannot move until after - * the dnode_rele() below. - */ - DB_DNODE_EXIT(db); - - /* - * Do not reference db after its lock is dropped. - * Another thread may evict it. - */ - mutex_exit(&db->db_mtx); - - if (evict_dbuf) - dnode_evict_bonus(dn); - - dnode_rele(dn, db); - } else if (db->db_buf == NULL) { - /* - * This is a special case: we never associated this - * dbuf with any data allocated from the ARC. - */ - ASSERT(db->db_state == DB_UNCACHED || - db->db_state == DB_NOFILL); - dbuf_destroy(db); - } else if (arc_released(db->db_buf)) { - /* - * This dbuf has anonymous data associated with it. - */ - dbuf_destroy(db); - } else { - boolean_t do_arc_evict = B_FALSE; - blkptr_t bp; - spa_t *spa = dmu_objset_spa(db->db_objset); - - if (!DBUF_IS_CACHEABLE(db) && - db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - do_arc_evict = B_TRUE; - bp = *db->db_blkptr; - } - - if (!DBUF_IS_CACHEABLE(db) || - db->db_pending_evict) { - dbuf_destroy(db); - } else if (!multilist_link_active(&db->db_cache_link)) { - ASSERT3U(db->db_caching_status, ==, - DB_NO_CACHE); - - dbuf_cached_state_t dcs = - dbuf_include_in_metadata_cache(db) ? 
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; - db->db_caching_status = dcs; - - multilist_insert(dbuf_caches[dcs].cache, db); - size = zfs_refcount_add_many( - &dbuf_caches[dcs].size, db->db.db_size, db); - - if (dcs == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMP(metadata_cache_count); - DBUF_STAT_MAX( - metadata_cache_size_bytes_max, - size); - } else { - DBUF_STAT_BUMP( - cache_levels[db->db_level]); - DBUF_STAT_BUMP(cache_count); - DBUF_STAT_INCR( - cache_levels_bytes[db->db_level], - db->db.db_size); - DBUF_STAT_MAX(cache_size_bytes_max, - size); - } - mutex_exit(&db->db_mtx); - - if (dcs == DB_DBUF_CACHE && !evicting) - dbuf_evict_notify(size); - } - - if (do_arc_evict) - arc_freed(spa, &bp); - } - } else { - mutex_exit(&db->db_mtx); - } - -} - -#pragma weak dmu_buf_refcount = dbuf_refcount -uint64_t -dbuf_refcount(dmu_buf_impl_t *db) -{ - return (zfs_refcount_count(&db->db_holds)); -} - -void * -dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, - dmu_buf_user_t *new_user) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - mutex_enter(&db->db_mtx); - dbuf_verify_user(db, DBVU_NOT_EVICTING); - if (db->db_user == old_user) - db->db_user = new_user; - else - old_user = db->db_user; - dbuf_verify_user(db, DBVU_NOT_EVICTING); - mutex_exit(&db->db_mtx); - - return (old_user); -} - -void * -dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - return (dmu_buf_replace_user(db_fake, NULL, user)); -} - -void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_user_immediate_evict = TRUE; - return (dmu_buf_set_user(db_fake, user)); -} - -void * -dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - return (dmu_buf_replace_user(db_fake, user, NULL)); -} - -void * -dmu_buf_get_user(dmu_buf_t *db_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - dbuf_verify_user(db, DBVU_NOT_EVICTING); - return (db->db_user); -} - -void -dmu_buf_user_evict_wait() -{ - taskq_wait(dbu_evict_taskq); -} - -blkptr_t * -dmu_buf_get_blkptr(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - return (dbi->db_blkptr); -} - -objset_t * -dmu_buf_get_objset(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - return (dbi->db_objset); -} - -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - -static void -dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) -{ - /* ASSERT(dmu_tx_is_syncing(tx) */ - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (db->db_blkptr != NULL) - return; - - if (db->db_blkid == DMU_SPILL_BLKID) { - db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); - BP_ZERO(db->db_blkptr); - return; - } - if (db->db_level == dn->dn_phys->dn_nlevels-1) { - /* - * This buffer was allocated at a time when there was - * no available blkptrs from the dnode, or it was - * inappropriate to hook it in (i.e., nlevels mis-match). 
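A few functions back in this hunk, dmu_buf_replace_user() installs the new user pointer only if the current one matches the expected old value, and always returns whichever pointer was actually in place so the caller can tell it lost the race. The same compare-and-swap-under-a-lock shape, reduced to a toy (hypothetical names):

/* Conditional replace under a lock, returning the previous value. */
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *current_user;

static void *
replace_user(void *old_user, void *new_user)
{
	pthread_mutex_lock(&lock);
	if (current_user == old_user)
		current_user = new_user;     /* we won: install ours */
	else
		old_user = current_user;     /* we lost: report who did win */
	pthread_mutex_unlock(&lock);
	return (old_user);
}

int
main(void)
{
	int a, b;

	printf("first: %p\n", replace_user(NULL, &a));   /* NULL: we installed &a */
	printf("second: %p\n", replace_user(NULL, &b));  /* &a: somebody beat us */
	return (0);
}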
- */ - ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); - ASSERT(db->db_parent == NULL); - db->db_parent = dn->dn_dbuf; - db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; - DBUF_VERIFY(db); - } else { - dmu_buf_impl_t *parent = db->db_parent; - int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - - ASSERT(dn->dn_phys->dn_nlevels > 1); - if (parent == NULL) { - mutex_exit(&db->db_mtx); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - parent = dbuf_hold_level(dn, db->db_level + 1, - db->db_blkid >> epbs, db); - rw_exit(&dn->dn_struct_rwlock); - mutex_enter(&db->db_mtx); - db->db_parent = parent; - } - db->db_blkptr = (blkptr_t *)parent->db.db_data + - (db->db_blkid & ((1ULL << epbs) - 1)); - DBUF_VERIFY(db); - } -} - -/* - * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it - * is critical the we not allow the compiler to inline this function in to - * dbuf_sync_list() thereby drastically bloating the stack usage. - */ -noinline static void -dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - zio_t *zio; - - ASSERT(dmu_tx_is_syncing(tx)); - - dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); - - mutex_enter(&db->db_mtx); - - ASSERT(db->db_level > 0); - DBUF_VERIFY(db); - - /* Read the block if it hasn't been read yet. */ - if (db->db_buf == NULL) { - mutex_exit(&db->db_mtx); - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - mutex_enter(&db->db_mtx); - } - ASSERT3U(db->db_state, ==, DB_CACHED); - ASSERT(db->db_buf != NULL); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - /* Indirect block size must match what the dnode thinks it is. */ - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); - dbuf_check_blkptr(dn, db); - DB_DNODE_EXIT(db); - - /* Provide the pending dirty record to child dbufs */ - db->db_data_pending = dr; - - mutex_exit(&db->db_mtx); - - dbuf_write(dr, db->db_buf, tx); - - zio = dr->dr_zio; - mutex_enter(&dr->dt.di.dr_mtx); - dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - mutex_exit(&dr->dt.di.dr_mtx); - zio_nowait(zio); -} - -/* - * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is - * critical the we not allow the compiler to inline this function in to - * dbuf_sync_list() thereby drastically bloating the stack usage. - */ -noinline static void -dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) -{ - arc_buf_t **datap = &dr->dt.dl.dr_data; - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - objset_t *os; - uint64_t txg = tx->tx_txg; - - ASSERT(dmu_tx_is_syncing(tx)); - - dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); - - mutex_enter(&db->db_mtx); - /* - * To be synced, we must be dirtied. But we - * might have been freed after the dirty. - */ - if (db->db_state == DB_UNCACHED) { - /* This buffer has been freed since it was dirtied */ - ASSERT(db->db.db_data == NULL); - } else if (db->db_state == DB_FILL) { - /* This buffer was freed and is now being re-filled */ - ASSERT(db->db.db_data != dr->dt.dl.dr_data); - } else { - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); - } - DBUF_VERIFY(db); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - if (db->db_blkid == DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { - /* - * In the previous transaction group, the bonus buffer - * was entirely used to store the attributes for the - * dnode which overrode the dn_spill field. 
However, - * when adding more attributes to the file a spill - * block was required to hold the extra attributes. - * - * Make sure to clear the garbage left in the dn_spill - * field from the previous attributes in the bonus - * buffer. Otherwise, after writing out the spill - * block to the new allocated dva, it will free - * the old block pointed to by the invalid dn_spill. - */ - db->db_blkptr = NULL; - } - dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; - mutex_exit(&dn->dn_mtx); - } - - /* - * If this is a bonus buffer, simply copy the bonus data into the - * dnode. It will be written out when the dnode is synced (and it - * will be synced, since it must have been dirty for dbuf_sync to - * be called). - */ - if (db->db_blkid == DMU_BONUS_BLKID) { - dbuf_dirty_record_t **drp; - - ASSERT(*datap != NULL); - ASSERT0(db->db_level); - ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, - DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); - bcopy(*datap, DN_BONUS(dn->dn_phys), - DN_MAX_BONUS_LEN(dn->dn_phys)); - DB_DNODE_EXIT(db); - - if (*datap != db->db.db_data) { - int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - zio_buf_free(*datap, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - } - db->db_data_pending = NULL; - drp = &db->db_last_dirty; - while (*drp != dr) - drp = &(*drp)->dr_next; - ASSERT(dr->dr_next == NULL); - ASSERT(dr->dr_dbuf == db); - *drp = dr->dr_next; - if (dr->dr_dbuf->db_level != 0) { - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); - return; - } - - os = dn->dn_objset; - - /* - * This function may have dropped the db_mtx lock allowing a dmu_sync - * operation to sneak in. As a result, we need to ensure that we - * don't check the dr_override_state until we have returned from - * dbuf_check_blkptr. - */ - dbuf_check_blkptr(dn, db); - - /* - * If this buffer is in the middle of an immediate write, - * wait for the synchronous IO to complete. - */ - while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); - } - - if (db->db_state != DB_NOFILL && - dn->dn_object != DMU_META_DNODE_OBJECT && - zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { - /* - * If this buffer is currently "in use" (i.e., there - * are active holds and db_data still references it), - * then make a copy before we start the write so that - * any modifications from the open txg will not leak - * into this write. - * - * NOTE: this copy does not need to be made for - * objects only modified in the syncing context (e.g. - * DNONE_DNODE blocks). 
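The DR_IN_DMU_SYNC wait above is the standard predicate loop for condition variables: re-check the condition after every wakeup, because the wait can return spuriously and the broadcast may signal a different state change. The user-space equivalent with pthreads (hypothetical names):

/* Wait for a state change using the re-check-in-a-loop idiom. */
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t changed = PTHREAD_COND_INITIALIZER;
static int state = 0;                    /* 0 = "in sync", 1 = done */

static void *
writer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&mtx);
	state = 1;                       /* the synchronous write finished */
	pthread_cond_broadcast(&changed);
	pthread_mutex_unlock(&mtx);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	pthread_mutex_lock(&mtx);
	while (state == 0)               /* never "if": wakeups can be spurious */
		pthread_cond_wait(&changed, &mtx);
	pthread_mutex_unlock(&mtx);
	pthread_join(t, NULL);
	printf("state=%d\n", state);
	return (0);
}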
- */ - int psize = arc_buf_size(*datap); - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - enum zio_compress compress_type = arc_get_compression(*datap); - - if (compress_type == ZIO_COMPRESS_OFF) { - *datap = arc_alloc_buf(os->os_spa, db, type, psize); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - int lsize = arc_buf_lsize(*datap); - *datap = arc_alloc_compressed_buf(os->os_spa, db, - psize, lsize, compress_type); - } - bcopy(db->db.db_data, (*datap)->b_data, psize); - } - db->db_data_pending = dr; - - mutex_exit(&db->db_mtx); - - dbuf_write(dr, *datap, tx); - - ASSERT(!list_link_active(&dr->dr_dirty_node)); - if (dn->dn_object == DMU_META_DNODE_OBJECT) { - list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); - DB_DNODE_EXIT(db); - } else { - /* - * Although zio_nowait() does not "wait for an IO", it does - * initiate the IO. If this is an empty write it seems plausible - * that the IO could actually be completed before the nowait - * returns. We need to DB_DNODE_EXIT() first in case - * zio_nowait() invalidates the dbuf. - */ - DB_DNODE_EXIT(db); - zio_nowait(dr->dr_zio); - } -} - -void -dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) -{ - dbuf_dirty_record_t *dr; - - while (dr = list_head(list)) { - if (dr->dr_zio != NULL) { - /* - * If we find an already initialized zio then we - * are processing the meta-dnode, and we have finished. - * The dbufs for all dnodes are put back on the list - * during processing, so that we can zio_wait() - * these IOs after initiating all child IOs. - */ - ASSERT3U(dr->dr_dbuf->db.db_object, ==, - DMU_META_DNODE_OBJECT); - break; - } - if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - VERIFY3U(dr->dr_dbuf->db_level, ==, level); - } - list_remove(list, dr); - if (dr->dr_dbuf->db_level > 0) - dbuf_sync_indirect(dr, tx); - else - dbuf_sync_leaf(dr, tx); - } -} - -/* ARGSUSED */ -static void -dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - dnode_t *dn; - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - spa_t *spa = zio->io_spa; - int64_t delta; - uint64_t fill = 0; - int i; - - ASSERT3P(db->db_blkptr, !=, NULL); - ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); - dnode_diduse_space(dn, delta - zio->io_prev_space_delta); - zio->io_prev_space_delta = delta; - - if (bp->blk_birth != 0) { - ASSERT((db->db_blkid != DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_type) || - (db->db_blkid == DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_bonustype) || - BP_IS_EMBEDDED(bp)); - ASSERT(BP_GET_LEVEL(bp) == db->db_level); - } - - mutex_enter(&db->db_mtx); - -#ifdef ZFS_DEBUG - if (db->db_blkid == DMU_SPILL_BLKID) { - ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); - ASSERT(!(BP_IS_HOLE(bp)) && - db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - } -#endif - - if (db->db_level == 0) { - mutex_enter(&dn->dn_mtx); - if (db->db_blkid > dn->dn_phys->dn_maxblkid && - db->db_blkid != DMU_SPILL_BLKID) - dn->dn_phys->dn_maxblkid = db->db_blkid; - mutex_exit(&dn->dn_mtx); - - if (dn->dn_type == DMU_OT_DNODE) { - i = 0; - while (i < db->db.db_size) { - dnode_phys_t *dnp = - (void *)(((char *)db->db.db_data) + i); - - i += DNODE_MIN_SIZE; - if (dnp->dn_type != DMU_OT_NONE) { - fill++; - i += dnp->dn_extra_slots * - DNODE_MIN_SIZE; - } - } - } else { - if (BP_IS_HOLE(bp)) { - fill = 0; - } else { - fill = 1; - } - } - } else { - 
blkptr_t *ibp = db->db.db_data; - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); - for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { - if (BP_IS_HOLE(ibp)) - continue; - fill += BP_GET_FILL(ibp); - } - } - DB_DNODE_EXIT(db); - - if (!BP_IS_EMBEDDED(bp)) - bp->blk_fill = fill; - - mutex_exit(&db->db_mtx); - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - *db->db_blkptr = *bp; - rw_exit(&dn->dn_struct_rwlock); -} - -/* ARGSUSED */ -/* - * This function gets called just prior to running through the compression - * stage of the zio pipeline. If we're an indirect block comprised of only - * holes, then we want this indirect to be compressed away to a hole. In - * order to do that we must zero out any information about the holes that - * this indirect points to prior to before we try to compress it. - */ -static void -dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - dnode_t *dn; - blkptr_t *bp; - unsigned int epbs, i; - - ASSERT3U(db->db_level, >, 0); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(epbs, <, 31); - - /* Determine if all our children are holes */ - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { - if (!BP_IS_HOLE(bp)) - break; - } - - /* - * If all the children are holes, then zero them all out so that - * we may get compressed away. - */ - if (i == 1 << epbs) { - /* - * We only found holes. Grab the rwlock to prevent - * anybody from reading the blocks we're about to - * zero out. - */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); - } - DB_DNODE_EXIT(db); -} - -/* - * The SPA will call this callback several times for each zio - once - * for every physical child i/o (zio->io_phys_children times). This - * allows the DMU to monitor the progress of each logical i/o. For example, - * there may be 2 copies of an indirect block, or many fragments of a RAID-Z - * block. There may be a long delay before all copies/fragments are completed, - * so this callback allows us to retire dirty space gradually, as the physical - * i/os complete. - */ -/* ARGSUSED */ -static void -dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) -{ - dmu_buf_impl_t *db = arg; - objset_t *os = db->db_objset; - dsl_pool_t *dp = dmu_objset_pool(os); - dbuf_dirty_record_t *dr; - int delta = 0; - - dr = db->db_data_pending; - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dsl_pool_sync()'s call to - * dsl_pool_undirty_space(). - */ - delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - -/* ARGSUSED */ -static void -dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) -{ - dmu_buf_impl_t *db = vdb; - blkptr_t *bp_orig = &zio->io_bp_orig; - blkptr_t *bp = db->db_blkptr; - objset_t *os = db->db_objset; - dmu_tx_t *tx = os->os_synctx; - dbuf_dirty_record_t **drp, *dr; - - ASSERT0(zio->io_error); - ASSERT(db->db_blkptr == bp); - - /* - * For nopwrites and rewrites we ensure that the bp matches our - * original and bypass all the accounting. 
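dbuf_write_physdone() above divides the accounted dirty space evenly across the physical children and leaves any rounding error for dsl_pool_sync() to mop up. The split, with made-up example numbers:

/* How per-child dirty-space retirement divides up; illustrative numbers. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t dr_accounted = 1048576;   /* dirty bytes charged to this record */
	int io_phys_children = 3;          /* e.g. three copies of an indirect block */

	uint64_t per_child = dr_accounted / io_phys_children;
	uint64_t rounding = dr_accounted - per_child * io_phys_children;

	printf("each callback retires %llu bytes; %llu left for dsl_pool_sync()\n",
	    (unsigned long long)per_child, (unsigned long long)rounding);
	return (0);
}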
- */ - if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { - ASSERT(BP_EQUAL(bp, bp_orig)); - } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); - dsl_dataset_block_born(ds, bp, tx); - } - - mutex_enter(&db->db_mtx); - - DBUF_VERIFY(db); - - drp = &db->db_last_dirty; - while ((dr = *drp) != db->db_data_pending) - drp = &dr->dr_next; - ASSERT(!list_link_active(&dr->dr_dirty_node)); - ASSERT(dr->dr_dbuf == db); - ASSERT(dr->dr_next == NULL); - *drp = dr->dr_next; - -#ifdef ZFS_DEBUG - if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); - ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && - db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - DB_DNODE_EXIT(db); - } -#endif - - if (db->db_level == 0) { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != db->db_buf) - arc_buf_destroy(dr->dt.dl.dr_data, db); - } - } else { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); - if (!BP_IS_HOLE(db->db_blkptr)) { - int epbs = - dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(db->db_blkid, <=, - dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - db->db.db_size); - } - DB_DNODE_EXIT(db); - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - - cv_broadcast(&db->db_changed); - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - db->db_data_pending = NULL; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); -} - -static void -dbuf_write_nofill_ready(zio_t *zio) -{ - dbuf_write_ready(zio, NULL, zio->io_private); -} - -static void -dbuf_write_nofill_done(zio_t *zio) -{ - dbuf_write_done(zio, NULL, zio->io_private); -} - -static void -dbuf_write_override_ready(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dmu_buf_impl_t *db = dr->dr_dbuf; - - dbuf_write_ready(zio, NULL, db); -} - -static void -dbuf_write_override_done(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dmu_buf_impl_t *db = dr->dr_dbuf; - blkptr_t *obp = &dr->dt.dl.dr_overridden_by; - - mutex_enter(&db->db_mtx); - if (!BP_EQUAL(zio->io_bp, obp)) { - if (!BP_IS_HOLE(obp)) - dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); - arc_release(dr->dt.dl.dr_data, db); - } - mutex_exit(&db->db_mtx); - dbuf_write_done(zio, NULL, db); - - if (zio->io_abd != NULL) - abd_put(zio->io_abd); -} - -typedef struct dbuf_remap_impl_callback_arg { - objset_t *drica_os; - uint64_t drica_blk_birth; - dmu_tx_t *drica_tx; -} dbuf_remap_impl_callback_arg_t; - -static void -dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, - void *arg) -{ - dbuf_remap_impl_callback_arg_t *drica = arg; - objset_t *os = drica->drica_os; - spa_t *spa = dmu_objset_spa(os); - dmu_tx_t *tx = drica->drica_tx; - - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); - - if (os == spa_meta_objset(spa)) { - spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); - } else { - dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, - size, drica->drica_blk_birth, tx); - } -} - -static void -dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) -{ - blkptr_t bp_copy = *bp; - spa_t *spa = 
dmu_objset_spa(dn->dn_objset); - dbuf_remap_impl_callback_arg_t drica; - - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); - - drica.drica_os = dn->dn_objset; - drica.drica_blk_birth = bp->blk_birth; - drica.drica_tx = tx; - if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, - &drica)) { - /* - * The struct_rwlock prevents dbuf_read_impl() from - * dereferencing the BP while we are changing it. To - * avoid lock contention, only grab it when we are actually - * changing the BP. - */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - *bp = bp_copy; - rw_exit(&dn->dn_struct_rwlock); - } -} - -/* - * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting - * to remap a copy of every bp in the dbuf. - */ -boolean_t -dbuf_can_remap(const dmu_buf_impl_t *db) -{ - spa_t *spa = dmu_objset_spa(db->db_objset); - blkptr_t *bp = db->db.db_data; - boolean_t ret = B_FALSE; - - ASSERT3U(db->db_level, >, 0); - ASSERT3S(db->db_state, ==, DB_CACHED); - - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - blkptr_t bp_copy = bp[i]; - if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { - ret = B_TRUE; - break; - } - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (ret); -} - -boolean_t -dnode_needs_remap(const dnode_t *dn) -{ - spa_t *spa = dmu_objset_spa(dn->dn_objset); - boolean_t ret = B_FALSE; - - if (dn->dn_phys->dn_nlevels == 0) { - return (B_FALSE); - } - - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) { - blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j]; - if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { - ret = B_TRUE; - break; - } - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (ret); -} - -/* - * Remap any existing BP's to concrete vdevs, if possible. - */ -static void -dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(db->db_objset); - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); - - if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) - return; - - if (db->db_level > 0) { - blkptr_t *bp = db->db.db_data; - for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - dbuf_remap_impl(dn, &bp[i], tx); - } - } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { - dnode_phys_t *dnp = db->db.db_data; - ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, - DMU_OT_DNODE); - for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; - i += dnp[i].dn_extra_slots + 1) { - for (int j = 0; j < dnp[i].dn_nblkptr; j++) { - dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); - } - } - } -} - - -/* Issue I/O to commit a dirty buffer to disk. */ -static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - objset_t *os; - dmu_buf_impl_t *parent = db->db_parent; - uint64_t txg = tx->tx_txg; - zbookmark_phys_t zb; - zio_prop_t zp; - zio_t *zio; - int wp_flag = 0; - - ASSERT(dmu_tx_is_syncing(tx)); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - os = dn->dn_objset; - - if (db->db_state != DB_NOFILL) { - if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. 
- */ - if (BP_IS_HOLE(db->db_blkptr)) { - arc_buf_thaw(data); - } else { - dbuf_release_bp(db); - } - dbuf_remap(dn, db, tx); - } - } - - if (parent != dn->dn_dbuf) { - /* Our parent is an indirect block. */ - /* We have a dirty parent that has been scheduled for write. */ - ASSERT(parent && parent->db_data_pending); - /* Our parent's buffer is one level closer to the dnode. */ - ASSERT(db->db_level == parent->db_level-1); - /* - * We're about to modify our parent's db_data by modifying - * our block pointer, so the parent must be released. - */ - ASSERT(arc_released(parent->db_buf)); - zio = parent->db_data_pending->dr_zio; - } else { - /* Our parent is the dnode itself. */ - ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && - db->db_blkid != DMU_SPILL_BLKID) || - (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); - if (db->db_blkid != DMU_SPILL_BLKID) - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - zio = dn->dn_zio; - } - - ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); - ASSERT(zio); - - SET_BOOKMARK(&zb, os->os_dsl_dataset ? - os->os_dsl_dataset->ds_object : DMU_META_OBJSET, - db->db.db_object, db->db_level, db->db_blkid); - - if (db->db_blkid == DMU_SPILL_BLKID) - wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; - - dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); - DB_DNODE_EXIT(db); - - /* - * We copy the blkptr now (rather than when we instantiate the dirty - * record), because its value can change between open context and - * syncing context. We do not need to hold dn_struct_rwlock to read - * db_blkptr because we are in syncing context. - */ - dr->dr_bp_copy = *db->db_blkptr; - - if (db->db_level == 0 && - dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * The BP for this block has been provided by open context - * (by dmu_sync() or dmu_buf_write_embedded()). - */ - abd_t *contents = (data != NULL) ? - abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; - - dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, - contents, db->db.db_size, db->db.db_size, &zp, - dbuf_write_override_ready, NULL, NULL, - dbuf_write_override_done, - dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - mutex_enter(&db->db_mtx); - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); - mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { - ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || - zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); - dr->dr_zio = zio_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, - dbuf_write_nofill_ready, NULL, NULL, - dbuf_write_nofill_done, db, - ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); - } else { - ASSERT(arc_released(data)); - - /* - * For indirect blocks, we want to setup the children - * ready callback so that we can properly handle an indirect - * block that only contains holes. 
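The children-ready callback referred to here zeroes the indirect buffer only when every child pointer is a hole, which is what allows the compression stage to collapse the whole indirect into a hole; stale non-zero words left in hole pointers would otherwise defeat that. Reduced to a toy scan (simplified, made-up types):

/* Zero an "indirect block" whose entries are all holes; toy example. */
#include <stdio.h>
#include <string.h>

struct toy_bp {
	unsigned long long birth;    /* 0 means "hole" in this toy model */
	unsigned long long dva;
};

static int
all_holes(struct toy_bp *bp, int count)
{
	for (int i = 0; i < count; i++)
		if (bp[i].birth != 0)
			return (0);
	return (1);
}

int
main(void)
{
	/* All holes, but entry 1 still carries a stale word. */
	struct toy_bp block[4] = { { 0, 0 }, { 0, 7 }, { 0, 0 }, { 0, 0 } };

	if (all_holes(block, 4))
		memset(block, 0, sizeof (block));   /* now it compresses away */
	printf("entry 1 dva after zeroing: %llu\n", block[1].dva);
	return (0);
}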
- */ - arc_write_done_func_t *children_ready_cb = NULL; - if (db->db_level != 0) - children_ready_cb = dbuf_write_children_ready; - - dr->dr_zio = arc_write(zio, os->os_spa, txg, - &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), - &zp, dbuf_write_ready, children_ready_cb, - dbuf_write_physdone, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c +++ /dev/null @@ -1,242 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#include -#include -#include - -/* - * Calculate the index of the arc header for the state, disabled by default. - */ -int zfs_dbuf_state_index = 0; - -/* - * ========================================================================== - * Dbuf Hash Read Routines - * ========================================================================== - */ -typedef struct dbuf_stats_t { - kmutex_t lock; - kstat_t *kstat; - dbuf_hash_table_t *hash; - int idx; -} dbuf_stats_t; - -static dbuf_stats_t dbuf_stats_hash_table; - -static int -dbuf_stats_hash_table_headers(char *buf, size_t size) -{ - size = snprintf(buf, size - 1, - "%-88s | %-124s | %s\n" - "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " - "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " - "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " - "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", - "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", - "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", - "atype", "index", "flags", "count", "asize", "access", "mru", "gmru", - "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", - "dtype", "btype", "data_bs", "meta_bs", "bsize", - "lvls", "dholds", "blocks", "dsize"); - buf[size] = '\0'; - - return (0); -} - -int -__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) -{ - arc_buf_info_t abi = { 0 }; - dmu_object_info_t doi = { 0 }; - dnode_t *dn = DB_DNODE(db); - - if (db->db_buf) - arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); - - if (dn) - __dmu_object_info_from_dnode(dn, &doi); - - size = snprintf(buf, size - 1, - "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " - "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " - "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " - "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", - /* dmu_buf_impl_t */ - spa_name(dn->dn_objset->os_spa), - (u_longlong_t)dmu_objset_id(db->db_objset), - (longlong_t)db->db.db_object, - (longlong_t)db->db_level, - (longlong_t)db->db_blkid, - (u_longlong_t)db->db.db_offset, - (u_longlong_t)db->db.db_size, - 
!!dbuf_is_metadata(db), - db->db_state, - (ulong_t)zfs_refcount_count(&db->db_holds), - /* arc_buf_info_t */ - abi.abi_state_type, - abi.abi_state_contents, - (longlong_t)abi.abi_state_index, - abi.abi_flags, - (ulong_t)abi.abi_bufcnt, - (u_longlong_t)abi.abi_size, - (u_longlong_t)abi.abi_access, - (ulong_t)abi.abi_mru_hits, - (ulong_t)abi.abi_mru_ghost_hits, - (ulong_t)abi.abi_mfu_hits, - (ulong_t)abi.abi_mfu_ghost_hits, - (ulong_t)abi.abi_l2arc_hits, - (u_longlong_t)abi.abi_l2arc_dattr, - (u_longlong_t)abi.abi_l2arc_asize, - abi.abi_l2arc_compress, - (ulong_t)abi.abi_holds, - /* dmu_object_info_t */ - doi.doi_type, - doi.doi_bonus_type, - (ulong_t)doi.doi_data_block_size, - (ulong_t)doi.doi_metadata_block_size, - (u_longlong_t)doi.doi_bonus_size, - (ulong_t)doi.doi_indirection, - (ulong_t)zfs_refcount_count(&dn->dn_holds), - (u_longlong_t)doi.doi_fill_count, - (u_longlong_t)doi.doi_max_offset); - buf[size] = '\0'; - - return (size); -} - -static int -dbuf_stats_hash_table_data(char *buf, size_t size, void *data) -{ - dbuf_stats_t *dsh = (dbuf_stats_t *)data; - dbuf_hash_table_t *h = dsh->hash; - dmu_buf_impl_t *db; - int length, error = 0; - - ASSERT3S(dsh->idx, >=, 0); - ASSERT3S(dsh->idx, <=, h->hash_table_mask); - memset(buf, 0, size); - - mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); - for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { - /* - * Returning ENOMEM will cause the data and header functions - * to be called with a larger scratch buffers. - */ - if (size < 512) { - error = ENOMEM; - break; - } - - mutex_enter(&db->db_mtx); - mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); - - length = __dbuf_stats_hash_table_data(buf, size, db); - buf += length; - size -= length; - - mutex_exit(&db->db_mtx); - mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); - } - mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); - - return (error); -} - -static void * -dbuf_stats_hash_table_addr(kstat_t *ksp, off_t n) -{ - dbuf_stats_t *dsh = ksp->ks_private; - - ASSERT(MUTEX_HELD(&dsh->lock)); - - if (n <= dsh->hash->hash_table_mask) { - dsh->idx = n; - return (dsh); - } - - return (NULL); -} - -#ifndef __FreeBSD__ -/* - * XXX The FreeBSD SPL is missing support for KSTAT_TYPE_RAW - * we can enable this as soon as that's implemented. See the - * lindebugfs module for similar callback semantics. 
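dbuf_stats_hash_table_data() reports ENOMEM when the scratch buffer is too small, and the kstat layer is expected to retry with a larger one. The caller side of that contract, sketched generically (hypothetical names):

/* Retry a formatter with a growing buffer until it stops returning ENOMEM. */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* Stand-in for the data callback: needs at least 512 bytes of scratch space. */
static int
format_row(char *buf, size_t size)
{
	if (size < 512)
		return (ENOMEM);
	snprintf(buf, size, "pool objset object level blkid ...\n");
	return (0);
}

int
main(void)
{
	size_t size = 128;
	char *buf = malloc(size);

	while (format_row(buf, size) == ENOMEM) {
		size *= 2;                    /* grow and try again */
		buf = realloc(buf, size);
	}
	fputs(buf, stdout);
	free(buf);
	return (0);
}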
- */ -static void -dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) -{ - dbuf_stats_t *dsh = &dbuf_stats_hash_table; - kstat_t *ksp; - - mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); - dsh->hash = hash; - - ksp = kstat_create("zfs", 0, "dbufs", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - dsh->kstat = ksp; - - if (ksp) { - ksp->ks_lock = &dsh->lock; - ksp->ks_ndata = UINT32_MAX; - ksp->ks_private = dsh; - kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, - dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); - kstat_install(ksp); - } -} - -static void -dbuf_stats_hash_table_destroy(void) -{ - dbuf_stats_t *dsh = &dbuf_stats_hash_table; - kstat_t *ksp; - - ksp = dsh->kstat; - if (ksp) - kstat_delete(ksp); - - mutex_destroy(&dsh->lock); -} -#else -static void -dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) -{ -} - -static void -dbuf_stats_hash_table_destroy(void) -{ -} -#endif - -void -dbuf_stats_init(dbuf_hash_table_t *hash) -{ - dbuf_stats_hash_table_init(hash); -} - -void -dbuf_stats_destroy(void) -{ - dbuf_stats_hash_table_destroy(); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c +++ /dev/null @@ -1,1189 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 
- */ -int zfs_dedup_prefetch = 1; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS DEDUP"); -SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch, - 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed"); - -static const ddt_ops_t *ddt_ops[DDT_TYPES] = { - &ddt_zap_ops, -}; - -static const char *ddt_class_name[DDT_CLASSES] = { - "ditto", - "duplicate", - "unique", -}; - -static void -ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_tx_t *tx) -{ - spa_t *spa = ddt->ddt_spa; - objset_t *os = ddt->ddt_os; - uint64_t *objectp = &ddt->ddt_object[type][class]; - boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP; - char name[DDT_NAMELEN]; - - ddt_object_name(ddt, type, class, name); - - ASSERT(*objectp == 0); - VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); - ASSERT(*objectp != 0); - - VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx) == 0); - - VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, - sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); -} - -static void -ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_tx_t *tx) -{ - spa_t *spa = ddt->ddt_spa; - objset_t *os = ddt->ddt_os; - uint64_t *objectp = &ddt->ddt_object[type][class]; - uint64_t count; - char name[DDT_NAMELEN]; - - ddt_object_name(ddt, type, class, name); - - ASSERT(*objectp != 0); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); - ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); - VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); - VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); - VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); - bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); - - *objectp = 0; -} - -static int -ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; - dmu_object_info_t doi; - uint64_t count; - char name[DDT_NAMELEN]; - int error; - - ddt_object_name(ddt, type, class, name); - - error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); - - if (error != 0) - return (error); - - VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, - sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class])); - - /* - * Seed the cached statistics. - */ - VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); - - error = ddt_object_count(ddt, type, class, &count); - if (error) - return error; - - ddo->ddo_count = count; - ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; - ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; - - return (0); -} - -static void -ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_tx_t *tx) -{ - ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; - dmu_object_info_t doi; - uint64_t count; - char name[DDT_NAMELEN]; - - ddt_object_name(ddt, type, class, name); - - VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, - sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); - - /* - * Cache DDT statistics; this is the only time they'll change. 
- */ - VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0); - - ddo->ddo_count = count; - ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; - ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; -} - -static int -ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde) -{ - if (!ddt_object_exists(ddt, type, class)) - return (SET_ERROR(ENOENT)); - - return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, - ddt->ddt_object[type][class], dde)); -} - -static void -ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde) -{ - if (!ddt_object_exists(ddt, type, class)) - return; - - ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, - ddt->ddt_object[type][class], dde); -} - -int -ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde, dmu_tx_t *tx) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); -} - -static int -ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde, dmu_tx_t *tx) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); -} - -int -ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - uint64_t *walk, ddt_entry_t *dde) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, - ddt->ddt_object[type][class], dde, walk)); -} - -int -ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count) -{ - ASSERT(ddt_object_exists(ddt, type, class)); - - return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, - ddt->ddt_object[type][class], count)); -} - -int -ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - dmu_object_info_t *doi) -{ - if (!ddt_object_exists(ddt, type, class)) - return (SET_ERROR(ENOENT)); - - return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], - doi)); -} - -boolean_t -ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - return (!!ddt->ddt_object[type][class]); -} - -void -ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - char *name) -{ - (void) sprintf(name, DMU_POOL_DDT, - zio_checksum_table[ddt->ddt_checksum].ci_name, - ddt_ops[type]->ddt_op_name, ddt_class_name[class]); -} - -void -ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) -{ - ASSERT(txg != 0); - - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - bp->blk_dva[d] = ddp->ddp_dva[d]; - BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); -} - -void -ddt_bp_create(enum zio_checksum checksum, - const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) -{ - BP_ZERO(bp); - - if (ddp != NULL) - ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); - - bp->blk_cksum = ddk->ddk_cksum; - bp->blk_fill = 1; - - BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); - BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); - BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); - BP_SET_CHECKSUM(bp, checksum); - BP_SET_TYPE(bp, DMU_OT_DEDUP); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); -} - -void -ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) -{ - ddk->ddk_cksum = bp->blk_cksum; - ddk->ddk_prop = 0; - - DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); - DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); - DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); -} - -void -ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t 
*bp) -{ - ASSERT(ddp->ddp_phys_birth == 0); - - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); -} - -void -ddt_phys_clear(ddt_phys_t *ddp) -{ - bzero(ddp, sizeof (*ddp)); -} - -void -ddt_phys_addref(ddt_phys_t *ddp) -{ - ddp->ddp_refcnt++; -} - -void -ddt_phys_decref(ddt_phys_t *ddp) -{ - if (ddp) { - ASSERT((int64_t)ddp->ddp_refcnt > 0); - ddp->ddp_refcnt--; - } -} - -void -ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) -{ - blkptr_t blk; - - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); - ddt_phys_clear(ddp); - zio_free(ddt->ddt_spa, txg, &blk); -} - -ddt_phys_t * -ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) -{ - ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) - return (ddp); - } - return (NULL); -} - -uint64_t -ddt_phys_total_refcnt(const ddt_entry_t *dde) -{ - uint64_t refcnt = 0; - - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) - refcnt += dde->dde_phys[p].ddp_refcnt; - - return (refcnt); -} - -static void -ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) -{ - spa_t *spa = ddt->ddt_spa; - ddt_phys_t *ddp = dde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - uint64_t lsize = DDK_GET_LSIZE(ddk); - uint64_t psize = DDK_GET_PSIZE(ddk); - - bzero(dds, sizeof (*dds)); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - uint64_t dsize = 0; - uint64_t refcnt = ddp->ddp_refcnt; - - if (ddp->ddp_phys_birth == 0) - continue; - - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); - - dds->dds_blocks += 1; - dds->dds_lsize += lsize; - dds->dds_psize += psize; - dds->dds_dsize += dsize; - - dds->dds_ref_blocks += refcnt; - dds->dds_ref_lsize += lsize * refcnt; - dds->dds_ref_psize += psize * refcnt; - dds->dds_ref_dsize += dsize * refcnt; - } -} - -void -ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) -{ - const uint64_t *s = (const uint64_t *)src; - uint64_t *d = (uint64_t *)dst; - uint64_t *d_end = (uint64_t *)(dst + 1); - - ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ - - while (d < d_end) - *d++ += (*s++ ^ neg) - neg; -} - -static void -ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) -{ - ddt_stat_t dds; - ddt_histogram_t *ddh; - int bucket; - - ddt_stat_generate(ddt, dde, &dds); - - bucket = highbit64(dds.dds_ref_blocks) - 1; - ASSERT(bucket >= 0); - - ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; - - ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); -} - -void -ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) -{ - for (int h = 0; h < 64; h++) - ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); -} - -void -ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) -{ - bzero(dds, sizeof (*dds)); - - for (int h = 0; h < 64; h++) - ddt_stat_add(dds, &ddh->ddh_stat[h], 0); -} - -boolean_t -ddt_histogram_empty(const ddt_histogram_t *ddh) -{ - const uint64_t *s = (const uint64_t *)ddh; - const uint64_t *s_end = (const uint64_t *)(ddh + 1); - - while (s < s_end) - if (*s++ != 0) - return (B_FALSE); - - return (B_TRUE); -} - -void -ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) -{ - /* Sum the statistics we cached in ddt_object_sync(). 
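As a side note, ddt_stat_add() above folds addition and subtraction into one loop: with neg equal to 0 or -1ULL, the expression (x ^ neg) - neg yields x or its two's-complement negation, so the same pass either accumulates or backs out a set of counters. A minimal standalone illustration of that identity (hypothetical names, not part of the removed file):

#include <assert.h>
#include <stdint.h>

/* Add (neg == 0) or subtract (neg == -1ULL) src[] into dst[] in one loop. */
static void
stat_accumulate(uint64_t *dst, const uint64_t *src, int n, uint64_t neg)
{
	assert(neg == 0 || neg == (uint64_t)-1);

	for (int i = 0; i < n; i++)
		dst[i] += (src[i] ^ neg) - neg;
}

int
main(void)
{
	uint64_t totals[2] = { 10, 20 };
	uint64_t delta[2] = { 3, 4 };

	stat_accumulate(totals, delta, 2, 0);			/* add: 13, 24 */
	stat_accumulate(totals, delta, 2, (uint64_t)-1);	/* subtract: 10, 20 */
	assert(totals[0] == 10 && totals[1] == 20);
	return (0);
}

The same convention is visible elsewhere in this file: ddt_stat_update() is called with 0 to add an entry's contribution to its histogram bucket and with -1ULL to remove it.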
*/ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_object_t *ddo = - &ddt->ddt_object_stats[type][class]; - ddo_total->ddo_count += ddo->ddo_count; - ddo_total->ddo_dspace += ddo->ddo_dspace; - ddo_total->ddo_mspace += ddo->ddo_mspace; - } - } - } - - /* ... and compute the averages. */ - if (ddo_total->ddo_count != 0) { - ddo_total->ddo_dspace /= ddo_total->ddo_count; - ddo_total->ddo_mspace /= ddo_total->ddo_count; - } -} - -void -ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) -{ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_histogram_add(ddh, - &ddt->ddt_histogram_cache[type][class]); - } - } - } -} - -void -ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) -{ - ddt_histogram_t *ddh_total; - - ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); - ddt_get_dedup_histogram(spa, ddh_total); - ddt_histogram_stat(dds_total, ddh_total); - kmem_free(ddh_total, sizeof (ddt_histogram_t)); -} - -uint64_t -ddt_get_dedup_dspace(spa_t *spa) -{ - ddt_stat_t dds_total = { 0 }; - - ddt_get_dedup_stats(spa, &dds_total); - return (dds_total.dds_ref_dsize - dds_total.dds_dsize); -} - -uint64_t -ddt_get_pool_dedup_ratio(spa_t *spa) -{ - ddt_stat_t dds_total = { 0 }; - - ddt_get_dedup_stats(spa, &dds_total); - if (dds_total.dds_dsize == 0) - return (100); - - return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); -} - -int -ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) -{ - spa_t *spa = ddt->ddt_spa; - uint64_t total_refcnt = 0; - uint64_t ditto = spa->spa_dedup_ditto; - int total_copies = 0; - int desired_copies = 0; - - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - zio_t *zio = dde->dde_lead_zio[p]; - uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ - if (zio != NULL) - refcnt += zio->io_parent_count; /* pending refs */ - if (ddp == ddp_willref) - refcnt++; /* caller's ref */ - if (refcnt != 0) { - total_refcnt += refcnt; - total_copies += p; - } - } - - if (ditto == 0 || ditto > UINT32_MAX) - ditto = UINT32_MAX; - - if (total_refcnt >= 1) - desired_copies++; - if (total_refcnt >= ditto) - desired_copies++; - if (total_refcnt >= ditto * ditto) - desired_copies++; - - return (MAX(desired_copies, total_copies) - total_copies); -} - -int -ddt_ditto_copies_present(ddt_entry_t *dde) -{ - ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; - dva_t *dva = ddp->ddp_dva; - int copies = 0 - DVA_GET_GANG(dva); - - for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) - if (DVA_IS_VALID(dva)) - copies++; - - ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); - - return (copies); -} - -size_t -ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) -{ - uchar_t *version = dst++; - int cpfunc = ZIO_COMPRESS_ZLE; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - size_t c_len; - - ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ - - c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); - - if (c_len == s_len) { - cpfunc = ZIO_COMPRESS_OFF; - bcopy(src, dst, s_len); - } - - *version = cpfunc; - /* CONSTCOND */ - if (ZFS_HOST_BYTEORDER) - *version |= DDT_COMPRESS_BYTEORDER_MASK; - - return (c_len + 
1); -} - -void -ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) -{ - uchar_t version = *src++; - int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - - if (ci->ci_decompress != NULL) - (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); - else - bcopy(src, dst, d_len); - - if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) != - (ZFS_HOST_BYTEORDER != 0)) - byteswap_uint64_array(dst, d_len); -} - -ddt_t * -ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) -{ - return (spa->spa_ddt[c]); -} - -ddt_t * -ddt_select(spa_t *spa, const blkptr_t *bp) -{ - return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); -} - -void -ddt_enter(ddt_t *ddt) -{ - mutex_enter(&ddt->ddt_lock); -} - -void -ddt_exit(ddt_t *ddt) -{ - mutex_exit(&ddt->ddt_lock); -} - -static ddt_entry_t * -ddt_alloc(const ddt_key_t *ddk) -{ - ddt_entry_t *dde; - - dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); - cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); - - dde->dde_key = *ddk; - - return (dde); -} - -static void -ddt_free(ddt_entry_t *dde) -{ - ASSERT(!dde->dde_loading); - - for (int p = 0; p < DDT_PHYS_TYPES; p++) - ASSERT(dde->dde_lead_zio[p] == NULL); - - if (dde->dde_repair_abd != NULL) - abd_free(dde->dde_repair_abd); - - cv_destroy(&dde->dde_cv); - kmem_free(dde, sizeof (*dde)); -} - -void -ddt_remove(ddt_t *ddt, ddt_entry_t *dde) -{ - ASSERT(MUTEX_HELD(&ddt->ddt_lock)); - - avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); -} - -ddt_entry_t * -ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) -{ - ddt_entry_t *dde, dde_search; - enum ddt_type type; - enum ddt_class class; - avl_index_t where; - int error; - - ASSERT(MUTEX_HELD(&ddt->ddt_lock)); - - ddt_key_fill(&dde_search.dde_key, bp); - - dde = avl_find(&ddt->ddt_tree, &dde_search, &where); - if (dde == NULL) { - if (!add) - return (NULL); - dde = ddt_alloc(&dde_search.dde_key); - avl_insert(&ddt->ddt_tree, dde, where); - } - - while (dde->dde_loading) - cv_wait(&dde->dde_cv, &ddt->ddt_lock); - - if (dde->dde_loaded) - return (dde); - - dde->dde_loading = B_TRUE; - - ddt_exit(ddt); - - error = ENOENT; - - for (type = 0; type < DDT_TYPES; type++) { - for (class = 0; class < DDT_CLASSES; class++) { - error = ddt_object_lookup(ddt, type, class, dde); - if (error != ENOENT) { - ASSERT0(error); - break; - } - } - if (error != ENOENT) - break; - } - - ddt_enter(ddt); - - ASSERT(dde->dde_loaded == B_FALSE); - ASSERT(dde->dde_loading == B_TRUE); - - dde->dde_type = type; /* will be DDT_TYPES if no entry found */ - dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ - dde->dde_loaded = B_TRUE; - dde->dde_loading = B_FALSE; - - if (error == 0) - ddt_stat_update(ddt, dde, -1ULL); - - cv_broadcast(&dde->dde_cv); - - return (dde); -} - -void -ddt_prefetch(spa_t *spa, const blkptr_t *bp) -{ - ddt_t *ddt; - ddt_entry_t dde; - - if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) - return; - - /* - * We only remove the DDT once all tables are empty and only - * prefetch dedup blocks when there are entries in the DDT. - * Thus no locking is required as the DDT can't disappear on us. 
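ddt_compress()/ddt_decompress() above record the writer's byte order in a flag bit of the leading version byte, and the reader byteswaps the payload only when that bit disagrees with the host. A self-contained sketch of that convention follows; the header layout and names are illustrative, not the on-disk format.

#include <stddef.h>
#include <stdint.h>

#define	HDR_BIG_ENDIAN	0x80	/* hypothetical flag bit in the header byte */

static int
host_is_big_endian(void)
{
	const uint16_t one = 1;

	return (*(const uint8_t *)&one == 0);
}

static uint64_t
bswap64(uint64_t v)
{
	return (((v & 0x00000000000000ffULL) << 56) |
	    ((v & 0x000000000000ff00ULL) << 40) |
	    ((v & 0x0000000000ff0000ULL) << 24) |
	    ((v & 0x00000000ff000000ULL) << 8) |
	    ((v & 0x000000ff00000000ULL) >> 8) |
	    ((v & 0x0000ff0000000000ULL) >> 24) |
	    ((v & 0x00ff000000000000ULL) >> 40) |
	    ((v & 0xff00000000000000ULL) >> 56));
}

/* Swap the payload only when the recorded byte order differs from the host's. */
static void
fix_byteorder(uint8_t hdr, uint64_t *payload, size_t nwords)
{
	if (((hdr & HDR_BIG_ENDIAN) != 0) != (host_is_big_endian() != 0)) {
		for (size_t i = 0; i < nwords; i++)
			payload[i] = bswap64(payload[i]);
	}
}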
- */ - ddt = ddt_select(spa, bp); - ddt_key_fill(&dde.dde_key, bp); - - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - ddt_object_prefetch(ddt, type, class, &dde); - } - } -} - -/* - * Opaque struct used for ddt_key comparison - */ -#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t)) - -typedef struct ddt_key_cmp { - uint16_t u16[DDT_KEY_CMP_LEN]; -} ddt_key_cmp_t; - -int -ddt_entry_compare(const void *x1, const void *x2) -{ - const ddt_entry_t *dde1 = x1; - const ddt_entry_t *dde2 = x2; - const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key; - const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key; - int32_t cmp = 0; - - for (int i = 0; i < DDT_KEY_CMP_LEN; i++) { - cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i]; - if (likely(cmp)) - break; - } - - return (AVL_ISIGN(cmp)); -} - -static ddt_t * -ddt_table_alloc(spa_t *spa, enum zio_checksum c) -{ - ddt_t *ddt; - - ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); - - mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&ddt->ddt_tree, ddt_entry_compare, - sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); - avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, - sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); - ddt->ddt_checksum = c; - ddt->ddt_spa = spa; - ddt->ddt_os = spa->spa_meta_objset; - - return (ddt); -} - -static void -ddt_table_free(ddt_t *ddt) -{ - ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); - ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); - avl_destroy(&ddt->ddt_tree); - avl_destroy(&ddt->ddt_repair_tree); - mutex_destroy(&ddt->ddt_lock); - kmem_free(ddt, sizeof (*ddt)); -} - -void -ddt_create(spa_t *spa) -{ - spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; - - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) - spa->spa_ddt[c] = ddt_table_alloc(spa, c); -} - -int -ddt_load(spa_t *spa) -{ - int error; - - ddt_create(spa); - - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, - &spa->spa_ddt_stat_object); - - if (error) - return (error == ENOENT ? 0 : error); - - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - error = ddt_object_load(ddt, type, class); - if (error != 0 && error != ENOENT) - return (error); - } - } - - /* - * Seed the cached histograms. 
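ddt_entry_compare() above scans the key as an array of 16-bit words and reduces the first difference to a sign for the AVL tree. A standalone sketch of that comparator shape (the word count and names are illustrative):

#include <stdint.h>

#define	KEY_CMP_LEN	20	/* hypothetical key length in 16-bit words */

static int
key_compare(const void *x1, const void *x2)
{
	const uint16_t *k1 = x1;
	const uint16_t *k2 = x2;

	for (int i = 0; i < KEY_CMP_LEN; i++) {
		int32_t cmp = (int32_t)k1[i] - (int32_t)k2[i];

		if (cmp != 0)
			return (cmp > 0 ? 1 : -1);
	}
	return (0);
}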
- */ - bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, - sizeof (ddt->ddt_histogram)); - } - - return (0); -} - -void -ddt_unload(spa_t *spa) -{ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - if (spa->spa_ddt[c]) { - ddt_table_free(spa->spa_ddt[c]); - spa->spa_ddt[c] = NULL; - } - } -} - -boolean_t -ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) -{ - ddt_t *ddt; - ddt_entry_t dde; - - if (!BP_GET_DEDUP(bp)) - return (B_FALSE); - - if (max_class == DDT_CLASS_UNIQUE) - return (B_TRUE); - - ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; - - ddt_key_fill(&dde.dde_key, bp); - - for (enum ddt_type type = 0; type < DDT_TYPES; type++) - for (enum ddt_class class = 0; class <= max_class; class++) - if (ddt_object_lookup(ddt, type, class, &dde) == 0) - return (B_TRUE); - - return (B_FALSE); -} - -ddt_entry_t * -ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) -{ - ddt_key_t ddk; - ddt_entry_t *dde; - - ddt_key_fill(&ddk, bp); - - dde = ddt_alloc(&ddk); - - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - /* - * We can only do repair if there are multiple copies - * of the block. For anything in the UNIQUE class, - * there's definitely only one copy, so don't even try. - */ - if (class != DDT_CLASS_UNIQUE && - ddt_object_lookup(ddt, type, class, dde) == 0) - return (dde); - } - } - - bzero(dde->dde_phys, sizeof (dde->dde_phys)); - - return (dde); -} - -void -ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) -{ - avl_index_t where; - - ddt_enter(ddt); - - if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && - avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) - avl_insert(&ddt->ddt_repair_tree, dde, where); - else - ddt_free(dde); - - ddt_exit(ddt); -} - -static void -ddt_repair_entry_done(zio_t *zio) -{ - ddt_entry_t *rdde = zio->io_private; - - ddt_free(rdde); -} - -static void -ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) -{ - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *rddp = rdde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - ddt_key_t *rddk = &rdde->dde_key; - zio_t *zio; - blkptr_t blk; - - zio = zio_null(rio, rio->io_spa, NULL, - ddt_repair_entry_done, rdde, rio->io_flags); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth != rddp->ddp_phys_birth || - bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) - continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); - zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, - rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, - ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); - } - - zio_nowait(zio); -} - -static void -ddt_repair_table(ddt_t *ddt, zio_t *rio) -{ - spa_t *spa = ddt->ddt_spa; - ddt_entry_t *dde, *rdde_next, *rdde; - avl_tree_t *t = &ddt->ddt_repair_tree; - blkptr_t blk; - - if (spa_sync_pass(spa) > 1) - return; - - ddt_enter(ddt); - for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { - rdde_next = AVL_NEXT(t, rdde); - avl_remove(&ddt->ddt_repair_tree, rdde); - ddt_exit(ddt); - ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); - dde = ddt_repair_start(ddt, &blk); - ddt_repair_entry(ddt, dde, rdde, rio); - ddt_repair_done(ddt, dde); - ddt_enter(ddt); - } - ddt_exit(ddt); -} - -static void -ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) -{ - dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; - ddt_phys_t *ddp = dde->dde_phys; - ddt_key_t *ddk 
= &dde->dde_key; - enum ddt_type otype = dde->dde_type; - enum ddt_type ntype = DDT_TYPE_CURRENT; - enum ddt_class oclass = dde->dde_class; - enum ddt_class nclass; - uint64_t total_refcnt = 0; - - ASSERT(dde->dde_loaded); - ASSERT(!dde->dde_loading); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - ASSERT(dde->dde_lead_zio[p] == NULL); - ASSERT((int64_t)ddp->ddp_refcnt >= 0); - if (ddp->ddp_phys_birth == 0) { - ASSERT(ddp->ddp_refcnt == 0); - continue; - } - if (p == DDT_PHYS_DITTO) { - if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) - ddt_phys_free(ddt, ddk, ddp, txg); - continue; - } - if (ddp->ddp_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, txg); - total_refcnt += ddp->ddp_refcnt; - } - - if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) - nclass = DDT_CLASS_DITTO; - else if (total_refcnt > 1) - nclass = DDT_CLASS_DUPLICATE; - else - nclass = DDT_CLASS_UNIQUE; - - if (otype != DDT_TYPES && - (otype != ntype || oclass != nclass || total_refcnt == 0)) { - VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); - ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); - } - - if (total_refcnt != 0) { - dde->dde_type = ntype; - dde->dde_class = nclass; - ddt_stat_update(ddt, dde, 0); - if (!ddt_object_exists(ddt, ntype, nclass)) - ddt_object_create(ddt, ntype, nclass, tx); - VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); - - /* - * If the class changes, the order that we scan this bp - * changes. If it decreases, we could miss it, so - * scan it right now. (This covers both class changing - * while we are doing ddt_walk(), and when we are - * traversing.) - */ - if (nclass < oclass) { - dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, dde, tx); - } - } -} - -static void -ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) -{ - spa_t *spa = ddt->ddt_spa; - ddt_entry_t *dde; - void *cookie = NULL; - - if (avl_numnodes(&ddt->ddt_tree) == 0) - return; - - ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); - - if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, tx); - } - - while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { - ddt_sync_entry(ddt, dde, tx, txg); - ddt_free(dde); - } - - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - uint64_t add, count = 0; - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - if (ddt_object_exists(ddt, type, class)) { - ddt_object_sync(ddt, type, class, tx); - VERIFY(ddt_object_count(ddt, type, class, - &add) == 0); - count += add; - } - } - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - if (count == 0 && ddt_object_exists(ddt, type, class)) - ddt_object_destroy(ddt, type, class, tx); - } - } - - bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, - sizeof (ddt->ddt_histogram)); -} - -void -ddt_sync(spa_t *spa, uint64_t txg) -{ - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - dmu_tx_t *tx; - zio_t *rio; - - ASSERT(spa_syncing_txg(spa) == txg); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - rio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); - - /* - * This function may cause an immediate scan of ddt blocks (see - * the comment above dsl_scan_ddt() for details). We set the - * scan's root zio here so that we can wait for any scan IOs in - * addition to the regular ddt IOs. 
- */ - ASSERT3P(scn->scn_zio_root, ==, NULL); - scn->scn_zio_root = rio; - - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - if (ddt == NULL) - continue; - ddt_sync_table(ddt, tx, txg); - ddt_repair_table(ddt, rio); - } - - (void) zio_wait(rio); - scn->scn_zio_root = NULL; - - dmu_tx_commit(tx); -} - -int -ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) -{ - do { - do { - do { - ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; - int error = ENOENT; - if (ddt_object_exists(ddt, ddb->ddb_type, - ddb->ddb_class)) { - error = ddt_object_walk(ddt, - ddb->ddb_type, ddb->ddb_class, - &ddb->ddb_cursor, dde); - } - dde->dde_type = ddb->ddb_type; - dde->dde_class = ddb->ddb_class; - if (error == 0) - return (0); - if (error != ENOENT) - return (error); - ddb->ddb_cursor = 0; - } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); - ddb->ddb_checksum = 0; - } while (++ddb->ddb_type < DDT_TYPES); - ddb->ddb_type = 0; - } while (++ddb->ddb_class < DDT_CLASSES); - - return (SET_ERROR(ENOENT)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -int ddt_zap_leaf_blockshift = 12; -int ddt_zap_indirect_blockshift = 12; - -static int -ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) -{ - zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; - - if (prehash) - flags |= ZAP_FLAG_PRE_HASHED_KEY; - - *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, - ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, - DMU_OT_NONE, 0, tx); - - return (*objectp == 0 ? 
ENOTSUP : 0); -} - -static int -ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - return (zap_destroy(os, object, tx)); -} - -static int -ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) -{ - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; - uint64_t one, csize; - int error; - - error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, &one, &csize); - if (error) - return (error); - - ASSERT(one == 1); - ASSERT(csize <= sizeof (cbuf)); - - error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, 1, csize, cbuf); - if (error) - return (error); - - ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); - - return (0); -} - -static void -ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) -{ - (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS); -} - -static int -ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) -{ - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; - uint64_t csize; - - csize = ddt_compress(dde->dde_phys, cbuf, - sizeof (dde->dde_phys), sizeof (cbuf)); - - return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, 1, csize, cbuf, tx)); -} - -static int -ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) -{ - return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, tx)); -} - -static int -ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) -{ - zap_cursor_t zc; - zap_attribute_t za; - int error; - - if (*walk == 0) { - /* - * We don't want to prefetch the entire ZAP object, because - * it can be enormous. Also the primary use of DDT iteration - * is for scrubbing, in which case we will be issuing many - * scrub i/os for each ZAP block that we read in, so - * reading the ZAP is unlikely to be the bottleneck. - */ - zap_cursor_init_noprefetch(&zc, os, object); - } else { - zap_cursor_init_serialized(&zc, os, object, *walk); - } - if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; - uint64_t csize = za.za_num_integers; - ASSERT(za.za_integer_length == 1); - error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, - DDT_KEY_WORDS, 1, csize, cbuf); - ASSERT(error == 0); - if (error == 0) { - ddt_decompress(cbuf, dde->dde_phys, csize, - sizeof (dde->dde_phys)); - dde->dde_key = *(ddt_key_t *)za.za_name; - } - zap_cursor_advance(&zc); - *walk = zap_cursor_serialize(&zc); - } - zap_cursor_fini(&zc); - return (error); -} - -static int -ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count) -{ - - return (zap_count(os, object, count)); -} - -const ddt_ops_t ddt_zap_ops = { - "zap", - ddt_zap_create, - ddt_zap_destroy, - ddt_zap_lookup, - ddt_zap_prefetch, - ddt_zap_update, - ddt_zap_remove, - ddt_zap_walk, - ddt_zap_count, -}; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ /dev/null @@ -1,2748 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2019 Datto Inc. - */ -/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ -/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ -/* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#include -#include -#endif - -/* - * Enable/disable nopwrite feature. - */ -int zfs_nopwrite_enabled = 1; -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, - &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); - -/* - * Tunable to control percentage of dirtied L1 blocks from frees allowed into - * one TXG. After this threshold is crossed, additional dirty blocks from frees - * will wait until the next TXG. - * A value of zero will disable this throttle. - */ -uint32_t zfs_per_txg_dirty_frees_percent = 5; -SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, - &zfs_per_txg_dirty_frees_percent, 0, - "Percentage of dirtied indirect blocks from frees allowed in one txg"); - -/* - * This can be used for testing, to ensure that certain actions happen - * while in the middle of a remap (which might otherwise complete too - * quickly). - */ -int zfs_object_remap_one_indirect_delay_ticks = 0; - -/* - * Limit the amount we can prefetch with one call to this amount. This - * helps to limit the amount of memory that can be used by prefetching. - * Larger objects should be prefetched a bit at a time. 
- */ -uint64_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; - -const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, - { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, - { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, - { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, - { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, - { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, - { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } -}; - -const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { - { byteswap_uint8_array, "uint8" }, - { byteswap_uint16_array, "uint16" }, - { byteswap_uint32_array, "uint32" }, - { byteswap_uint64_array, "uint64" }, - { zap_byteswap, "zap" }, - { dnode_buf_byteswap, "dnode" }, - { dmu_objset_byteswap, "objset" }, - { zfs_znode_byteswap, "znode" }, - { zfs_oldacl_byteswap, "oldacl" }, - { zfs_acl_byteswap, "acl" } -}; - -int -dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp) -{ - 
uint64_t blkid; - dmu_buf_impl_t *db; - - blkid = dbuf_whichblock(dn, 0, offset); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold(dn, blkid, tag); - rw_exit(&dn->dn_struct_rwlock); - - if (db == NULL) { - *dbp = NULL; - return (SET_ERROR(EIO)); - } - - *dbp = &db->db; - return (0); -} -int -dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp) -{ - dnode_t *dn; - uint64_t blkid; - dmu_buf_impl_t *db; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - blkid = dbuf_whichblock(dn, 0, offset); - rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold(dn, blkid, tag); - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - - if (db == NULL) { - *dbp = NULL; - return (SET_ERROR(EIO)); - } - - *dbp = &db->db; - return (err); -} - -int -dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) -{ - int err; - int db_flags = DB_RF_CANFAIL; - - if (flags & DMU_READ_NO_PREFETCH) - db_flags |= DB_RF_NOPREFETCH; - - err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); - if (err == 0) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); - err = dbuf_read(db, NULL, db_flags); - if (err != 0) { - dbuf_rele(db, tag); - *dbp = NULL; - } - } - - return (err); -} - -int -dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) -{ - int err; - int db_flags = DB_RF_CANFAIL; - - if (flags & DMU_READ_NO_PREFETCH) - db_flags |= DB_RF_NOPREFETCH; - - err = dmu_buf_hold_noread(os, object, offset, tag, dbp); - if (err == 0) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); - err = dbuf_read(db, NULL, db_flags); - if (err != 0) { - dbuf_rele(db, tag); - *dbp = NULL; - } - } - - return (err); -} - -int -dmu_bonus_max(void) -{ - return (DN_OLD_MAX_BONUSLEN); -} - -int -dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - int error; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - if (dn->dn_bonus != db) { - error = SET_ERROR(EINVAL); - } else if (newsize < 0 || newsize > db_fake->db_size) { - error = SET_ERROR(EINVAL); - } else { - dnode_setbonuslen(dn, newsize, tx); - error = 0; - } - - DB_DNODE_EXIT(db); - return (error); -} - -int -dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - int error; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - if (!DMU_OT_IS_VALID(type)) { - error = SET_ERROR(EINVAL); - } else if (dn->dn_bonus != db) { - error = SET_ERROR(EINVAL); - } else { - dnode_setbonus_type(dn, type, tx); - error = 0; - } - - DB_DNODE_EXIT(db); - return (error); -} - -dmu_object_type_t -dmu_get_bonustype(dmu_buf_t *db_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - dmu_object_type_t type; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - type = dn->dn_bonustype; - DB_DNODE_EXIT(db); - - return (type); -} - -int -dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - dnode_t *dn; - int error; - - error = dnode_hold(os, object, FTAG, &dn); - dbuf_rm_spill(dn, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dnode_rm_spill(dn, tx); - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - return (error); -} - -/* - * returns ENOENT, EIO, or 0. 
- */ -int -dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) -{ - dnode_t *dn; - dmu_buf_impl_t *db; - int error; - - error = dnode_hold(os, object, FTAG, &dn); - if (error) - return (error); - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_bonus == NULL) { - rw_exit(&dn->dn_struct_rwlock); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus == NULL) - dbuf_create_bonus(dn); - } - db = dn->dn_bonus; - - /* as long as the bonus buf is held, the dnode will be held */ - if (zfs_refcount_add(&db->db_holds, tag) == 1) { - VERIFY(dnode_add_ref(dn, db)); - atomic_inc_32(&dn->dn_dbufs_count); - } - - /* - * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's - * hold and incrementing the dbuf count to ensure that dnode_move() sees - * a dnode hold for every dbuf. - */ - rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); - - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); - - *dbp = &db->db; - return (0); -} - -/* - * returns ENOENT, EIO, or 0. - * - * This interface will allocate a blank spill dbuf when a spill blk - * doesn't already exist on the dnode. - * - * if you only want to find an already existing spill db, then - * dmu_spill_hold_existing() should be used. - */ -int -dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) -{ - dmu_buf_impl_t *db = NULL; - int err; - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); - - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); - - ASSERT(db != NULL); - err = dbuf_read(db, NULL, flags); - if (err == 0) - *dbp = &db->db; - else - dbuf_rele(db, tag); - return (err); -} - -int -dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; - dnode_t *dn; - int err; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { - err = SET_ERROR(EINVAL); - } else { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - if (!dn->dn_have_spill) { - err = SET_ERROR(ENOENT); - } else { - err = dmu_spill_hold_by_dnode(dn, - DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); - } - - rw_exit(&dn->dn_struct_rwlock); - } - - DB_DNODE_EXIT(db); - return (err); -} - -int -dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; - dnode_t *dn; - int err; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); - DB_DNODE_EXIT(db); - - return (err); -} - -/* - * Note: longer-term, we should modify all of the dmu_buf_*() interfaces - * to take a held dnode rather than -- the lookup is wasteful, - * and can induce severe lock contention when writing to several files - * whose dnodes are in the same block. - */ -int -dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) -{ - dmu_buf_t **dbp; - uint64_t blkid, nblks, i; - uint32_t dbuf_flags; - int err; - zio_t *zio; - - ASSERT(length <= DMU_MAX_ACCESS); - - /* - * Note: We directly notify the prefetch code of this read, so that - * we can tell it about the multi-block read. dbuf_read() only knows - * about the one block it is accessing. 
- */ - dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | - DB_RF_NOPREFETCH; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; - } else { - if (offset + length > dn->dn_datablksz) { - zfs_panic_recover("zfs: accessing past end of object " - "%llx/%llx (size=%u access=%llu+%llu)", - (longlong_t)dn->dn_objset-> - os_dsl_dataset->ds_object, - (longlong_t)dn->dn_object, dn->dn_datablksz, - (longlong_t)offset, (longlong_t)length); - rw_exit(&dn->dn_struct_rwlock); - return (SET_ERROR(EIO)); - } - nblks = 1; - } - dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - -#if defined(_KERNEL) && defined(RACCT) - if (racct_enable && !read) { - PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_WRITEBPS, length); - racct_add_force(curproc, RACCT_WRITEIOPS, nblks); - PROC_UNLOCK(curproc); - } -#endif - - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - blkid = dbuf_whichblock(dn, 0, offset); - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); - if (db == NULL) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_rele_array(dbp, nblks, tag); - zio_nowait(zio); - return (SET_ERROR(EIO)); - } - - /* initiate async i/o */ - if (read) - (void) dbuf_read(db, zio, dbuf_flags); -#ifdef _KERNEL - else - curthread->td_ru.ru_oublock++; -#endif - dbp[i] = &db->db; - } - - if ((flags & DMU_READ_NO_PREFETCH) == 0 && - DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { - dmu_zfetch(&dn->dn_zfetch, blkid, nblks, - read && DNODE_IS_CACHEABLE(dn)); - } - rw_exit(&dn->dn_struct_rwlock); - - /* wait for async i/o */ - err = zio_wait(zio); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - - /* wait for other io to complete */ - if (read) { - for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) - cv_wait(&db->db_changed, &db->db_mtx); - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - mutex_exit(&db->db_mtx); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - } - } - - *numbufsp = nblks; - *dbpp = dbp; - return (0); -} - -static int -dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); - - dnode_rele(dn, FTAG); - - return (err); -} - -int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, boolean_t read, void *tag, int *numbufsp, - dmu_buf_t ***dbpp) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - int err; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); - DB_DNODE_EXIT(db); - - return (err); -} - -void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) -{ - int i; - dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; - - if (numbufs == 0) - return; - - for (i = 0; i < numbufs; i++) { - if (dbp[i]) - dbuf_rele(dbp[i], tag); - } - - kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); -} - -/* - * Issue prefetch i/os for the given blocks. 
If level is greater than 0, the - * indirect blocks prefeteched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. - * - * Note that if the indirect blocks above the blocks being prefetched are not in - * cache, they will be asychronously read in. - */ -void -dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, - uint64_t len, zio_priority_t pri) -{ - dnode_t *dn; - uint64_t blkid; - int nblks, err; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); - - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); - return; - } - - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. - */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. - */ - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; - } else { - nblks = (offset < dn->dn_datablksz); - } - - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } - - rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); -} - -/* - * Get the next "chunk" of file data to free. We traverse the file from - * the end so that the file gets shorter over time (if we crashes in the - * middle, this will leave us in a better state). We find allocated file - * data by simply searching the allocated level 1 indirects. - * - * On input, *start should be the first offset that does not need to be - * freed (e.g. "offset + length"). On return, *start will be the first - * offset that should be freed and l1blks is set to the number of level 1 - * indirect blocks found within the chunk. - */ -static int -get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) -{ - uint64_t blks; - uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); - /* bytes of data covered by a level-1 indirect block */ - uint64_t iblkrange = - dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); - - ASSERT3U(minimum, <=, *start); - - /* - * Check if we can free the entire range assuming that all of the - * L1 blocks in this range have data. If we can, we use this - * worst case value as an estimate so we can avoid having to look - * at the object's actual data. 
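Both dmu_prefetch() and get_next_chunk() above reduce a byte range to the number of fixed-size blocks it touches (last block minus first block, plus one). The same calculation in a standalone form, assuming a power-of-two block size and len > 0; names are illustrative:

#include <stdint.h>

/* Number of blocks of size (1 << blkshift) touched by [offset, offset + len). */
static uint64_t
blocks_spanned(uint64_t offset, uint64_t len, unsigned blkshift)
{
	uint64_t first = offset >> blkshift;
	uint64_t last = (offset + len - 1) >> blkshift;

	return (last - first + 1);
}

For example, offset 4000 with length 200 and 4 KiB blocks (blkshift 12) touches blocks 0 and 1, so two blocks would be prefetched.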
- */ - uint64_t total_l1blks = - (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / - iblkrange; - if (total_l1blks <= maxblks) { - *l1blks = total_l1blks; - *start = minimum; - return (0); - } - ASSERT(ISP2(iblkrange)); - - for (blks = 0; *start > minimum && blks < maxblks; blks++) { - int err; - - /* - * dnode_next_offset(BACKWARDS) will find an allocated L1 - * indirect block at or before the input offset. We must - * decrement *start so that it is at the end of the region - * to search. - */ - (*start)--; - - err = dnode_next_offset(dn, - DNODE_FIND_BACKWARDS, start, 2, 1, 0); - - /* if there are no indirect blocks before start, we are done */ - if (err == ESRCH) { - *start = minimum; - break; - } else if (err != 0) { - *l1blks = blks; - return (err); - } - - /* set start to the beginning of this L1 indirect */ - *start = P2ALIGN(*start, iblkrange); - } - if (*start < minimum) - *start = minimum; - *l1blks = blks; - - return (0); -} - -/* - * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, - * otherwise return false. - * Used below in dmu_free_long_range_impl() to enable abort when unmounting - */ -/*ARGSUSED*/ -static boolean_t -dmu_objset_zfs_unmounting(objset_t *os) -{ -#ifdef _KERNEL - if (dmu_objset_type(os) == DMU_OST_ZFS) - return (zfs_get_vfs_flag_unmounted(os)); -#endif - return (B_FALSE); -} - -static int -dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, - uint64_t length) -{ - uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; - int err; - uint64_t dirty_frees_threshold; - dsl_pool_t *dp = dmu_objset_pool(os); - - if (offset >= object_size) - return (0); - - if (zfs_per_txg_dirty_frees_percent <= 100) - dirty_frees_threshold = - zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; - else - dirty_frees_threshold = zfs_dirty_data_max / 20; - - if (length == DMU_OBJECT_END || offset + length > object_size) - length = object_size - offset; - - while (length != 0) { - uint64_t chunk_end, chunk_begin, chunk_len; - uint64_t l1blks; - dmu_tx_t *tx; - - if (dmu_objset_zfs_unmounting(dn->dn_objset)) - return (SET_ERROR(EINTR)); - - chunk_end = chunk_begin = offset + length; - - /* move chunk_begin backwards to the beginning of this chunk */ - err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); - if (err) - return (err); - ASSERT3U(chunk_begin, >=, offset); - ASSERT3U(chunk_begin, <=, chunk_end); - - chunk_len = chunk_end - chunk_begin; - - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); - - /* - * Mark this transaction as typically resulting in a net - * reduction in space used. - */ - dmu_tx_mark_netfree(tx); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - - uint64_t txg = dmu_tx_get_txg(tx); - - mutex_enter(&dp->dp_lock); - uint64_t long_free_dirty = - dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; - mutex_exit(&dp->dp_lock); - - /* - * To avoid filling up a TXG with just frees, wait for - * the next TXG to open before freeing more chunks if - * we have reached the threshold of frees. - */ - if (dirty_frees_threshold != 0 && - long_free_dirty >= dirty_frees_threshold) { - dmu_tx_commit(tx); - txg_wait_open(dp, 0); - continue; - } - - /* - * In order to prevent unnecessary write throttling, for each - * TXG, we track the cumulative size of L1 blocks being dirtied - * in dnode_free_range() below. 
We compare this number to a - * tunable threshold, past which we prevent new L1 dirty freeing - * blocks from being added into the open TXG. See - * dmu_free_long_range_impl() for details. The threshold - * prevents write throttle activation due to dirty freeing L1 - * blocks taking up a large percentage of zfs_dirty_data_max. - */ - mutex_enter(&dp->dp_lock); - dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += - l1blks << dn->dn_indblkshift; - mutex_exit(&dp->dp_lock); - DTRACE_PROBE3(free__long__range, - uint64_t, long_free_dirty, uint64_t, chunk_len, - uint64_t, txg); - dnode_free_range(dn, chunk_begin, chunk_len, tx); - dmu_tx_commit(tx); - - length -= chunk_len; - } - return (0); -} - -int -dmu_free_long_range(objset_t *os, uint64_t object, - uint64_t offset, uint64_t length) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) - return (err); - err = dmu_free_long_range_impl(os, dn, offset, length); - - /* - * It is important to zero out the maxblkid when freeing the entire - * file, so that (a) subsequent calls to dmu_free_long_range_impl() - * will take the fast path, and (b) dnode_reallocate() can verify - * that the entire file has been freed. - */ - if (err == 0 && offset == 0 && length == DMU_OBJECT_END) - dn->dn_maxblkid = 0; - - dnode_rele(dn, FTAG); - return (err); -} - -int -dmu_free_long_object(objset_t *os, uint64_t object) -{ - dmu_tx_t *tx; - int err; - - err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); - if (err != 0) - return (err); - - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, object); - dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); - dmu_tx_mark_netfree(tx); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err == 0) { - err = dmu_object_free(os, object, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - - return (err); -} - -int -dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, dmu_tx_t *tx) -{ - dnode_t *dn; - int err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - ASSERT(offset < UINT64_MAX); - ASSERT(size == -1ULL || size <= UINT64_MAX - offset); - dnode_free_range(dn, offset, size, tx); - dnode_rele(dn, FTAG); - return (0); -} - -static int -dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) -{ - dmu_buf_t **dbp; - int numbufs, err = 0; - - /* - * Deal with odd block sizes, where there can't be data past the first - * block. If we ever do the tail block optimization, we will need to - * handle that here as well. - */ - if (dn->dn_maxblkid == 0) { - int newsz = offset > dn->dn_datablksz ? 0 : - MIN(size, dn->dn_datablksz - offset); - bzero((char *)buf + newsz, size - newsz); - size = newsz; - } - - while (size > 0) { - uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int i; - - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. 
- */ - err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp, flags); - if (err) - break; - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - bcopy((char *)db->db_data + bufoff, buf, tocpy); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - } - return (err); -} - -int -dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) - return (err); - - err = dmu_read_impl(dn, offset, size, buf, flags); - dnode_rele(dn, FTAG); - return (err); -} - -int -dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, - uint32_t flags) -{ - return (dmu_read_impl(dn, offset, size, buf, flags)); -} - -static void -dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) -{ - int i; - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - bcopy(buf, (char *)db->db_data + bufoff, tocpy); - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; - } -} - -void -dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; - - if (size == 0) - return; - - VERIFY0(dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); - dmu_write_impl(dbp, numbufs, offset, size, buf, tx); - dmu_buf_rele_array(dbp, numbufs, FTAG); -} - -void -dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; - - if (size == 0) - return; - - VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); - dmu_write_impl(dbp, numbufs, offset, size, buf, tx); - dmu_buf_rele_array(dbp, numbufs, FTAG); -} - -static int -dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn, - uint64_t last_removal_txg, uint64_t offset) -{ - uint64_t l1blkid = dbuf_whichblock(dn, 1, offset); - int err = 0; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG); - ASSERT3P(dbuf, !=, NULL); - - /* - * If the block hasn't been written yet, this default will ensure - * we don't try to remap it. - */ - uint64_t birth = UINT64_MAX; - ASSERT3U(last_removal_txg, !=, UINT64_MAX); - if (dbuf->db_blkptr != NULL) - birth = dbuf->db_blkptr->blk_birth; - rw_exit(&dn->dn_struct_rwlock); - - /* - * If this L1 was already written after the last removal, then we've - * already tried to remap it. 
- */ - if (birth <= last_removal_txg && - dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 && - dbuf_can_remap(dbuf)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_remap_l1indirect(tx, dn->dn_object); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err == 0) { - (void) dbuf_dirty(dbuf, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - } - - dbuf_rele(dbuf, FTAG); - - delay(zfs_object_remap_one_indirect_delay_ticks); - - return (err); -} - -/* - * Remap all blockpointers in the object, if possible, so that they reference - * only concrete vdevs. - * - * To do this, iterate over the L0 blockpointers and remap any that reference - * an indirect vdev. Note that we only examine L0 blockpointers; since we - * cannot guarantee that we can remap all blockpointer anyways (due to split - * blocks), we do not want to make the code unnecessarily complicated to - * catch the unlikely case that there is an L1 block on an indirect vdev that - * contains no indirect blockpointers. - */ -int -dmu_object_remap_indirects(objset_t *os, uint64_t object, - uint64_t last_removal_txg) -{ - uint64_t offset, l1span; - int err; - dnode_t *dn; - - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) { - return (err); - } - - if (dn->dn_nlevels <= 1) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); - } - - /* - * If the dnode has no indirect blocks, we cannot dirty them. - * We still want to remap the blkptr(s) in the dnode if - * appropriate, so mark it as dirty. - */ - if (err == 0 && dnode_needs_remap(dn)) { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, dn->dn_object); - if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) { - dnode_setdirty(dn, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } - } - - dnode_rele(dn, FTAG); - return (err); - } - - offset = 0; - l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT + - dn->dn_datablkshift); - /* - * Find the next L1 indirect that is not a hole. 
- */ - while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); - break; - } - if ((err = dmu_object_remap_one_indirect(os, dn, - last_removal_txg, offset)) != 0) { - break; - } - offset += l1span; - } - - dnode_rele(dn, FTAG); - return (err); -} - -void -dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs, i; - - if (size == 0) - return; - - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); - - for (i = 0; i < numbufs; i++) { - dmu_buf_t *db = dbp[i]; - - dmu_buf_will_not_fill(db, tx); - } - dmu_buf_rele_array(dbp, numbufs, FTAG); -} - -void -dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, - void *data, uint8_t etype, uint8_t comp, int uncompressed_size, - int compressed_size, int byteorder, dmu_tx_t *tx) -{ - dmu_buf_t *db; - - ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); - ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - VERIFY0(dmu_buf_hold_noread(os, object, offset, - FTAG, &db)); - - dmu_buf_write_embedded(db, - data, (bp_embedded_type_t)etype, (enum zio_compress)comp, - uncompressed_size, compressed_size, byteorder, tx); - - dmu_buf_rele(db, FTAG); -} - -/* - * DMU support for xuio - */ -kstat_t *xuio_ksp = NULL; - -int -dmu_xuio_init(xuio_t *xuio, int nblk) -{ - dmu_xuio_t *priv; - uio_t *uio = &xuio->xu_uio; - - uio->uio_iovcnt = nblk; - uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); - - priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); - priv->cnt = nblk; - priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); - priv->iovp = uio->uio_iov; - XUIO_XUZC_PRIV(xuio) = priv; - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); - - return (0); -} - -void -dmu_xuio_fini(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int nblk = priv->cnt; - - kmem_free(priv->iovp, nblk * sizeof (iovec_t)); - kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); - kmem_free(priv, sizeof (dmu_xuio_t)); - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); -} - -/* - * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } - * and increase priv->next by 1. 
- */ -int -dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) -{ - struct iovec *iov; - uio_t *uio = &xuio->xu_uio; - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int i = priv->next++; - - ASSERT(i < priv->cnt); - ASSERT(off + n <= arc_buf_lsize(abuf)); - iov = uio->uio_iov + i; - iov->iov_base = (char *)abuf->b_data + off; - iov->iov_len = n; - priv->bufs[i] = abuf; - return (0); -} - -int -dmu_xuio_cnt(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - return (priv->cnt); -} - -arc_buf_t * -dmu_xuio_arcbuf(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - return (priv->bufs[i]); -} - -void -dmu_xuio_clear(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - priv->bufs[i] = NULL; -} - -static void -xuio_stat_init(void) -{ - xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (xuio_ksp != NULL) { - xuio_ksp->ks_data = &xuio_stats; - kstat_install(xuio_ksp); - } -} - -static void -xuio_stat_fini(void) -{ - if (xuio_ksp != NULL) { - kstat_delete(xuio_ksp); - xuio_ksp = NULL; - } -} - -void -xuio_stat_wbuf_copied(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_copied); -} - -void -xuio_stat_wbuf_nocopy(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); -} - -#ifdef _KERNEL -int -dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) -{ - dmu_buf_t **dbp; - int numbufs, i, err; - xuio_t *xuio = NULL; - - /* - * NB: we could do this block-at-a-time, but it's nice - * to be reading in parallel. - */ - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - TRUE, FTAG, &numbufs, &dbp, 0); - if (err) - return (err); - -#ifdef UIO_XUIO - if (uio->uio_extflg == UIO_XUIO) - xuio = (xuio_t *)uio; -#endif - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - if (xuio) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); - if (!err) { - uio->uio_resid -= tocpy; - uio->uio_loffset += tocpy; - } - - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else { -#ifdef illumos - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); -#else - err = vn_io_fault_uiomove((char *)db->db_data + bufoff, - tocpy, uio); -#endif - } - if (err) - break; - - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - - return (err); -} - -/* - * Read 'size' bytes into the uio buffer. - * From object zdb->db_object. - * Starting at offset uio->uio_loffset. - * - * If the caller already has a dbuf in the target object - * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), - * because we don't have to find the dnode_t for the object. - */ -int -dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_read_uio_dnode(dn, uio, size); - DB_DNODE_EXIT(db); - - return (err); -} - -/* - * Read 'size' bytes into the uio buffer. - * From the specified object - * Starting at offset uio->uio_loffset. 
- */ -int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) -{ - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_read_uio_dnode(dn, uio, size); - - dnode_rele(dn, FTAG); - - return (err); -} - -int -dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; - int err = 0; - int i; - - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - -#ifdef illumos - /* - * XXX uiomove could block forever (eg. nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that uiomove won't - * block. - */ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); -#else - err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, - uio); -#endif - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - if (err) - break; - - size -= tocpy; - } - - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); -} - -/* - * Write 'size' bytes from the uio buffer. - * To object zdb->db_object. - * Starting at offset uio->uio_loffset. - * - * If the caller already has a dbuf in the target object - * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), - * because we don't have to find the dnode_t for the object. - */ -int -dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_write_uio_dnode(dn, uio, size, tx); - DB_DNODE_EXIT(db); - - return (err); -} - -/* - * Write 'size' bytes from the uio buffer. - * To the specified object. - * Starting at offset uio->uio_loffset. 
- */ -int -dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_write_uio_dnode(dn, uio, size, tx); - - dnode_rele(dn, FTAG); - - return (err); -} - -#ifdef illumos -int -dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - page_t *pp, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs, i; - int err; - - if (size == 0) - return (0); - - err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy, copied, thiscpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - caddr_t va; - - ASSERT(size > 0); - ASSERT3U(db->db_size, >=, PAGESIZE); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - for (copied = 0; copied < tocpy; copied += PAGESIZE) { - ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); - thiscpy = MIN(PAGESIZE, tocpy - copied); - va = zfs_map_page(pp, S_READ); - bcopy(va, (char *)db->db_data + bufoff, thiscpy); - zfs_unmap_page(pp, va); - pp = pp->p_next; - bufoff += PAGESIZE; - } - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); -} - -#else /* !illumos */ - -int -dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - vm_page_t *ma, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - struct sf_buf *sf; - int numbufs, i; - int err; - - if (size == 0) - return (0); - - err = dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - int tocpy, copied, thiscpy; - int bufoff; - dmu_buf_t *db = dbp[i]; - caddr_t va; - - ASSERT(size > 0); - ASSERT3U(db->db_size, >=, PAGESIZE); - - bufoff = offset - db->db_offset; - tocpy = (int)MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - for (copied = 0; copied < tocpy; copied += PAGESIZE) { - ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); - thiscpy = MIN(PAGESIZE, tocpy - copied); - va = zfs_map_page(*ma, &sf); - bcopy(va, (char *)db->db_data + bufoff, thiscpy); - zfs_unmap_page(sf); - ma += 1; - bufoff += PAGESIZE; - } - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - offset += tocpy; - size -= tocpy; - } - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); -} - -int -dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, - int *rbehind, int *rahead, int last_size) -{ - struct sf_buf *sf; - vm_object_t vmobj; - vm_page_t m; - dmu_buf_t **dbp; - dmu_buf_t *db; - caddr_t va; - int numbufs, i; - int bufoff, pgoff, tocpy; - int mi, di; - int err; - - ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); - ASSERT(last_size <= PAGE_SIZE); - - err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), - IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); - if (err != 0) - return (err); - -#ifdef DEBUG - IMPLY(last_size < PAGE_SIZE, *rahead == 0); - if (dbp[0]->db_offset != 0 || numbufs > 1) { - for (i = 0; i < numbufs; i++) { - 
ASSERT(ISP2(dbp[i]->db_size)); - ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0); - ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); - } - } -#endif - - vmobj = ma[0]->object; - - db = dbp[0]; - for (i = 0; i < *rbehind; i++) { - m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i, - VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | - VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); - if (m == NULL) - break; - if (!vm_page_none_valid(m)) { - ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); - vm_page_sunbusy(m); - break; - } - ASSERT(m->dirty == 0); - ASSERT(!pmap_page_is_write_mapped(m)); - - ASSERT(db->db_size > PAGE_SIZE); - bufoff = IDX_TO_OFF(m->pindex) % db->db_size; - va = zfs_map_page(m, &sf); - bcopy((char *)db->db_data + bufoff, va, PAGESIZE); - zfs_unmap_page(sf); - vm_page_valid(m); - if ((m->busy_lock & VPB_BIT_WAITERS) != 0) - vm_page_activate(m); - else - vm_page_deactivate(m); - vm_page_sunbusy(m); - } - *rbehind = i; - - bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; - pgoff = 0; - for (mi = 0, di = 0; mi < count && di < numbufs; ) { - if (pgoff == 0) { - m = ma[mi]; - if (m != bogus_page) { - vm_page_assert_xbusied(m); - ASSERT(vm_page_none_valid(m)); - ASSERT(m->dirty == 0); - ASSERT(!pmap_page_is_mapped(m)); - va = zfs_map_page(m, &sf); - } - } - if (bufoff == 0) - db = dbp[di]; - - if (m != bogus_page) { - ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, - db->db_offset + bufoff); - } - - /* - * We do not need to clamp the copy size by the file - * size as the last block is zero-filled beyond the - * end of file anyway. - */ - tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); - if (m != bogus_page) - bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy); - - pgoff += tocpy; - ASSERT(pgoff <= PAGESIZE); - if (pgoff == PAGESIZE) { - if (m != bogus_page) { - zfs_unmap_page(sf); - vm_page_valid(m); - } - ASSERT(mi < count); - mi++; - pgoff = 0; - } - - bufoff += tocpy; - ASSERT(bufoff <= db->db_size); - if (bufoff == db->db_size) { - ASSERT(di < numbufs); - di++; - bufoff = 0; - } - } - -#ifdef DEBUG - /* - * Three possibilities: - * - last requested page ends at a buffer boundary and , thus, - * all pages and buffers have been iterated; - * - all requested pages are filled, but the last buffer - * has not been exhausted; - * the read-ahead is possible only in this case; - * - all buffers have been read, but the last page has not been - * fully filled; - * this is only possible if the file has only a single buffer - * with a size that is not a multiple of the page size. 
- */ - if (mi == count) { - ASSERT(di >= numbufs - 1); - IMPLY(*rahead != 0, di == numbufs - 1); - IMPLY(*rahead != 0, bufoff != 0); - ASSERT(pgoff == 0); - } - if (di == numbufs) { - ASSERT(mi >= count - 1); - ASSERT(*rahead == 0); - IMPLY(pgoff == 0, mi == count); - if (pgoff != 0) { - ASSERT(mi == count - 1); - ASSERT((dbp[0]->db_size & PAGE_MASK) != 0); - } - } -#endif - if (pgoff != 0) { - ASSERT(m != bogus_page); - bzero(va + pgoff, PAGESIZE - pgoff); - zfs_unmap_page(sf); - vm_page_valid(m); - } - - for (i = 0; i < *rahead; i++) { - m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i, - VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | - VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); - if (m == NULL) - break; - if (!vm_page_none_valid(m)) { - ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); - vm_page_sunbusy(m); - break; - } - ASSERT(m->dirty == 0); - ASSERT(!pmap_page_is_write_mapped(m)); - - ASSERT(db->db_size > PAGE_SIZE); - bufoff = IDX_TO_OFF(m->pindex) % db->db_size; - tocpy = MIN(db->db_size - bufoff, PAGESIZE); - va = zfs_map_page(m, &sf); - bcopy((char *)db->db_data + bufoff, va, tocpy); - if (tocpy < PAGESIZE) { - ASSERT(i == *rahead - 1); - ASSERT((db->db_size & PAGE_MASK) != 0); - bzero(va + tocpy, PAGESIZE - tocpy); - } - zfs_unmap_page(sf); - vm_page_valid(m); - if ((m->busy_lock & VPB_BIT_WAITERS) != 0) - vm_page_activate(m); - else - vm_page_deactivate(m); - vm_page_sunbusy(m); - } - *rahead = i; - - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (0); -} -#endif /* illumos */ -#endif /* _KERNEL */ - -/* - * Allocate a loaned anonymous arc buffer. - */ -arc_buf_t * -dmu_request_arcbuf(dmu_buf_t *handle, int size) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - - return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); -} - -/* - * Free a loaned arc buffer. - */ -void -dmu_return_arcbuf(arc_buf_t *buf) -{ - arc_return_buf(buf, FTAG); - arc_buf_destroy(buf, FTAG); -} - -/* - * When possible directly assign passed loaned arc buffer to a dbuf. - * If this is not possible copy the contents of passed arc buf via - * dmu_write(). - */ -void -dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - uint32_t blksz = (uint32_t)arc_buf_lsize(buf); - uint64_t blkid; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, 0, offset); - VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); - rw_exit(&dn->dn_struct_rwlock); - - /* - * We can only assign if the offset is aligned, the arc buf is the - * same size as the dbuf, and the dbuf is not metadata. 
- */ - if (offset == db->db.db_offset && blksz == db->db.db_size) { -#ifdef _KERNEL - curthread->td_ru.ru_oublock++; -#ifdef RACCT - if (racct_enable) { - PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_WRITEBPS, blksz); - racct_add_force(curproc, RACCT_WRITEIOPS, 1); - PROC_UNLOCK(curproc); - } -#endif /* RACCT */ -#endif /* _KERNEL */ - dbuf_assign_arcbuf(db, buf, tx); - dbuf_rele(db, FTAG); - } else { - objset_t *os; - uint64_t object; - - /* compressed bufs must always be assignable to their dbuf */ - ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); - ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); - - os = dn->dn_objset; - object = dn->dn_object; - - dbuf_rele(db, FTAG); - dmu_write(os, object, offset, blksz, buf->b_data, tx); - dmu_return_arcbuf(buf); - XUIOSTAT_BUMP(xuiostat_wbuf_copied); - } -} - -void -dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; - - DB_DNODE_ENTER(dbuf); - dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); - DB_DNODE_EXIT(dbuf); -} - -typedef struct { - dbuf_dirty_record_t *dsa_dr; - dmu_sync_cb_t *dsa_done; - zgd_t *dsa_zgd; - dmu_tx_t *dsa_tx; -} dmu_sync_arg_t; - -/* ARGSUSED */ -static void -dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) -{ - dmu_sync_arg_t *dsa = varg; - dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - blkptr_t *bp = zio->io_bp; - - if (zio->io_error == 0) { - if (BP_IS_HOLE(bp)) { - /* - * A block of zeros may compress to a hole, but the - * block size still needs to be known for replay. - */ - BP_SET_LSIZE(bp, db->db_size); - } else if (!BP_IS_EMBEDDED(bp)) { - ASSERT(BP_GET_LEVEL(bp) == 0); - bp->blk_fill = 1; - } - } -} - -static void -dmu_sync_late_arrival_ready(zio_t *zio) -{ - dmu_sync_ready(zio, NULL, zio->io_private); -} - -/* ARGSUSED */ -static void -dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) -{ - dmu_sync_arg_t *dsa = varg; - dbuf_dirty_record_t *dr = dsa->dsa_dr; - dmu_buf_impl_t *db = dr->dr_dbuf; - zgd_t *zgd = dsa->dsa_zgd; - - /* - * Record the vdev(s) backing this blkptr so they can be flushed after - * the writes for the lwb have completed. - */ - if (zio->io_error == 0) { - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); - } - - mutex_enter(&db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); - if (zio->io_error == 0) { - dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); - if (dr->dt.dl.dr_nopwrite) { - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - uint8_t chksum = BP_GET_CHECKSUM(bp_orig); - - ASSERT(BP_EQUAL(bp, bp_orig)); - VERIFY(BP_EQUAL(bp, db->db_blkptr)); - ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); - ASSERT(zio_checksum_table[chksum].ci_flags & - ZCHECKSUM_FLAG_NOPWRITE); - } - dr->dt.dl.dr_overridden_by = *zio->io_bp; - dr->dt.dl.dr_override_state = DR_OVERRIDDEN; - dr->dt.dl.dr_copies = zio->io_prop.zp_copies; - - /* - * Old style holes are filled with all zeros, whereas - * new-style holes maintain their lsize, type, level, - * and birth time (see zio_write_compress). While we - * need to reset the BP_SET_LSIZE() call that happened - * in dmu_sync_ready for old style holes, we do *not* - * want to wipe out the information contained in new - * style holes. Thus, only zero out the block pointer if - * it's an old style hole. 
- */ - if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && - dr->dt.dl.dr_overridden_by.blk_birth == 0) - BP_ZERO(&dr->dt.dl.dr_overridden_by); - } else { - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - } - cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); - - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); - - kmem_free(dsa, sizeof (*dsa)); -} - -static void -dmu_sync_late_arrival_done(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - dmu_sync_arg_t *dsa = zio->io_private; - blkptr_t *bp_orig = &zio->io_bp_orig; - zgd_t *zgd = dsa->dsa_zgd; - - if (zio->io_error == 0) { - /* - * Record the vdev(s) backing this blkptr so they can be - * flushed after the writes for the lwb have completed. - */ - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); - - if (!BP_IS_HOLE(bp)) { - ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); - ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); - ASSERT(zio->io_bp->blk_birth == zio->io_txg); - ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); - zio_free(zio->io_spa, zio->io_txg, zio->io_bp); - } - } - - dmu_tx_commit(dsa->dsa_tx); - - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); - - abd_put(zio->io_abd); - kmem_free(dsa, sizeof (*dsa)); -} - -static int -dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - zio_prop_t *zp, zbookmark_phys_t *zb) -{ - dmu_sync_arg_t *dsa; - dmu_tx_t *tx; - - tx = dmu_tx_create(os); - dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, TXG_WAIT) != 0) { - dmu_tx_abort(tx); - /* Make zl_get_data do txg_waited_synced() */ - return (SET_ERROR(EIO)); - } - - /* - * In order to prevent the zgd's lwb from being free'd prior to - * dmu_sync_late_arrival_done() being called, we have to ensure - * the lwb's "max txg" takes this tx's txg into account. - */ - zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); - - dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - dsa->dsa_dr = NULL; - dsa->dsa_done = done; - dsa->dsa_zgd = zgd; - dsa->dsa_tx = tx; - - /* - * Since we are currently syncing this txg, it's nontrivial to - * determine what BP to nopwrite against, so we disable nopwrite. - * - * When syncing, the db_blkptr is initially the BP of the previous - * txg. We can not nopwrite against it because it will be changed - * (this is similar to the non-late-arrival case where the dbuf is - * dirty in a future txg). - * - * Then dbuf_write_ready() sets bp_blkptr to the location we will write. - * We can not nopwrite against it because although the BP will not - * (typically) be changed, the data has not yet been persisted to this - * location. - * - * Finally, when dbuf_write_done() is called, it is theoretically - * possible to always nopwrite, because the data that was written in - * this txg is the same data that we are trying to write. However we - * would need to check that this dbuf is not dirty in any future - * txg's (as we do in the normal dmu_sync() path). For simplicity, we - * don't nopwrite in this case. - */ - zp->zp_nopwrite = B_FALSE; - - zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, - abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), - zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, - dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, - dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); - - return (0); -} - -/* - * Intent log support: sync the block associated with db to disk. - * N.B. and XXX: the caller is responsible for making sure that the - * data isn't changing while dmu_sync() is writing it. 
- * - * Return values: - * - * EEXIST: this txg has already been synced, so there's nothing to do. - * The caller should not log the write. - * - * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. - * The caller should not log the write. - * - * EALREADY: this block is already in the process of being synced. - * The caller should track its progress (somehow). - * - * EIO: could not do the I/O. - * The caller should do a txg_wait_synced(). - * - * 0: the I/O has been initiated. - * The caller should log this blkptr in the done callback. - * It is possible that the I/O will fail, in which case - * the error will be reported to the done callback and - * propagated to pio from zio_done(). - */ -int -dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; - objset_t *os = db->db_objset; - dsl_dataset_t *ds = os->os_dsl_dataset; - dbuf_dirty_record_t *dr; - dmu_sync_arg_t *dsa; - zbookmark_phys_t zb; - zio_prop_t zp; - dnode_t *dn; - - ASSERT(pio != NULL); - ASSERT(txg != 0); - - SET_BOOKMARK(&zb, ds->ds_object, - db->db.db_object, db->db_level, db->db_blkid); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); - DB_DNODE_EXIT(db); - - /* - * If we're frozen (running ziltest), we always need to generate a bp. - */ - if (txg > spa_freeze_txg(os->os_spa)) - return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); - - /* - * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() - * and us. If we determine that this txg is not yet syncing, - * but it begins to sync a moment later, that's OK because the - * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. - */ - mutex_enter(&db->db_mtx); - - if (txg <= spa_last_synced_txg(os->os_spa)) { - /* - * This txg has already synced. There's nothing to do. - */ - mutex_exit(&db->db_mtx); - return (SET_ERROR(EEXIST)); - } - - if (txg <= spa_syncing_txg(os->os_spa)) { - /* - * This txg is currently syncing, so we can't mess with - * the dirty record anymore; just write a new log block. - */ - mutex_exit(&db->db_mtx); - return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); - } - - dr = db->db_last_dirty; - while (dr && dr->dr_txg != txg) - dr = dr->dr_next; - - if (dr == NULL) { - /* - * There's no dr for this dbuf, so it must have been freed. - * There's no need to log writes to freed blocks, so we're done. - */ - mutex_exit(&db->db_mtx); - return (SET_ERROR(ENOENT)); - } - - ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); - - if (db->db_blkptr != NULL) { - /* - * We need to fill in zgd_bp with the current blkptr so that - * the nopwrite code can check if we're writing the same - * data that's already on disk. We can only nopwrite if we - * are sure that after making the copy, db_blkptr will not - * change until our i/o completes. We ensure this by - * holding the db_mtx, and only allowing nopwrite if the - * block is not already dirty (see below). This is verified - * by dmu_sync_done(), which VERIFYs that the db_blkptr has - * not changed. - */ - *zgd->zgd_bp = *db->db_blkptr; - } - - /* - * Assume the on-disk data is X, the current syncing data (in - * txg - 1) is Y, and the current in-memory data is Z (currently - * in dmu_sync). - * - * We usually want to perform a nopwrite if X and Z are the - * same. However, if Y is different (i.e. 
the BP is going to - * change before this write takes effect), then a nopwrite will - * be incorrect - we would override with X, which could have - * been freed when Y was written. - * - * (Note that this is not a concern when we are nop-writing from - * syncing context, because X and Y must be identical, because - * all previous txgs have been synced.) - * - * Therefore, we disable nopwrite if the current BP could change - * before this TXG. There are two ways it could change: by - * being dirty (dr_next is non-NULL), or by being freed - * (dnode_block_freed()). This behavior is verified by - * zio_done(), which VERIFYs that the override BP is identical - * to the on-disk BP. - */ - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) - zp.zp_nopwrite = B_FALSE; - DB_DNODE_EXIT(db); - - ASSERT(dr->dr_txg == txg); - if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || - dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * We have already issued a sync write for this buffer, - * or this buffer has already been synced. It could not - * have been dirtied since, or we would have cleared the state. - */ - mutex_exit(&db->db_mtx); - return (SET_ERROR(EALREADY)); - } - - ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; - mutex_exit(&db->db_mtx); - - dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - dsa->dsa_dr = dr; - dsa->dsa_done = done; - dsa->dsa_zgd = zgd; - dsa->dsa_tx = NULL; - - zio_nowait(arc_write(pio, os->os_spa, txg, - zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), - &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); - - return (0); -} - -int -dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, - dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - err = dnode_set_blksz(dn, size, ibs, tx); - dnode_rele(dn, FTAG); - return (err); -} - -void -dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, - dmu_tx_t *tx) -{ - dnode_t *dn; - - /* - * Send streams include each object's checksum function. This - * check ensures that the receiving system can understand the - * checksum function transmitted. - */ - ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); - - VERIFY0(dnode_hold(os, object, FTAG, &dn)); - ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); - dn->dn_checksum = checksum; - dnode_setdirty(dn, tx); - dnode_rele(dn, FTAG); -} - -void -dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, - dmu_tx_t *tx) -{ - dnode_t *dn; - - /* - * Send streams include each object's compression function. This - * check ensures that the receiving system can understand the - * compression function transmitted. - */ - ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); - - VERIFY0(dnode_hold(os, object, FTAG, &dn)); - dn->dn_compress = compress; - dnode_setdirty(dn, tx); - dnode_rele(dn, FTAG); -} - -int zfs_mdcomp_disable = 0; -SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, - &zfs_mdcomp_disable, 0, "Disable metadata compression"); - -/* - * When the "redundant_metadata" property is set to "most", only indirect - * blocks of this level and higher will have an additional ditto block. - */ -int zfs_redundant_metadata_most_ditto_level = 2; - -void -dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) -{ - dmu_object_type_t type = dn ? 
dn->dn_type : DMU_OT_OBJSET; - boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || - (wp & WP_SPILL)); - enum zio_checksum checksum = os->os_checksum; - enum zio_compress compress = os->os_compress; - enum zio_checksum dedup_checksum = os->os_dedup_checksum; - boolean_t dedup = B_FALSE; - boolean_t nopwrite = B_FALSE; - boolean_t dedup_verify = os->os_dedup_verify; - int copies = os->os_copies; - - /* - * We maintain different write policies for each of the following - * types of data: - * 1. metadata - * 2. preallocated blocks (i.e. level-0 blocks of a dump device) - * 3. all other level 0 blocks - */ - if (ismd) { - if (zfs_mdcomp_disable) { - compress = ZIO_COMPRESS_EMPTY; - } else { - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - compress = zio_compress_select(os->os_spa, - ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); - } - - /* - * Metadata always gets checksummed. If the data - * checksum is multi-bit correctable, and it's not a - * ZBT-style checksum, then it's suitable for metadata - * as well. Otherwise, the metadata checksum defaults - * to fletcher4. - */ - if (!(zio_checksum_table[checksum].ci_flags & - ZCHECKSUM_FLAG_METADATA) || - (zio_checksum_table[checksum].ci_flags & - ZCHECKSUM_FLAG_EMBEDDED)) - checksum = ZIO_CHECKSUM_FLETCHER_4; - - if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || - (os->os_redundant_metadata == - ZFS_REDUNDANT_METADATA_MOST && - (level >= zfs_redundant_metadata_most_ditto_level || - DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) - copies++; - } else if (wp & WP_NOFILL) { - ASSERT(level == 0); - - /* - * If we're writing preallocated blocks, we aren't actually - * writing them so don't set any policy properties. These - * blocks are currently only used by an external subsystem - * outside of zfs (i.e. dump) and not written by the zio - * pipeline. - */ - compress = ZIO_COMPRESS_OFF; - checksum = ZIO_CHECKSUM_NOPARITY; - } else { - compress = zio_compress_select(os->os_spa, dn->dn_compress, - compress); - - checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? - zio_checksum_select(dn->dn_checksum, checksum) : - dedup_checksum; - - /* - * Determine dedup setting. If we are in dmu_sync(), - * we won't actually dedup now because that's all - * done in syncing context; but we do want to use the - * dedup checkum. If the checksum is not strong - * enough to ensure unique signatures, force - * dedup_verify. - */ - if (dedup_checksum != ZIO_CHECKSUM_OFF) { - dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; - if (!(zio_checksum_table[checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP)) - dedup_verify = B_TRUE; - } - - /* - * Enable nopwrite if we have secure enough checksum - * algorithm (see comment in zio_nop_write) and - * compression is enabled. We don't enable nopwrite if - * dedup is enabled as the two features are mutually - * exclusive. - */ - nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & - ZCHECKSUM_FLAG_NOPWRITE) && - compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); - } - - zp->zp_checksum = checksum; - zp->zp_compress = compress; - ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); - - zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; - zp->zp_level = level; - zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); - zp->zp_dedup = dedup; - zp->zp_dedup_verify = dedup && dedup_verify; - zp->zp_nopwrite = nopwrite; - zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? 
- os->os_zpl_special_smallblock : 0; -} - -int -dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) -{ - dnode_t *dn; - int err; - - /* - * Sync any current changes before - * we go trundling through the block pointers. - */ - err = dmu_object_wait_synced(os, object); - if (err) { - return (err); - } - - err = dnode_hold(os, object, FTAG, &dn); - if (err) { - return (err); - } - - err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); - dnode_rele(dn, FTAG); - - return (err); -} - -/* - * Given the ZFS object, if it contains any dirty nodes - * this function flushes all dirty blocks to disk. This - * ensures the DMU object info is updated. A more efficient - * future version might just find the TXG with the maximum - * ID and wait for that to be synced. - */ -int -dmu_object_wait_synced(objset_t *os, uint64_t object) -{ - dnode_t *dn; - int error, i; - - error = dnode_hold(os, object, FTAG, &dn); - if (error) { - return (error); - } - - for (i = 0; i < TXG_SIZE; i++) { - if (list_link_active(&dn->dn_dirty_link[i])) { - break; - } - } - dnode_rele(dn, FTAG); - if (i != TXG_SIZE) { - txg_wait_synced(dmu_objset_pool(os), 0); - } - - return (0); -} - -void -__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) -{ - dnode_phys_t *dnp = dn->dn_phys; - - doi->doi_data_block_size = dn->dn_datablksz; - doi->doi_metadata_block_size = dn->dn_indblkshift ? - 1ULL << dn->dn_indblkshift : 0; - doi->doi_type = dn->dn_type; - doi->doi_bonus_type = dn->dn_bonustype; - doi->doi_bonus_size = dn->dn_bonuslen; - doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; - doi->doi_indirection = dn->dn_nlevels; - doi->doi_checksum = dn->dn_checksum; - doi->doi_compress = dn->dn_compress; - doi->doi_nblkptr = dn->dn_nblkptr; - doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; - doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; - doi->doi_fill_count = 0; - for (int i = 0; i < dnp->dn_nblkptr; i++) - doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); -} - -void -dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) -{ - rw_enter(&dn->dn_struct_rwlock, RW_READER); - mutex_enter(&dn->dn_mtx); - - __dmu_object_info_from_dnode(dn, doi); - - mutex_exit(&dn->dn_mtx); - rw_exit(&dn->dn_struct_rwlock); -} - -/* - * Get information on a DMU object. - * If doi is NULL, just indicates whether the object exists. - */ -int -dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) -{ - dnode_t *dn; - int err = dnode_hold(os, object, FTAG, &dn); - - if (err) - return (err); - - if (doi != NULL) - dmu_object_info_from_dnode(dn, doi); - - dnode_rele(dn, FTAG); - return (0); -} - -/* - * As above, but faster; can be used when you have a held dbuf in hand. - */ -void -dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - DB_DNODE_ENTER(db); - dmu_object_info_from_dnode(DB_DNODE(db), doi); - DB_DNODE_EXIT(db); -} - -/* - * Faster still when you only care about the size. - * This is specifically optimized for zfs_getattr(). 
- */ -void -dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, - u_longlong_t *nblk512) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - - *blksize = dn->dn_datablksz; - /* add in number of slots used for the dnode itself */ - *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> - SPA_MINBLOCKSHIFT) + dn->dn_num_slots; - DB_DNODE_EXIT(db); -} - -void -dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - *dnsize = dn->dn_num_slots << DNODE_SHIFT; - DB_DNODE_EXIT(db); -} - -void -byteswap_uint64_array(void *vbuf, size_t size) -{ - uint64_t *buf = vbuf; - size_t count = size >> 3; - int i; - - ASSERT((size & 7) == 0); - - for (i = 0; i < count; i++) - buf[i] = BSWAP_64(buf[i]); -} - -void -byteswap_uint32_array(void *vbuf, size_t size) -{ - uint32_t *buf = vbuf; - size_t count = size >> 2; - int i; - - ASSERT((size & 3) == 0); - - for (i = 0; i < count; i++) - buf[i] = BSWAP_32(buf[i]); -} - -void -byteswap_uint16_array(void *vbuf, size_t size) -{ - uint16_t *buf = vbuf; - size_t count = size >> 1; - int i; - - ASSERT((size & 1) == 0); - - for (i = 0; i < count; i++) - buf[i] = BSWAP_16(buf[i]); -} - -/* ARGSUSED */ -void -byteswap_uint8_array(void *vbuf, size_t size) -{ -} - -void -dmu_init(void) -{ - abd_init(); - zfs_dbgmsg_init(); - sa_cache_init(); - xuio_stat_init(); - dmu_objset_init(); - dnode_init(); - zfetch_init(); - zio_compress_init(); - l2arc_init(); - arc_init(); - dbuf_init(); -} - -void -dmu_fini(void) -{ - arc_fini(); /* arc depends on l2arc, so arc must go first */ - l2arc_fini(); - zfetch_fini(); - zio_compress_fini(); - dbuf_fini(); - dnode_fini(); - dmu_objset_fini(); - xuio_stat_fini(); - sa_cache_fini(); - zfs_dbgmsg_fini(); - abd_fini(); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct diffarg { - struct file *da_fp; /* file to which we are reporting */ - offset_t *da_offp; - int da_err; /* error that stopped diff search */ - dmu_diff_record_t da_ddr; - kthread_t *da_td; -}; - -static int -write_bytes(struct diffarg *da) -{ - struct uio auio; - struct iovec aiov; - - aiov.iov_base = (caddr_t)&da->da_ddr; - aiov.iov_len = sizeof (da->da_ddr); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_resid = aiov.iov_len; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_offset = (off_t)-1; - auio.uio_td = da->da_td; -#ifdef _KERNEL - if (da->da_fp->f_type == DTYPE_VNODE) - bwillwrite(); - return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td)); -#else - fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); - return (EOPNOTSUPP); -#endif -} - -static int -write_record(struct diffarg *da) -{ - - if (da->da_ddr.ddr_type == DDR_NONE) { - da->da_err = 0; - return (0); - } - - da->da_err = write_bytes(da); - *da->da_offp += sizeof (da->da_ddr); - return (da->da_err); -} - -static int -report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) -{ - ASSERT(first <= last); - if (da->da_ddr.ddr_type != DDR_FREE || - first != da->da_ddr.ddr_last + 1) { - if (write_record(da) != 0) - return (da->da_err); - da->da_ddr.ddr_type = DDR_FREE; - da->da_ddr.ddr_first = first; - da->da_ddr.ddr_last = last; - return (0); - } - da->da_ddr.ddr_last = last; - return (0); -} - -static int -report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) -{ - ASSERT(dnp != NULL); - if (dnp->dn_type == DMU_OT_NONE) - return (report_free_dnode_range(da, object, object)); - - if (da->da_ddr.ddr_type != DDR_INUSE || - object != da->da_ddr.ddr_last + 1) { - if (write_record(da) != 0) - return (da->da_err); - da->da_ddr.ddr_type = DDR_INUSE; - da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; - return (0); - } - da->da_ddr.ddr_last = object; - return (0); -} - -#define DBP_SPAN(dnp, level) \ - (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) - -/* ARGSUSED */ -static int -diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct diffarg *da = arg; - int err = 0; - - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (SET_ERROR(EINTR)); - - if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) - return (0); - - if (BP_IS_HOLE(bp)) { - uint64_t span = DBP_SPAN(dnp, zb->zb_level); - uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - - err = report_free_dnode_range(da, dnobj, - dnobj + (span >> DNODE_SHIFT) - 1); - if (err) - return (err); - } else if (zb->zb_level == 0) { - dnode_phys_t *blk; - arc_buf_t *abuf; - arc_flags_t aflags = ARC_FLAG_WAIT; - int blksz = BP_GET_LSIZE(bp); - int i; - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - blk = abuf->b_data; - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = (zb->zb_blkid << - (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; - err = report_dnode(da, dnobj, blk+i); - if (err) - break; - } - arc_buf_destroy(abuf, &abuf); - if (err) - return (err); - /* Don't care about the data blocks */ - return (TRAVERSE_VISIT_NO_CHILDREN); - } - return (0); -} - -int -dmu_diff(const 
char *tosnap_name, const char *fromsnap_name, -#ifdef illumos - struct vnode *vp, offset_t *offp) -#else - struct file *fp, offset_t *offp) -#endif -{ - struct diffarg da; - dsl_dataset_t *fromsnap; - dsl_dataset_t *tosnap; - dsl_pool_t *dp; - int error; - uint64_t fromtxg; - - if (strchr(tosnap_name, '@') == NULL || - strchr(fromsnap_name, '@') == NULL) - return (SET_ERROR(EINVAL)); - - error = dsl_pool_hold(tosnap_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); - if (error != 0) { - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { - dsl_dataset_rele(fromsnap, FTAG); - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - return (SET_ERROR(EXDEV)); - } - - fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg; - dsl_dataset_rele(fromsnap, FTAG); - - dsl_dataset_long_hold(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - - da.da_fp = fp; - da.da_offp = offp; - da.da_ddr.ddr_type = DDR_NONE; - da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; - da.da_err = 0; - da.da_td = curthread; - - error = traverse_dataset(tosnap, fromtxg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); - - if (error != 0) { - da.da_err = error; - } else { - /* we set the da.da_err we return as side-effect */ - (void) write_record(&da); - } - - dsl_dataset_long_rele(tosnap, FTAG); - dsl_dataset_rele(tosnap, FTAG); - - return (da.da_err); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ /dev/null @@ -1,444 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - * Copyright 2014 HybridCluster. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * Each of the concurrent object allocators will grab - * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to - * grab 128 slots, which is 4 blocks worth. This was experimentally - * determined to be the lowest value that eliminates the measurable effect - * of lock contention from this code path. 
- */ -int dmu_object_alloc_chunk_shift = 7; - -static uint64_t -dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, - int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx) -{ - uint64_t object; - uint64_t L1_dnode_count = DNODES_PER_BLOCK << - (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); - dnode_t *dn = NULL; - int dn_slots = dnodesize >> DNODE_SHIFT; - boolean_t restarted = B_FALSE; - uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID % - os->os_obj_next_percpu_len]; - int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; - int error; - - if (dn_slots == 0) { - dn_slots = DNODE_MIN_SLOTS; - } else { - ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); - ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); - } - - /* - * The "chunk" of dnodes that is assigned to a CPU-specific - * allocator needs to be at least one block's worth, to avoid - * lock contention on the dbuf. It can be at most one L1 block's - * worth, so that the "rescan after polishing off a L1's worth" - * logic below will be sure to kick in. - */ - if (dnodes_per_chunk < DNODES_PER_BLOCK) - dnodes_per_chunk = DNODES_PER_BLOCK; - if (dnodes_per_chunk > L1_dnode_count) - dnodes_per_chunk = L1_dnode_count; - -#ifdef __FreeBSD__ - object = atomic_load_64(cpuobj); -#else - object = *cpuobj; -#endif - - for (;;) { - /* - * If we finished a chunk of dnodes, get a new one from - * the global allocator. - */ - if ((P2PHASE(object, dnodes_per_chunk) == 0) || - (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < - dn_slots)) { - DNODE_STAT_BUMP(dnode_alloc_next_chunk); - mutex_enter(&os->os_obj_lock); - ASSERT0(P2PHASE(os->os_obj_next_chunk, - dnodes_per_chunk)); - object = os->os_obj_next_chunk; - - /* - * Each time we polish off a L1 bp worth of dnodes - * (2^12 objects), move to another L1 bp that's - * still reasonably sparse (at most 1/4 full). Look - * from the beginning at most once per txg. If we - * still can't allocate from that L1 block, search - * for an empty L0 block, which will quickly skip - * to the end of the metadnode if the no nearby L0 - * blocks are empty. This fallback avoids a - * pathology where full dnode blocks containing - * large dnodes appear sparse because they have a - * low blk_fill, leading to many failed allocation - * attempts. In the long term a better mechanism to - * search for sparse metadnode regions, such as - * spacemaps, could be implemented. - * - * os_scan_dnodes is set during txg sync if enough - * objects have been freed since the previous - * rescan to justify backfilling again. - * - * Note that dmu_traverse depends on the behavior - * that we use multiple blocks of the dnode object - * before going back to reuse objects. Any change - * to this algorithm should preserve that property - * or find another solution to the issues described - * in traverse_visitbp. - */ - if (P2PHASE(object, L1_dnode_count) == 0) { - uint64_t offset; - uint64_t blkfill; - int minlvl; - if (os->os_rescan_dnodes) { - offset = 0; - os->os_rescan_dnodes = B_FALSE; - } else { - offset = object << DNODE_SHIFT; - } - blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; - minlvl = restarted ? 1 : 2; - restarted = B_TRUE; - error = dnode_next_offset(DMU_META_DNODE(os), - DNODE_FIND_HOLE, &offset, minlvl, - blkfill, 0); - if (error == 0) { - object = offset >> DNODE_SHIFT; - } - } - /* - * Note: if "restarted", we may find a L0 that - * is not suitably aligned. 
- */ - os->os_obj_next_chunk = - P2ALIGN(object, dnodes_per_chunk) + - dnodes_per_chunk; - (void) atomic_swap_64(cpuobj, object); - mutex_exit(&os->os_obj_lock); - } - - /* - * The value of (*cpuobj) before adding dn_slots is the object - * ID assigned to us. The value afterwards is the object ID - * assigned to whoever wants to do an allocation next. - */ - object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; - - /* - * XXX We should check for an i/o error here and return - * up to our caller. Actually we should pre-read it in - * dmu_tx_assign(), but there is currently no mechanism - * to do so. - */ - error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, - dn_slots, FTAG, &dn); - if (error == 0) { - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - /* - * Another thread could have allocated it; check - * again now that we have the struct lock. - */ - if (dn->dn_type == DMU_OT_NONE) { - dnode_allocate(dn, ot, blocksize, 0, - bonustype, bonuslen, dn_slots, tx); - rw_exit(&dn->dn_struct_rwlock); - dmu_tx_add_new_object(tx, dn); - dnode_rele(dn, FTAG); - return (object); - } - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); - DNODE_STAT_BUMP(dnode_alloc_race); - } - - /* - * Skip to next known valid starting point on error. This - * is the start of the next block of dnodes. - */ - if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { - object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); - DNODE_STAT_BUMP(dnode_alloc_next_block); - } - (void) atomic_swap_64(cpuobj, object); - } -} - -uint64_t -dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, - bonuslen, 0, tx)); -} - -uint64_t -dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, - int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - dmu_tx_t *tx) -{ - return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, - bonustype, bonuslen, 0, tx)); -} - -uint64_t -dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, - bonuslen, dnodesize, tx)); -} - -int -dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, - bonuslen, 0, tx)); -} - -int -dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx) -{ - dnode_t *dn; - int dn_slots = dnodesize >> DNODE_SHIFT; - int err; - - if (dn_slots == 0) - dn_slots = DNODE_MIN_SLOTS; - ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); - ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); - - if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) - return (SET_ERROR(EBADF)); - - err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, - FTAG, &dn); - if (err) - return (err); - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); - dmu_tx_add_new_object(tx, dn); - - dnode_rele(dn, FTAG); - - return (0); -} - -int -dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, - bonuslen, DNODE_MIN_SIZE, tx)); -} - -int 
-dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dmu_tx_t *tx) -{ - dnode_t *dn; - int dn_slots = dnodesize >> DNODE_SHIFT; - int err; - - if (dn_slots == 0) - dn_slots = DNODE_MIN_SLOTS; - - if (object == DMU_META_DNODE_OBJECT) - return (SET_ERROR(EBADF)); - - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, - FTAG, &dn); - if (err) - return (err); - - dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); - - dnode_rele(dn, FTAG); - return (err); -} - - -int -dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, - FTAG, &dn); - if (err) - return (err); - - ASSERT(dn->dn_type != DMU_OT_NONE); - /* - * If we don't create this free range, we'll leak indirect blocks when - * we get to freeing the dnode in syncing context. - */ - dnode_free_range(dn, 0, DMU_OBJECT_END, tx); - dnode_free(dn, tx); - dnode_rele(dn, FTAG); - - return (0); -} - -/* - * Return (in *objectp) the next object which is allocated (or a hole) - * after *object, taking into account only objects that may have been modified - * after the specified txg. - */ -int -dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) -{ - uint64_t offset; - uint64_t start_obj; - struct dsl_dataset *ds = os->os_dsl_dataset; - int error; - - if (*objectp == 0) { - start_obj = 1; - } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) { - uint64_t i = *objectp + 1; - uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); - dmu_object_info_t doi; - - /* - * Scan through the remaining meta dnode block. The contents - * of each slot in the block are known so it can be quickly - * checked. If the block is exhausted without a match then - * hand off to dnode_next_offset() for further scanning. - */ - while (i <= last_obj) { - error = dmu_object_info(os, i, &doi); - if (error == ENOENT) { - if (hole) { - *objectp = i; - return (0); - } else { - i++; - } - } else if (error == EEXIST) { - i++; - } else if (error == 0) { - if (hole) { - i += doi.doi_dnodesize >> DNODE_SHIFT; - } else { - *objectp = i; - return (0); - } - } else { - return (error); - } - } - - start_obj = i; - } else { - start_obj = *objectp + 1; - } - - offset = start_obj << DNODE_SHIFT; - - error = dnode_next_offset(DMU_META_DNODE(os), - (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); - - *objectp = offset >> DNODE_SHIFT; - - return (error); -} - -/* - * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the - * refcount on SPA_FEATURE_EXTENSIBLE_DATASET. - * - * Only for use from syncing context, on MOS objects. - */ -void -dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, - dmu_tx_t *tx) -{ - dnode_t *dn; - - ASSERT(dmu_tx_is_syncing(tx)); - - VERIFY0(dnode_hold(mos, object, FTAG, &dn)); - if (dn->dn_type == DMU_OTN_ZAP_METADATA) { - dnode_rele(dn, FTAG); - return; - } - ASSERT3U(dn->dn_type, ==, old_type); - ASSERT0(dn->dn_maxblkid); - - /* - * We must initialize the ZAP data before changing the type, - * so that concurrent calls to *_is_zapified() can determine if - * the object has been completely zapified by checking the type. 
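dmu_object_next() above first scans the remaining slots of the current dnode block, treating ENOENT as an empty slot, EEXIST as (roughly) a slot it should simply step over, and success as an allocated dnode whose doi_dnodesize says how many slots to skip when hunting for a hole. A self-contained sketch of that decision loop follows; probe_slot() is a made-up stand-in for dmu_object_info(), and the block geometry (32 slots of 512 bytes) is an assumption.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define DNODES_PER_BLOCK 32	/* assumed: 16K dnode block / 512-byte slots */
#define DNODE_SHIFT	 9

/* Toy stand-in for dmu_object_info(): a 1.5K (3-slot) dnode at object 64,
 * free space elsewhere.  Returns 0, ENOENT or EEXIST like the real call. */
static int
probe_slot(uint64_t obj, uint64_t *dnodesize)
{
	if (obj == 64) { *dnodesize = 3 << DNODE_SHIFT; return (0); }
	if (obj == 65 || obj == 66) return (EEXIST);
	return (ENOENT);
}

static uint64_t
next_object(uint64_t start, int find_hole)
{
	uint64_t i = start;
	uint64_t last_obj = start | (DNODES_PER_BLOCK - 1);
	uint64_t dnodesize = 0;
	int error;

	while (i <= last_obj) {
		error = probe_slot(i, &dnodesize);
		if (error == ENOENT) {		/* empty slot */
			if (find_hole)
				return (i);
			i++;
		} else if (error == EEXIST) {	/* slot to step over */
			i++;
		} else {			/* allocated dnode */
			if (find_hole)
				i += dnodesize >> DNODE_SHIFT;
			else
				return (i);
		}
	}
	return (i);	/* block exhausted; the real code falls back to dnode_next_offset() */
}

int
main(void)
{
	printf("next allocated from 60: %ju\n", (uintmax_t)next_object(60, 0));
	printf("next hole from 64:      %ju\n", (uintmax_t)next_object(64, 1));
	return (0);
}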
- */ - mzap_create_impl(mos, object, 0, 0, tx); - - dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = - DMU_OTN_ZAP_METADATA; - dnode_setdirty(dn, tx); - dnode_rele(dn, FTAG); - - spa_feature_incr(dmu_objset_spa(mos), - SPA_FEATURE_EXTENSIBLE_DATASET, tx); -} - -void -dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) -{ - dnode_t *dn; - dmu_object_type_t t; - - ASSERT(dmu_tx_is_syncing(tx)); - - VERIFY0(dnode_hold(mos, object, FTAG, &dn)); - t = dn->dn_type; - dnode_rele(dn, FTAG); - - if (t == DMU_OTN_ZAP_METADATA) { - spa_feature_decr(dmu_objset_spa(mos), - SPA_FEATURE_EXTENSIBLE_DATASET, tx); - } - VERIFY0(dmu_object_free(mos, object, tx)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ /dev/null @@ -1,2484 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. - * Copyright (c) 2018, loli10K . All rights reserved. - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zfs_namecheck.h" - -/* - * Needed to close a window in dnode_move() that allows the objset to be freed - * before it can be safely accessed. - */ -krwlock_t os_lock; - -/* - * Tunable to overwrite the maximum number of threads for the parallization - * of dmu_objset_find_dp, needed to speed up the import of pools with many - * datasets. - * Default is 4 times the number of leaf vdevs. - */ -int dmu_find_threads = 0; - -/* - * Backfill lower metadnode objects after this many have been freed. - * Backfilling negatively impacts object creation rates, so only do it - * if there are enough holes to fill. 
- */ -int dmu_rescan_dnode_threshold = 131072; - -static void dmu_objset_find_dp_cb(void *arg); - -void -dmu_objset_init(void) -{ - rw_init(&os_lock, NULL, RW_DEFAULT, NULL); -} - -void -dmu_objset_fini(void) -{ - rw_destroy(&os_lock); -} - -spa_t * -dmu_objset_spa(objset_t *os) -{ - return (os->os_spa); -} - -zilog_t * -dmu_objset_zil(objset_t *os) -{ - return (os->os_zil); -} - -dsl_pool_t * -dmu_objset_pool(objset_t *os) -{ - dsl_dataset_t *ds; - - if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) - return (ds->ds_dir->dd_pool); - else - return (spa_get_dsl(os->os_spa)); -} - -dsl_dataset_t * -dmu_objset_ds(objset_t *os) -{ - return (os->os_dsl_dataset); -} - -dmu_objset_type_t -dmu_objset_type(objset_t *os) -{ - return (os->os_phys->os_type); -} - -void -dmu_objset_name(objset_t *os, char *buf) -{ - dsl_dataset_name(os->os_dsl_dataset, buf); -} - -uint64_t -dmu_objset_id(objset_t *os) -{ - dsl_dataset_t *ds = os->os_dsl_dataset; - - return (ds ? ds->ds_object : 0); -} - -uint64_t -dmu_objset_dnodesize(objset_t *os) -{ - return (os->os_dnodesize); -} - -zfs_sync_type_t -dmu_objset_syncprop(objset_t *os) -{ - return (os->os_sync); -} - -zfs_logbias_op_t -dmu_objset_logbias(objset_t *os) -{ - return (os->os_logbias); -} - -static void -checksum_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance should have been done by now. - */ - ASSERT(newval != ZIO_CHECKSUM_INHERIT); - - os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); -} - -static void -compression_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval != ZIO_COMPRESS_INHERIT); - - os->os_compress = zio_compress_select(os->os_spa, newval, - ZIO_COMPRESS_ON); -} - -static void -copies_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval > 0); - ASSERT(newval <= spa_max_replication(os->os_spa)); - - os->os_copies = newval; -} - -static void -dedup_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - spa_t *spa = os->os_spa; - enum zio_checksum checksum; - - /* - * Inheritance should have been done by now. - */ - ASSERT(newval != ZIO_CHECKSUM_INHERIT); - - checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); - - os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; - os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); -} - -static void -primary_cache_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || - newval == ZFS_CACHE_METADATA); - - os->os_primary_cache = newval; -} - -static void -secondary_cache_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || - newval == ZFS_CACHE_METADATA); - - os->os_secondary_cache = newval; -} - -static void -sync_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. 
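dedup_changed_cb() above splits one property value into two fields: the checksum algorithm from the low bits and a verify flag from a higher bit. The sketch below mirrors that decode; CHECKSUM_MASK, CHECKSUM_VERIFY and the sample value are local assumptions standing in for the zio.h definitions, not the actual constants.

#include <stdint.h>
#include <stdio.h>

/* Assumed layout, mirroring ZIO_CHECKSUM_MASK / ZIO_CHECKSUM_VERIFY:
 * low byte selects the algorithm, a higher bit requests verification. */
#define CHECKSUM_MASK	0xffULL
#define CHECKSUM_VERIFY	(1ULL << 8)

int
main(void)
{
	uint64_t prop = 7 | CHECKSUM_VERIFY;	/* arbitrary algorithm id, verify set */
	uint64_t dedup_checksum = prop & CHECKSUM_MASK;
	int dedup_verify = !!(prop & CHECKSUM_VERIFY);

	printf("algorithm=%ju verify=%d\n", (uintmax_t)dedup_checksum, dedup_verify);
	return (0);
}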
- */ - ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || - newval == ZFS_SYNC_DISABLED); - - os->os_sync = newval; - if (os->os_zil) - zil_set_sync(os->os_zil, newval); -} - -static void -redundant_metadata_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || - newval == ZFS_REDUNDANT_METADATA_MOST); - - os->os_redundant_metadata = newval; -} - -static void -dnodesize_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - switch (newval) { - case ZFS_DNSIZE_LEGACY: - os->os_dnodesize = DNODE_MIN_SIZE; - break; - case ZFS_DNSIZE_AUTO: - /* - * Choose a dnode size that will work well for most - * workloads if the user specified "auto". Future code - * improvements could dynamically select a dnode size - * based on observed workload patterns. - */ - os->os_dnodesize = DNODE_MIN_SIZE * 2; - break; - case ZFS_DNSIZE_1K: - case ZFS_DNSIZE_2K: - case ZFS_DNSIZE_4K: - case ZFS_DNSIZE_8K: - case ZFS_DNSIZE_16K: - os->os_dnodesize = newval; - break; - } -} - -static void -smallblk_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE); - ASSERT(ISP2(newval)); - - os->os_zpl_special_smallblock = newval; -} - -static void -logbias_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - ASSERT(newval == ZFS_LOGBIAS_LATENCY || - newval == ZFS_LOGBIAS_THROUGHPUT); - os->os_logbias = newval; - if (os->os_zil) - zil_set_logbias(os->os_zil, newval); -} - -static void -recordsize_changed_cb(void *arg, uint64_t newval) -{ - objset_t *os = arg; - - os->os_recordsize = newval; -} - -void -dmu_objset_byteswap(void *buf, size_t size) -{ - objset_phys_t *osp = buf; - - ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); - dnode_byteswap(&osp->os_meta_dnode); - byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); - osp->os_type = BSWAP_64(osp->os_type); - osp->os_flags = BSWAP_64(osp->os_flags); - if (size == sizeof (objset_phys_t)) { - dnode_byteswap(&osp->os_userused_dnode); - dnode_byteswap(&osp->os_groupused_dnode); - } -} - -/* - * The hash is a CRC-based hash of the objset_t pointer and the object number. - */ -static uint64_t -dnode_hash(const objset_t *os, uint64_t obj) -{ - uintptr_t osv = (uintptr_t)os; - uint64_t crc = -1ULL; - - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - /* - * The low 6 bits of the pointer don't have much entropy, because - * the objset_t is larger than 2^6 bytes long. - */ - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; - - crc ^= (osv>>14) ^ (obj>>24); - - return (crc); -} - -unsigned int -dnode_multilist_index_func(multilist_t *ml, void *obj) -{ - dnode_t *dn = obj; - return (dnode_hash(dn->dn_objset, dn->dn_object) % - multilist_get_num_sublists(ml)); -} - -/* - * Instantiates the objset_t in-memory structure corresponding to the - * objset_phys_t that's pointed to by the specified blkptr_t. 
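dnode_hash() above folds the objset pointer (shifted right 6 bits, since the low bits of a large structure's address carry little entropy) and three bytes of the object number through a slide-by-8 CRC-64 table, and dnode_multilist_index_func() then reduces the result modulo the sublist count. A runnable userland version follows; the table is generated in the usual reflected-CRC way, which also satisfies the ASSERT in the source that table[128] equals the polynomial.

#include <stdint.h>
#include <stdio.h>

#define CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected */

static uint64_t crc64_table[256];

static void
crc64_init(void)
{
	for (int i = 0; i < 256; i++) {
		uint64_t ct = i;
		for (int j = 0; j < 8; j++)
			ct = (ct >> 1) ^ (-(ct & 1) & CRC64_POLY);
		crc64_table[i] = ct;
	}
}

static uint64_t
dnode_hash(const void *os, uint64_t obj)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	crc = (crc >> 8) ^ crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ crc64_table[(crc ^ (obj >> 16)) & 0xFF];
	crc ^= (osv >> 14) ^ (obj >> 24);
	return (crc);
}

int
main(void)
{
	int some_objset;	/* stand-in for an objset_t; only its address matters */
	unsigned num_sublists = 64;

	crc64_init();
	printf("table[128] == poly: %d\n", crc64_table[128] == CRC64_POLY);
	for (uint64_t obj = 1; obj <= 4; obj++)
		printf("obj %ju -> sublist %ju\n", (uintmax_t)obj,
		    (uintmax_t)(dnode_hash(&some_objset, obj) % num_sublists));
	return (0);
}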
- */ -int -dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - objset_t **osp) -{ - objset_t *os; - int i, err; - - ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); - -#if 0 - /* - * The $ORIGIN dataset (if it exists) doesn't have an associated - * objset, so there's no reason to open it. The $ORIGIN dataset - * will not exist on pools older than SPA_VERSION_ORIGIN. - */ - if (ds != NULL && spa_get_dsl(spa) != NULL && - spa_get_dsl(spa)->dp_origin_snap != NULL) { - ASSERT3P(ds->ds_dir, !=, - spa_get_dsl(spa)->dp_origin_snap->ds_dir); - } -#endif - - os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); - os->os_dsl_dataset = ds; - os->os_spa = spa; - os->os_rootbp = bp; - if (!BP_IS_HOLE(os->os_rootbp)) { - arc_flags_t aflags = ARC_FLAG_WAIT; - zbookmark_phys_t zb; - SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - - if (DMU_OS_IS_L2CACHEABLE(os)) - aflags |= ARC_FLAG_L2CACHE; - - dprintf_bp(os->os_rootbp, "reading %s", ""); - err = arc_read(NULL, spa, os->os_rootbp, - arc_getbuf_func, &os->os_phys_buf, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); - if (err != 0) { - kmem_free(os, sizeof (objset_t)); - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } - - /* Increase the blocksize if we are permitted. */ - if (spa_version(spa) >= SPA_VERSION_USERSPACE && - arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, - ARC_BUFC_METADATA, sizeof (objset_phys_t)); - bzero(buf->b_data, sizeof (objset_phys_t)); - bcopy(os->os_phys_buf->b_data, buf->b_data, - arc_buf_size(os->os_phys_buf)); - arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); - os->os_phys_buf = buf; - } - - os->os_phys = os->os_phys_buf->b_data; - os->os_flags = os->os_phys->os_flags; - } else { - int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? - sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, - ARC_BUFC_METADATA, size); - os->os_phys = os->os_phys_buf->b_data; - bzero(os->os_phys, size); - } - - /* - * Note: the changed_cb will be called once before the register - * func returns, thus changing the checksum/compression from the - * default (fletcher2/off). Snapshots don't need to know about - * checksum/compression/copies. - */ - if (ds != NULL) { - boolean_t needlock = B_FALSE; - - /* - * Note: it's valid to open the objset if the dataset is - * long-held, in which case the pool_config lock will not - * be held. 
- */ - if (!dsl_pool_config_held(dmu_objset_pool(os))) { - needlock = B_TRUE; - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - } - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), - primary_cache_changed_cb, os); - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), - secondary_cache_changed_cb, os); - } - if (!ds->ds_is_snapshot) { - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), - checksum_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), - compression_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_COPIES), - copies_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_DEDUP), - dedup_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_LOGBIAS), - logbias_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SYNC), - sync_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name( - ZFS_PROP_REDUNDANT_METADATA), - redundant_metadata_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_RECORDSIZE), - recordsize_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_DNODESIZE), - dnodesize_changed_cb, os); - } - if (err == 0) { - err = dsl_prop_register(ds, - zfs_prop_to_name( - ZFS_PROP_SPECIAL_SMALL_BLOCKS), - smallblk_changed_cb, os); - } - } - if (needlock) - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (err != 0) { - arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); - kmem_free(os, sizeof (objset_t)); - return (err); - } - } else { - /* It's the meta-objset. */ - os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - os->os_compress = ZIO_COMPRESS_ON; - os->os_copies = spa_max_replication(spa); - os->os_dedup_checksum = ZIO_CHECKSUM_OFF; - os->os_dedup_verify = B_FALSE; - os->os_logbias = ZFS_LOGBIAS_LATENCY; - os->os_sync = ZFS_SYNC_STANDARD; - os->os_primary_cache = ZFS_CACHE_ALL; - os->os_secondary_cache = ZFS_CACHE_ALL; - os->os_dnodesize = DNODE_MIN_SIZE; - } - /* - * These properties will be filled in by the logic in zfs_get_zplprop() - * when they are queried for the first time. 
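The long if (err == 0) chain above registers one property callback at a time and, on the first failure, tears down what was already built (the phys buffer and the objset) before returning. A minimal sketch of that chain-then-unwind style; register_one() and the buffer are hypothetical stand-ins, not the DSL prop API.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for dsl_prop_register(): fails at index fail_at. */
static int
register_one(int idx, int fail_at)
{
	return (idx == fail_at ? -1 : 0);
}

static int
open_props(int fail_at)
{
	void *phys_buf = malloc(64);	/* stands in for os_phys_buf */
	int err = 0;

	if (phys_buf == NULL)
		return (-1);

	for (int i = 0; i < 10 && err == 0; i++)
		err = register_one(i, fail_at);

	if (err != 0) {
		free(phys_buf);		/* arc_buf_destroy + kmem_free in the original */
		return (err);
	}
	/* success path would hand phys_buf to the caller; freed here for brevity */
	free(phys_buf);
	return (0);
}

int
main(void)
{
	printf("all ok: %d\n", open_props(-1));
	printf("fail at 3: %d\n", open_props(3));
	return (0);
}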
- */ - os->os_version = OBJSET_PROP_UNINITIALIZED; - os->os_normalization = OBJSET_PROP_UNINITIALIZED; - os->os_utf8only = OBJSET_PROP_UNINITIALIZED; - os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; - - if (ds == NULL || !ds->ds_is_snapshot) - os->os_zil_header = os->os_phys->os_zil_header; - os->os_zil = zil_alloc(os, &os->os_zil_header); - - for (i = 0; i < TXG_SIZE; i++) { - os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[i]), - dnode_multilist_index_func); - } - list_create(&os->os_dnodes, sizeof (dnode_t), - offsetof(dnode_t, dn_link)); - list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_link)); - - mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - os->os_obj_next_percpu_len = boot_ncpus; - os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len * - sizeof (os->os_obj_next_percpu[0]), KM_SLEEP); - - dnode_special_open(os, &os->os_phys->os_meta_dnode, - DMU_META_DNODE_OBJECT, &os->os_meta_dnode); - if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - dnode_special_open(os, &os->os_phys->os_userused_dnode, - DMU_USERUSED_OBJECT, &os->os_userused_dnode); - dnode_special_open(os, &os->os_phys->os_groupused_dnode, - DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); - } - - *osp = os; - return (0); -} - -int -dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) -{ - int err = 0; - - /* - * We shouldn't be doing anything with dsl_dataset_t's unless the - * pool_config lock is held, or the dataset is long-held. - */ - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || - dsl_dataset_long_held(ds)); - - mutex_enter(&ds->ds_opening_lock); - if (ds->ds_objset == NULL) { - objset_t *os; - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, dsl_dataset_get_blkptr(ds), &os); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - if (err == 0) { - mutex_enter(&ds->ds_lock); - ASSERT(ds->ds_objset == NULL); - ds->ds_objset = os; - mutex_exit(&ds->ds_lock); - } - } - *osp = ds->ds_objset; - mutex_exit(&ds->ds_opening_lock); - return (err); -} - -/* - * Holds the pool while the objset is held. Therefore only one objset - * can be held at a time. - */ -int -dmu_objset_hold(const char *name, void *tag, objset_t **osp) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - - err = dsl_pool_hold(name, tag, &dp); - if (err != 0) - return (err); - err = dsl_dataset_hold(dp, name, tag, &ds); - if (err != 0) { - dsl_pool_rele(dp, tag); - return (err); - } - - err = dmu_objset_from_ds(ds, osp); - if (err != 0) { - dsl_dataset_rele(ds, tag); - dsl_pool_rele(dp, tag); - } - - return (err); -} - -static int -dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp) -{ - int err; - - err = dmu_objset_from_ds(ds, osp); - if (err != 0) { - dsl_dataset_disown(ds, tag); - } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { - dsl_dataset_disown(ds, tag); - return (SET_ERROR(EINVAL)); - } else if (!readonly && dsl_dataset_is_snapshot(ds)) { - dsl_dataset_disown(ds, tag); - return (SET_ERROR(EROFS)); - } - return (err); -} - -/* - * dsl_pool must not be held when this is called. - * Upon successful return, there will be a longhold on the dataset, - * and the dsl_pool will not be held. 
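dmu_objset_hold() above acquires three layers in order (pool, dataset, objset view) and on any failure releases exactly the layers already held, in reverse. The same unwind discipline, sketched with hypothetical hold/rele pairs rather than the real DSL calls:

#include <stdio.h>

/* Hypothetical stand-ins for dsl_pool_hold / dsl_dataset_hold / dmu_objset_from_ds. */
static int pool_hold(int fail)       { return (fail == 1 ? -1 : 0); }
static void pool_rele(void)          { printf("pool released\n"); }
static int dataset_hold(int fail)    { return (fail == 2 ? -1 : 0); }
static void dataset_rele(void)       { printf("dataset released\n"); }
static int objset_from_ds(int fail)  { return (fail == 3 ? -1 : 0); }

static int
objset_hold(int fail_step)
{
	int err;

	if ((err = pool_hold(fail_step)) != 0)
		return (err);
	if ((err = dataset_hold(fail_step)) != 0) {
		pool_rele();
		return (err);
	}
	if ((err = objset_from_ds(fail_step)) != 0) {
		dataset_rele();
		pool_rele();
		return (err);
	}
	return (0);	/* caller now owns all three; it must later rele in reverse */
}

int
main(void)
{
	printf("fail at objset step: %d\n", objset_hold(3));
	printf("success: %d\n", objset_hold(0));
	return (0);
}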
- */ -int -dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - - err = dsl_pool_hold(name, FTAG, &dp); - if (err != 0) - return (err); - err = dsl_dataset_own(dp, name, tag, &ds); - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - err = dmu_objset_own_impl(ds, type, readonly, tag, osp); - dsl_pool_rele(dp, FTAG); - - return (err); -} - -int -dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp) -{ - dsl_dataset_t *ds; - int err; - - err = dsl_dataset_own_obj(dp, obj, tag, &ds); - if (err != 0) - return (err); - - return (dmu_objset_own_impl(ds, type, readonly, tag, osp)); -} - -void -dmu_objset_rele(objset_t *os, void *tag) -{ - dsl_pool_t *dp = dmu_objset_pool(os); - dsl_dataset_rele(os->os_dsl_dataset, tag); - dsl_pool_rele(dp, tag); -} - -/* - * When we are called, os MUST refer to an objset associated with a dataset - * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner - * == tag. We will then release and reacquire ownership of the dataset while - * holding the pool config_rwlock to avoid intervening namespace or ownership - * changes may occur. - * - * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to - * release the hold on its dataset and acquire a new one on the dataset of the - * same name so that it can be partially torn down and reconstructed. - */ -void -dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, - void *tag) -{ - dsl_pool_t *dp; - char name[ZFS_MAX_DATASET_NAME_LEN]; - - VERIFY3P(ds, !=, NULL); - VERIFY3P(ds->ds_owner, ==, tag); - VERIFY(dsl_dataset_long_held(ds)); - - dsl_dataset_name(ds, name); - dp = ds->ds_dir->dd_pool; - dsl_pool_config_enter(dp, FTAG); - dsl_dataset_disown(ds, tag); - VERIFY0(dsl_dataset_own(dp, name, tag, newds)); - dsl_pool_config_exit(dp, FTAG); -} - -void -dmu_objset_disown(objset_t *os, void *tag) -{ - dsl_dataset_disown(os->os_dsl_dataset, tag); -} - -void -dmu_objset_evict_dbufs(objset_t *os) -{ - dnode_t dn_marker; - dnode_t *dn; - - mutex_enter(&os->os_lock); - dn = list_head(&os->os_dnodes); - while (dn != NULL) { - /* - * Skip dnodes without holds. We have to do this dance - * because dnode_add_ref() only works if there is already a - * hold. If the dnode has no holds, then it has no dbufs. - */ - if (dnode_add_ref(dn, FTAG)) { - list_insert_after(&os->os_dnodes, dn, &dn_marker); - mutex_exit(&os->os_lock); - - dnode_evict_dbufs(dn); - dnode_rele(dn, FTAG); - - mutex_enter(&os->os_lock); - dn = list_next(&os->os_dnodes, &dn_marker); - list_remove(&os->os_dnodes, &dn_marker); - } else { - dn = list_next(&os->os_dnodes, dn); - } - } - mutex_exit(&os->os_lock); - - if (DMU_USERUSED_DNODE(os) != NULL) { - dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); - dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); - } - dnode_evict_dbufs(DMU_META_DNODE(os)); -} - -/* - * Objset eviction processing is split into into two pieces. - * The first marks the objset as evicting, evicts any dbufs that - * have a refcount of zero, and then queues up the objset for the - * second phase of eviction. Once os->os_dnodes has been cleared by - * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. - * The second phase closes the special dnodes, dequeues the objset from - * the list of those undergoing eviction, and finally frees the objset. 
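dmu_objset_evict_dbufs() above walks os_dnodes while repeatedly dropping os_lock: it parks a marker node after the current entry, releases the lock for the expensive per-dnode work, then retakes the lock and resumes from the marker. A single-threaded sketch of the marker technique on a plain doubly linked list; the locking points are only marked in comments.

#include <stdio.h>

struct node {
	struct node *prev, *next;
	int val;		/* -1 marks head/marker sentinels */
};

static void
insert_after(struct node *pos, struct node *n)
{
	n->prev = pos;
	n->next = pos->next;
	if (pos->next != NULL)
		pos->next->prev = n;
	pos->next = n;
}

static void
remove_node(struct node *n)
{
	if (n->prev != NULL)
		n->prev->next = n->next;
	if (n->next != NULL)
		n->next->prev = n->prev;
}

int
main(void)
{
	struct node nodes[5], head = { NULL, NULL, -1 }, marker = { NULL, NULL, -1 };
	struct node *prev = &head, *n;

	for (int i = 0; i < 5; i++) {
		nodes[i] = (struct node){ NULL, NULL, i };
		insert_after(prev, &nodes[i]);
		prev = &nodes[i];
	}

	/* mutex_enter(&os->os_lock) would be held here */
	n = head.next;
	while (n != NULL) {
		insert_after(n, &marker);	/* remember our place */
		/* mutex_exit(); ... expensive per-node work (dnode_evict_dbufs) ... */
		printf("visiting %d\n", n->val);
		/* mutex_enter(); resume from the marker, which survives a
		 * concurrent removal of n itself */
		n = marker.next;
		remove_node(&marker);
	}
	/* mutex_exit() */
	return (0);
}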
- * - * NOTE: Due to asynchronous eviction processing (invocation of - * dnode_buf_pageout()), it is possible for the meta dnode for the - * objset to have no holds even though os->os_dnodes is not empty. - */ -void -dmu_objset_evict(objset_t *os) -{ - dsl_dataset_t *ds = os->os_dsl_dataset; - - for (int t = 0; t < TXG_SIZE; t++) - ASSERT(!dmu_objset_is_dirty(os, t)); - - if (ds) - dsl_prop_unregister_all(ds, os); - - if (os->os_sa) - sa_tear_down(os); - - dmu_objset_evict_dbufs(os); - - mutex_enter(&os->os_lock); - spa_evicting_os_register(os->os_spa, os); - if (list_is_empty(&os->os_dnodes)) { - mutex_exit(&os->os_lock); - dmu_objset_evict_done(os); - } else { - mutex_exit(&os->os_lock); - } -} - -void -dmu_objset_evict_done(objset_t *os) -{ - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - - dnode_special_close(&os->os_meta_dnode); - if (DMU_USERUSED_DNODE(os)) { - dnode_special_close(&os->os_userused_dnode); - dnode_special_close(&os->os_groupused_dnode); - } - zil_free(os->os_zil); - - arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); - - /* - * This is a barrier to prevent the objset from going away in - * dnode_move() until we can safely ensure that the objset is still in - * use. We consider the objset valid before the barrier and invalid - * after the barrier. - */ - rw_enter(&os_lock, RW_READER); - rw_exit(&os_lock); - - kmem_free(os->os_obj_next_percpu, - os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0])); - - mutex_destroy(&os->os_lock); - mutex_destroy(&os->os_userused_lock); - mutex_destroy(&os->os_obj_lock); - mutex_destroy(&os->os_user_ptr_lock); - for (int i = 0; i < TXG_SIZE; i++) { - multilist_destroy(os->os_dirty_dnodes[i]); - } - spa_evicting_os_deregister(os->os_spa, os); - kmem_free(os, sizeof (objset_t)); -} - -timestruc_t -dmu_objset_snap_cmtime(objset_t *os) -{ - return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); -} - -/* called from dsl for meta-objset */ -objset_t * -dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - dmu_objset_type_t type, dmu_tx_t *tx) -{ - objset_t *os; - dnode_t *mdn; - - ASSERT(dmu_tx_is_syncing(tx)); - - if (ds != NULL) - VERIFY0(dmu_objset_from_ds(ds, &os)); - else - VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); - - mdn = DMU_META_DNODE(os); - - dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT, - DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx); - - /* - * We don't want to have to increase the meta-dnode's nlevels - * later, because then we could do it in quescing context while - * we are also accessing it in open context. - * - * This precaution is not necessary for the MOS (ds == NULL), - * because the MOS is only updated in syncing context. - * This is most fortunate: the MOS is the only objset that - * needs to be synced multiple times as spa_sync() iterates - * to convergence, so minimizing its dn_nlevels matters. - */ - if (ds != NULL) { - int levels = 1; - - /* - * Determine the number of levels necessary for the meta-dnode - * to contain DN_MAX_OBJECT dnodes. Note that in order to - * ensure that we do not overflow 64 bits, there has to be - * a nlevels that gives us a number of blocks > DN_MAX_OBJECT - * but < 2^64. Therefore, - * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be - * less than (64 - log2(DN_MAX_OBJECT)) (16). 
- */ - while ((uint64_t)mdn->dn_nblkptr << - (mdn->dn_datablkshift - DNODE_SHIFT + - (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < - DN_MAX_OBJECT) - levels++; - - mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = - mdn->dn_nlevels = levels; - } - - ASSERT(type != DMU_OST_NONE); - ASSERT(type != DMU_OST_ANY); - ASSERT(type < DMU_OST_NUMTYPES); - os->os_phys->os_type = type; - if (dmu_objset_userused_enabled(os)) { - os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; - os->os_flags = os->os_phys->os_flags; - } - - dsl_dataset_dirty(ds, tx); - - return (os); -} - -typedef struct dmu_objset_create_arg { - const char *doca_name; - cred_t *doca_cred; - void (*doca_userfunc)(objset_t *os, void *arg, - cred_t *cr, dmu_tx_t *tx); - void *doca_userarg; - dmu_objset_type_t doca_type; - uint64_t doca_flags; -} dmu_objset_create_arg_t; - -/*ARGSUSED*/ -static int -dmu_objset_create_check(void *arg, dmu_tx_t *tx) -{ - dmu_objset_create_arg_t *doca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *pdd; - dsl_dataset_t *parentds; - objset_t *parentos; - const char *tail; - int error; - - if (strchr(doca->doca_name, '@') != NULL) - return (SET_ERROR(EINVAL)); - - if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - - if (dataset_nestcheck(doca->doca_name) != 0) - return (SET_ERROR(ENAMETOOLONG)); - - error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); - if (error != 0) - return (error); - if (tail == NULL) { - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(EEXIST)); - } - error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, - doca->doca_cred); - if (error != 0) { - dsl_dir_rele(pdd, FTAG); - return (error); - } - - /* can't create below anything but filesystems (eg. no ZVOLs) */ - error = dsl_dataset_hold_obj(pdd->dd_pool, - dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds); - if (error != 0) { - dsl_dir_rele(pdd, FTAG); - return (error); - } - error = dmu_objset_from_ds(parentds, &parentos); - if (error != 0) { - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(pdd, FTAG); - return (error); - } - if (dmu_objset_type(parentos) != DMU_OST_ZFS) { - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); - } - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(pdd, FTAG); - - return (error); -} - -static void -dmu_objset_create_sync(void *arg, dmu_tx_t *tx) -{ - dmu_objset_create_arg_t *doca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *pdd; - const char *tail; - dsl_dataset_t *ds; - uint64_t obj; - blkptr_t *bp; - objset_t *os; - - VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); - - obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, - doca->doca_cred, tx); - - VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - bp = dsl_dataset_get_blkptr(ds); - os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, - ds, bp, doca->doca_type, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - if (doca->doca_userfunc != NULL) { - doca->doca_userfunc(os, doca->doca_userarg, - doca->doca_cred, tx); - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_create_minors(dp->dp_spa, doca->doca_name); -#endif - spa_history_log_internal_ds(ds, "create", tx, ""); - dsl_dataset_rele(ds, FTAG); - dsl_dir_rele(pdd, FTAG); -} - -int -dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) -{ - 
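The loop above sizes the meta-dnode's indirection so that its block pointers can address DN_MAX_OBJECT dnodes. The standalone computation below uses representative constants (16K dnode blocks, 512-byte dnodes, 128K indirect blocks, 128-byte block pointers, a 2^48 object limit); treat the exact numbers as assumptions, while the loop itself mirrors the source.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Representative values; see lead-in. */
	uint64_t nblkptr = 3;			/* block pointers in the meta-dnode */
	int datablkshift = 14;			/* 16K dnode blocks */
	int dnode_shift = 9;			/* 512-byte dnode slots */
	int indblkshift = 17;			/* 128K indirect blocks */
	int blkptr_shift = 7;			/* 128-byte block pointers */
	uint64_t max_object = 1ULL << 48;	/* DN_MAX_OBJECT analogue */
	int levels = 1;

	/* Same shape as dmu_objset_create_impl(): add levels until the tree can
	 * address max_object dnodes. */
	while ((nblkptr << (datablkshift - dnode_shift +
	    (levels - 1) * (indblkshift - blkptr_shift))) < max_object)
		levels++;

	printf("meta-dnode needs %d levels\n", levels);	/* prints 6 with these values */
	return (0);
}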
dmu_objset_create_arg_t doca; - - doca.doca_name = name; - doca.doca_cred = CRED(); - doca.doca_flags = flags; - doca.doca_userfunc = func; - doca.doca_userarg = arg; - doca.doca_type = type; - - return (dsl_sync_task(name, - dmu_objset_create_check, dmu_objset_create_sync, &doca, - 5, ZFS_SPACE_CHECK_NORMAL)); -} - -typedef struct dmu_objset_clone_arg { - const char *doca_clone; - const char *doca_origin; - cred_t *doca_cred; -} dmu_objset_clone_arg_t; - -/*ARGSUSED*/ -static int -dmu_objset_clone_check(void *arg, dmu_tx_t *tx) -{ - dmu_objset_clone_arg_t *doca = arg; - dsl_dir_t *pdd; - const char *tail; - int error; - dsl_dataset_t *origin; - dsl_pool_t *dp = dmu_tx_pool(tx); - - if (strchr(doca->doca_clone, '@') != NULL) - return (SET_ERROR(EINVAL)); - - if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - - error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); - if (error != 0) - return (error); - if (tail == NULL) { - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(EEXIST)); - } - - error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, - doca->doca_cred); - if (error != 0) { - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(EDQUOT)); - } - dsl_dir_rele(pdd, FTAG); - - error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); - if (error != 0) - return (error); - - /* You can only clone snapshots, not the head datasets. */ - if (!origin->ds_is_snapshot) { - dsl_dataset_rele(origin, FTAG); - return (SET_ERROR(EINVAL)); - } - dsl_dataset_rele(origin, FTAG); - - return (0); -} - -static void -dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) -{ - dmu_objset_clone_arg_t *doca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *pdd; - const char *tail; - dsl_dataset_t *origin, *ds; - uint64_t obj; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - - VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); - VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); - - obj = dsl_dataset_create_sync(pdd, tail, origin, 0, - doca->doca_cred, tx); - - VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); - dsl_dataset_name(origin, namebuf); -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_create_minors(dp->dp_spa, doca->doca_clone); -#endif - spa_history_log_internal_ds(ds, "clone", tx, - "origin=%s (%llu)", namebuf, origin->ds_object); - dsl_dataset_rele(ds, FTAG); - dsl_dataset_rele(origin, FTAG); - dsl_dir_rele(pdd, FTAG); -} - -int -dmu_objset_clone(const char *clone, const char *origin) -{ - dmu_objset_clone_arg_t doca; - - doca.doca_clone = clone; - doca.doca_origin = origin; - doca.doca_cred = CRED(); - - return (dsl_sync_task(clone, - dmu_objset_clone_check, dmu_objset_clone_sync, &doca, - 5, ZFS_SPACE_CHECK_NORMAL)); -} - -static int -dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg) -{ - int error = 0; - uint64_t object = 0; - while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { - error = dmu_object_remap_indirects(os, object, - last_removed_txg); - /* - * If the ZPL removed the object before we managed to dnode_hold - * it, we would get an ENOENT. If the ZPL declares its intent - * to remove the object (dnode_free) before we manage to - * dnode_hold it, we would get an EEXIST. In either case, we - * want to continue remapping the other objects in the objset; - * in all other cases, we want to break early. 
- */ - if (error != 0 && error != ENOENT && error != EEXIST) { - break; - } - } - if (error == ESRCH) { - error = 0; - } - return (error); -} - -int -dmu_objset_remap_indirects(const char *fsname) -{ - int error = 0; - objset_t *os = NULL; - uint64_t last_removed_txg; - uint64_t remap_start_txg; - dsl_dir_t *dd; - - error = dmu_objset_hold(fsname, FTAG, &os); - if (error != 0) { - return (error); - } - dd = dmu_objset_ds(os)->ds_dir; - - if (!spa_feature_is_enabled(dmu_objset_spa(os), - SPA_FEATURE_OBSOLETE_COUNTS)) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* - * If there has not been a removal, we're done. - */ - last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os)); - if (last_removed_txg == -1ULL) { - dmu_objset_rele(os, FTAG); - return (0); - } - - /* - * If we have remapped since the last removal, we're done. - */ - if (dsl_dir_is_zapified(dd)) { - uint64_t last_remap_txg; - if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)), - dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (last_remap_txg), 1, &last_remap_txg) == 0 && - last_remap_txg > last_removed_txg) { - dmu_objset_rele(os, FTAG); - return (0); - } - } - - dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); - dsl_pool_rele(dmu_objset_pool(os), FTAG); - - remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os)); - error = dmu_objset_remap_indirects_impl(os, last_removed_txg); - if (error == 0) { - /* - * We update the last_remap_txg to be the start txg so that - * we can guarantee that every block older than last_remap_txg - * that can be remapped has been remapped. - */ - error = dsl_dir_update_last_remap_txg(dd, remap_start_txg); - } - - dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); - dsl_dataset_rele(dmu_objset_ds(os), FTAG); - - return (error); -} - -int -dmu_objset_snapshot_one(const char *fsname, const char *snapname) -{ - int err; - char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); - nvlist_t *snaps = fnvlist_alloc(); - - fnvlist_add_boolean(snaps, longsnap); - strfree(longsnap); - err = dsl_dataset_snapshot(snaps, NULL, NULL); - fnvlist_free(snaps); - return (err); -} - -static void -dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) -{ - dnode_t *dn; - - while ((dn = multilist_sublist_head(list)) != NULL) { - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - ASSERT(dn->dn_dbuf->db_data_pending); - /* - * Initialize dn_zio outside dnode_sync() because the - * meta-dnode needs to set it ouside dnode_sync(). - */ - dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; - ASSERT(dn->dn_zio); - - ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); - multilist_sublist_remove(list, dn); - - /* - * If we are not doing useraccounting (os_synced_dnodes == NULL) - * we are done with this dnode for this txg. Unset dn_dirty_txg - * if later txgs aren't dirtying it so that future holders do - * not get a stale value. Otherwise, we will do this in - * userquota_updates_task() when processing has completely - * finished for this txg. 
- */ - multilist_t *newlist = dn->dn_objset->os_synced_dnodes; - if (newlist != NULL) { - (void) dnode_add_ref(dn, newlist); - multilist_insert(newlist, dn); - } else { - mutex_enter(&dn->dn_mtx); - if (dn->dn_dirty_txg == tx->tx_txg) - dn->dn_dirty_txg = 0; - mutex_exit(&dn->dn_mtx); - } - - dnode_sync(dn, tx); - } -} - -/* ARGSUSED */ -static void -dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) -{ - blkptr_t *bp = zio->io_bp; - objset_t *os = arg; - dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; - - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); - ASSERT0(BP_GET_LEVEL(bp)); - - /* - * Update rootbp fill count: it should be the number of objects - * allocated in the object set (not counting the "special" - * objects that are stored in the objset_phys_t -- the meta - * dnode and user/group accounting objects). - */ - bp->blk_fill = 0; - for (int i = 0; i < dnp->dn_nblkptr; i++) - bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); - if (os->os_dsl_dataset != NULL) - rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG); - *os->os_rootbp = *bp; - if (os->os_dsl_dataset != NULL) - rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); -} - -/* ARGSUSED */ -static void -dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) -{ - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - objset_t *os = arg; - - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(BP_EQUAL(bp, bp_orig)); - } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); - dsl_dataset_block_born(ds, bp, tx); - } - kmem_free(bp, sizeof (*bp)); -} - -typedef struct sync_dnodes_arg { - multilist_t *sda_list; - int sda_sublist_idx; - multilist_t *sda_newlist; - dmu_tx_t *sda_tx; -} sync_dnodes_arg_t; - -static void -sync_dnodes_task(void *arg) -{ - sync_dnodes_arg_t *sda = arg; - - multilist_sublist_t *ms = - multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); - - dmu_objset_sync_dnodes(ms, sda->sda_tx); - - multilist_sublist_unlock(ms); - - kmem_free(sda, sizeof (*sda)); -} - - -/* called from dsl */ -void -dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) -{ - int txgoff; - zbookmark_phys_t zb; - zio_prop_t zp; - zio_t *zio; - list_t *list; - dbuf_dirty_record_t *dr; - int num_sublists; - multilist_t *ml; - blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); - *blkptr_copy = *os->os_rootbp; - - dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); - - ASSERT(dmu_tx_is_syncing(tx)); - /* XXX the write_done callback should really give us the tx... */ - os->os_synctx = tx; - - if (os->os_dsl_dataset == NULL) { - /* - * This is the MOS. If we have upgraded, - * spa_max_replication() could change, so reset - * os_copies here. - */ - os->os_copies = spa_max_replication(os->os_spa); - } - - /* - * Create the root block IO - */ - SET_BOOKMARK(&zb, os->os_dsl_dataset ? 
- os->os_dsl_dataset->ds_object : DMU_META_OBJSET, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - arc_release(os->os_phys_buf, &os->os_phys_buf); - - dmu_write_policy(os, NULL, 0, 0, &zp); - - zio = arc_write(pio, os->os_spa, tx->tx_txg, - blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), - &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, - os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - - /* - * Sync special dnodes - the parent IO for the sync is the root block - */ - DMU_META_DNODE(os)->dn_zio = zio; - dnode_sync(DMU_META_DNODE(os), tx); - - os->os_phys->os_flags = os->os_flags; - - if (DMU_USERUSED_DNODE(os) && - DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { - DMU_USERUSED_DNODE(os)->dn_zio = zio; - dnode_sync(DMU_USERUSED_DNODE(os), tx); - DMU_GROUPUSED_DNODE(os)->dn_zio = zio; - dnode_sync(DMU_GROUPUSED_DNODE(os), tx); - } - - txgoff = tx->tx_txg & TXG_MASK; - - if (dmu_objset_userused_enabled(os)) { - /* - * We must create the list here because it uses the - * dn_dirty_link[] of this txg. But it may already - * exist because we call dsl_dataset_sync() twice per txg. - */ - if (os->os_synced_dnodes == NULL) { - os->os_synced_dnodes = - multilist_create(sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[txgoff]), - dnode_multilist_index_func); - } else { - ASSERT3U(os->os_synced_dnodes->ml_offset, ==, - offsetof(dnode_t, dn_dirty_link[txgoff])); - } - } - - ml = os->os_dirty_dnodes[txgoff]; - num_sublists = multilist_get_num_sublists(ml); - for (int i = 0; i < num_sublists; i++) { - if (multilist_sublist_is_empty_idx(ml, i)) - continue; - sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = ml; - sda->sda_sublist_idx = i; - sda->sda_tx = tx; - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - sync_dnodes_task, sda, 0); - /* callback frees sda */ - } - taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); - - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_head(list)) != NULL) { - ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); - if (dr->dr_zio) - zio_nowait(dr->dr_zio); - } - - /* Enable dnode backfill if enough objects have been freed. */ - if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { - os->os_rescan_dnodes = B_TRUE; - os->os_freed_dnodes = 0; - } - - /* - * Free intent log blocks up to this tx. 
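dmu_objset_sync() above dispatches one sync_dnodes_task per non-empty multilist sublist to the pool's sync taskq and then waits for the taskq to drain. The same fork/join shape, sketched with POSIX threads in place of the taskq (build with -pthread); the per-sublist item counts are invented.

#include <pthread.h>
#include <stdio.h>

#define NUM_SUBLISTS 4

struct sublist_arg {
	int idx;
	int items;	/* pretend number of dirty dnodes in this sublist */
};

/* Stand-in for sync_dnodes_task(): drain one sublist. */
static void *
sync_sublist(void *p)
{
	struct sublist_arg *a = p;

	printf("sublist %d: syncing %d dnodes\n", a->idx, a->items);
	return (NULL);
}

int
main(void)
{
	pthread_t tid[NUM_SUBLISTS];
	struct sublist_arg args[NUM_SUBLISTS];
	int started = 0;

	for (int i = 0; i < NUM_SUBLISTS; i++) {
		args[i].idx = i;
		args[i].items = i * 3;		/* sublist 0 is empty */
		if (args[i].items == 0)
			continue;		/* multilist_sublist_is_empty_idx() */
		if (pthread_create(&tid[started], NULL, sync_sublist, &args[i]) != 0)
			return (1);
		started++;
	}
	for (int i = 0; i < started; i++)	/* taskq_wait() analogue */
		pthread_join(tid[i], NULL);
	return (0);
}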
- */ - zil_sync(os->os_zil, tx); - os->os_phys->os_zil_header = os->os_zil_header; - zio_nowait(zio); -} - -boolean_t -dmu_objset_is_dirty(objset_t *os, uint64_t txg) -{ - return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); -} - -static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; - -void -dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) -{ - used_cbs[ost] = cb; -} - -boolean_t -dmu_objset_userused_enabled(objset_t *os) -{ - return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && - used_cbs[os->os_phys->os_type] != NULL && - DMU_USERUSED_DNODE(os) != NULL); -} - -typedef struct userquota_node { - uint64_t uqn_id; - int64_t uqn_delta; - avl_node_t uqn_node; -} userquota_node_t; - -typedef struct userquota_cache { - avl_tree_t uqc_user_deltas; - avl_tree_t uqc_group_deltas; -} userquota_cache_t; - -static int -userquota_compare(const void *l, const void *r) -{ - const userquota_node_t *luqn = l; - const userquota_node_t *ruqn = r; - - if (luqn->uqn_id < ruqn->uqn_id) - return (-1); - if (luqn->uqn_id > ruqn->uqn_id) - return (1); - return (0); -} - -static void -do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) -{ - void *cookie; - userquota_node_t *uqn; - - ASSERT(dmu_tx_is_syncing(tx)); - - cookie = NULL; - while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas, - &cookie)) != NULL) { - /* - * os_userused_lock protects against concurrent calls to - * zap_increment_int(). It's needed because zap_increment_int() - * is not thread-safe (i.e. not atomic). - */ - mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); - mutex_exit(&os->os_userused_lock); - kmem_free(uqn, sizeof (*uqn)); - } - avl_destroy(&cache->uqc_user_deltas); - - cookie = NULL; - while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, - &cookie)) != NULL) { - mutex_enter(&os->os_userused_lock); - VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT, - uqn->uqn_id, uqn->uqn_delta, tx)); - mutex_exit(&os->os_userused_lock); - kmem_free(uqn, sizeof (*uqn)); - } - avl_destroy(&cache->uqc_group_deltas); -} - -static void -userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta) -{ - userquota_node_t search = { .uqn_id = id }; - avl_index_t idx; - - userquota_node_t *uqn = avl_find(avl, &search, &idx); - if (uqn == NULL) { - uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP); - uqn->uqn_id = id; - avl_insert(avl, uqn, idx); - } - uqn->uqn_delta += delta; -} - -static void -do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags, - uint64_t user, uint64_t group, boolean_t subtract) -{ - if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { - int64_t delta = DNODE_MIN_SIZE + used; - if (subtract) - delta = -delta; - - userquota_update_cache(&cache->uqc_user_deltas, user, delta); - userquota_update_cache(&cache->uqc_group_deltas, group, delta); - } -} - -typedef struct userquota_updates_arg { - objset_t *uua_os; - int uua_sublist_idx; - dmu_tx_t *uua_tx; -} userquota_updates_arg_t; - -static void -userquota_updates_task(void *arg) -{ - userquota_updates_arg_t *uua = arg; - objset_t *os = uua->uua_os; - dmu_tx_t *tx = uua->uua_tx; - dnode_t *dn; - userquota_cache_t cache = { 0 }; - - multilist_sublist_t *list = - multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); - - ASSERT(multilist_sublist_head(list) == NULL || - dmu_objset_userused_enabled(os)); - avl_create(&cache.uqc_user_deltas, userquota_compare, - sizeof (userquota_node_t), offsetof(userquota_node_t, 
uqn_node)); - avl_create(&cache.uqc_group_deltas, userquota_compare, - sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); - - while ((dn = multilist_sublist_head(list)) != NULL) { - int flags; - ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); - ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || - dn->dn_phys->dn_flags & - DNODE_FLAG_USERUSED_ACCOUNTED); - - flags = dn->dn_id_flags; - ASSERT(flags); - if (flags & DN_ID_OLD_EXIST) { - do_userquota_update(&cache, - dn->dn_oldused, dn->dn_oldflags, - dn->dn_olduid, dn->dn_oldgid, B_TRUE); - } - if (flags & DN_ID_NEW_EXIST) { - do_userquota_update(&cache, - DN_USED_BYTES(dn->dn_phys), - dn->dn_phys->dn_flags, dn->dn_newuid, - dn->dn_newgid, B_FALSE); - } - - mutex_enter(&dn->dn_mtx); - dn->dn_oldused = 0; - dn->dn_oldflags = 0; - if (dn->dn_id_flags & DN_ID_NEW_EXIST) { - dn->dn_olduid = dn->dn_newuid; - dn->dn_oldgid = dn->dn_newgid; - dn->dn_id_flags |= DN_ID_OLD_EXIST; - if (dn->dn_bonuslen == 0) - dn->dn_id_flags |= DN_ID_CHKED_SPILL; - else - dn->dn_id_flags |= DN_ID_CHKED_BONUS; - } - dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); - if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) - dn->dn_dirty_txg = 0; - mutex_exit(&dn->dn_mtx); - - multilist_sublist_remove(list, dn); - dnode_rele(dn, os->os_synced_dnodes); - } - do_userquota_cacheflush(os, &cache, tx); - multilist_sublist_unlock(list); - kmem_free(uua, sizeof (*uua)); -} - -void -dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) -{ - int num_sublists; - - if (!dmu_objset_userused_enabled(os)) - return; - - /* Allocate the user/groupused objects if necessary. */ - if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { - VERIFY0(zap_create_claim(os, - DMU_USERUSED_OBJECT, - DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); - VERIFY0(zap_create_claim(os, - DMU_GROUPUSED_OBJECT, - DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); - } - - num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); - for (int i = 0; i < num_sublists; i++) { - if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i)) - continue; - userquota_updates_arg_t *uua = - kmem_alloc(sizeof (*uua), KM_SLEEP); - uua->uua_os = os; - uua->uua_sublist_idx = i; - uua->uua_tx = tx; - /* note: caller does taskq_wait() */ - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - userquota_updates_task, uua, 0); - /* callback frees uua */ - } -} - -/* - * Returns a pointer to data to find uid/gid from - * - * If a dirty record for transaction group that is syncing can't - * be found then NULL is returned. In the NULL case it is assumed - * the uid/gid aren't changing. 
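userquota_updates_task() above accumulates per-id byte deltas in an in-memory tree and only flushes the net delta per id into the ZAP at the end, under os_userused_lock. A simplified sketch of that coalesce-then-flush idea, using a small flat array instead of the AVL tree and printf in place of zap_increment_int():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_IDS 16	/* toy capacity; the real cache has no such bound */

struct delta_cache {
	uint64_t id[MAX_IDS];
	int64_t delta[MAX_IDS];
	int n;
};

static void
cache_update(struct delta_cache *c, uint64_t id, int64_t delta)
{
	for (int i = 0; i < c->n; i++) {
		if (c->id[i] == id) {		/* existing entry: accumulate */
			c->delta[i] += delta;
			return;
		}
	}
	if (c->n >= MAX_IDS)
		return;				/* toy bound only */
	c->id[c->n] = id;			/* new entry, like avl_insert() */
	c->delta[c->n] = delta;
	c->n++;
}

static void
cache_flush(struct delta_cache *c)
{
	for (int i = 0; i < c->n; i++)		/* one ZAP update per id */
		printf("increment(uid %ju, %+jd bytes)\n",
		    (uintmax_t)c->id[i], (intmax_t)c->delta[i]);
	c->n = 0;
}

int
main(void)
{
	struct delta_cache c;

	memset(&c, 0, sizeof (c));
	cache_update(&c, 1000, 4096);	/* new object charged to uid 1000 */
	cache_update(&c, 1000, -1024);	/* old version of the same object freed */
	cache_update(&c, 0, 512);
	cache_flush(&c);
	return (0);
}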
- */ -static void * -dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - dbuf_dirty_record_t *dr, **drp; - void *data; - - if (db->db_dirtycnt == 0) - return (db->db.db_data); /* Nothing is changing */ - - for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) - if (dr->dr_txg == tx->tx_txg) - break; - - if (dr == NULL) { - data = NULL; - } else { - dnode_t *dn; - - DB_DNODE_ENTER(dr->dr_dbuf); - dn = DB_DNODE(dr->dr_dbuf); - - if (dn->dn_bonuslen == 0 && - dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) - data = dr->dt.dl.dr_data->b_data; - else - data = dr->dt.dl.dr_data; - - DB_DNODE_EXIT(dr->dr_dbuf); - } - - return (data); -} - -void -dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) -{ - objset_t *os = dn->dn_objset; - void *data = NULL; - dmu_buf_impl_t *db = NULL; - uint64_t *user = NULL; - uint64_t *group = NULL; - int flags = dn->dn_id_flags; - int error; - boolean_t have_spill = B_FALSE; - - if (!dmu_objset_userused_enabled(dn->dn_objset)) - return; - - if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| - DN_ID_CHKED_SPILL))) - return; - - if (before && dn->dn_bonuslen != 0) - data = DN_BONUS(dn->dn_phys); - else if (!before && dn->dn_bonuslen != 0) { - if (dn->dn_bonus) { - db = dn->dn_bonus; - mutex_enter(&db->db_mtx); - data = dmu_objset_userquota_find_data(db, tx); - } else { - data = DN_BONUS(dn->dn_phys); - } - } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { - int rf = 0; - - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; - error = dmu_spill_hold_by_dnode(dn, - rf | DB_RF_MUST_SUCCEED, - FTAG, (dmu_buf_t **)&db); - ASSERT(error == 0); - mutex_enter(&db->db_mtx); - data = (before) ? db->db.db_data : - dmu_objset_userquota_find_data(db, tx); - have_spill = B_TRUE; - } else { - mutex_enter(&dn->dn_mtx); - dn->dn_id_flags |= DN_ID_CHKED_BONUS; - mutex_exit(&dn->dn_mtx); - return; - } - - if (before) { - ASSERT(data); - user = &dn->dn_olduid; - group = &dn->dn_oldgid; - } else if (data) { - user = &dn->dn_newuid; - group = &dn->dn_newgid; - } - - /* - * Must always call the callback in case the object - * type has changed and that type isn't an object type to track - */ - error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, - user, group); - - /* - * Preserve existing uid/gid when the callback can't determine - * what the new uid/gid are and the callback returned EEXIST. - * The EEXIST error tells us to just use the existing uid/gid. - * If we don't know what the old values are then just assign - * them to 0, since that is a new file being created. 
- */ - if (!before && data == NULL && error == EEXIST) { - if (flags & DN_ID_OLD_EXIST) { - dn->dn_newuid = dn->dn_olduid; - dn->dn_newgid = dn->dn_oldgid; - } else { - dn->dn_newuid = 0; - dn->dn_newgid = 0; - } - error = 0; - } - - if (db) - mutex_exit(&db->db_mtx); - - mutex_enter(&dn->dn_mtx); - if (error == 0 && before) - dn->dn_id_flags |= DN_ID_OLD_EXIST; - if (error == 0 && !before) - dn->dn_id_flags |= DN_ID_NEW_EXIST; - - if (have_spill) { - dn->dn_id_flags |= DN_ID_CHKED_SPILL; - } else { - dn->dn_id_flags |= DN_ID_CHKED_BONUS; - } - mutex_exit(&dn->dn_mtx); - if (have_spill) - dmu_buf_rele((dmu_buf_t *)db, FTAG); -} - -boolean_t -dmu_objset_userspace_present(objset_t *os) -{ - return (os->os_phys->os_flags & - OBJSET_FLAG_USERACCOUNTING_COMPLETE); -} - -int -dmu_objset_userspace_upgrade(objset_t *os) -{ - uint64_t obj; - int err = 0; - - if (dmu_objset_userspace_present(os)) - return (0); - if (!dmu_objset_userused_enabled(os)) - return (SET_ERROR(ENOTSUP)); - if (dmu_objset_is_snapshot(os)) - return (SET_ERROR(EINVAL)); - - /* - * We simply need to mark every object dirty, so that it will be - * synced out and now accounted. If this is called - * concurrently, or if we already did some work before crashing, - * that's fine, since we track each object's accounted state - * independently. - */ - - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { - dmu_tx_t *tx; - dmu_buf_t *db; - int objerr; - - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (SET_ERROR(EINTR)); - - objerr = dmu_bonus_hold(os, obj, FTAG, &db); - if (objerr != 0) - continue; - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, obj); - objerr = dmu_tx_assign(tx, TXG_WAIT); - if (objerr != 0) { - dmu_tx_abort(tx); - continue; - } - dmu_buf_will_dirty(db, tx); - dmu_buf_rele(db, FTAG); - dmu_tx_commit(tx); - } - - os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; - txg_wait_synced(dmu_objset_pool(os), 0); - return (0); -} - -void -dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp) -{ - dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, - usedobjsp, availobjsp); -} - -uint64_t -dmu_objset_fsid_guid(objset_t *os) -{ - return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); -} - -void -dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) -{ - stat->dds_type = os->os_phys->os_type; - if (os->os_dsl_dataset) - dsl_dataset_fast_stat(os->os_dsl_dataset, stat); -} - -void -dmu_objset_stats(objset_t *os, nvlist_t *nv) -{ - ASSERT(os->os_dsl_dataset || - os->os_phys->os_type == DMU_OST_META); - - if (os->os_dsl_dataset != NULL) - dsl_dataset_stats(os->os_dsl_dataset, nv); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, - os->os_phys->os_type); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, - dmu_objset_userspace_present(os)); -} - -int -dmu_objset_is_snapshot(objset_t *os) -{ - if (os->os_dsl_dataset != NULL) - return (os->os_dsl_dataset->ds_is_snapshot); - else - return (B_FALSE); -} - -int -dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, - boolean_t *conflict) -{ - dsl_dataset_t *ds = os->os_dsl_dataset; - uint64_t ignored; - - if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) - return (SET_ERROR(ENOENT)); - - return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, - MT_NORMALIZE, real, maxlen, conflict)); -} - -int -dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t 
*offp, boolean_t *case_conflict) -{ - dsl_dataset_t *ds = os->os_dsl_dataset; - zap_cursor_t cursor; - zap_attribute_t attr; - - ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); - - if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) - return (SET_ERROR(ENOENT)); - - zap_cursor_init_serialized(&cursor, - ds->ds_dir->dd_pool->dp_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); - - if (zap_cursor_retrieve(&cursor, &attr) != 0) { - zap_cursor_fini(&cursor); - return (SET_ERROR(ENOENT)); - } - - if (strlen(attr.za_name) + 1 > namelen) { - zap_cursor_fini(&cursor); - return (SET_ERROR(ENAMETOOLONG)); - } - - (void) strcpy(name, attr.za_name); - if (idp) - *idp = attr.za_first_integer; - if (case_conflict) - *case_conflict = attr.za_normalization_conflict; - zap_cursor_advance(&cursor); - *offp = zap_cursor_serialize(&cursor); - zap_cursor_fini(&cursor); - - return (0); -} - -int -dmu_dir_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp) -{ - dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; - zap_cursor_t cursor; - zap_attribute_t attr; - - /* there is no next dir on a snapshot! */ - if (os->os_dsl_dataset->ds_object != - dsl_dir_phys(dd)->dd_head_dataset_obj) - return (SET_ERROR(ENOENT)); - - zap_cursor_init_serialized(&cursor, - dd->dd_pool->dp_meta_objset, - dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); - - if (zap_cursor_retrieve(&cursor, &attr) != 0) { - zap_cursor_fini(&cursor); - return (SET_ERROR(ENOENT)); - } - - if (strlen(attr.za_name) + 1 > namelen) { - zap_cursor_fini(&cursor); - return (SET_ERROR(ENAMETOOLONG)); - } - - (void) strcpy(name, attr.za_name); - if (idp) - *idp = attr.za_first_integer; - zap_cursor_advance(&cursor); - *offp = zap_cursor_serialize(&cursor); - zap_cursor_fini(&cursor); - - return (0); -} - -typedef struct dmu_objset_find_ctx { - taskq_t *dc_tq; - dsl_pool_t *dc_dp; - uint64_t dc_ddobj; - char *dc_ddname; /* last component of ddobj's name */ - int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *); - void *dc_arg; - int dc_flags; - kmutex_t *dc_error_lock; - int *dc_error; -} dmu_objset_find_ctx_t; - -static void -dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp) -{ - dsl_pool_t *dp = dcp->dc_dp; - dsl_dir_t *dd; - dsl_dataset_t *ds; - zap_cursor_t zc; - zap_attribute_t *attr; - uint64_t thisobj; - int err = 0; - - /* don't process if there already was an error */ - if (*dcp->dc_error != 0) - goto out; - - /* - * Note: passing the name (dc_ddname) here is optional, but it - * improves performance because we don't need to call - * zap_value_search() to determine the name. - */ - err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd); - if (err != 0) - goto out; - - /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ - if (dd->dd_myname[0] == '$') { - dsl_dir_rele(dd, FTAG); - goto out; - } - - thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; - attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - - /* - * Iterate over all children. 
- */ - if (dcp->dc_flags & DS_FIND_CHILDREN) { - for (zap_cursor_init(&zc, dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); - ASSERT3U(attr->za_num_integers, ==, 1); - - dmu_objset_find_ctx_t *child_dcp = - kmem_alloc(sizeof (*child_dcp), KM_SLEEP); - *child_dcp = *dcp; - child_dcp->dc_ddobj = attr->za_first_integer; - child_dcp->dc_ddname = spa_strdup(attr->za_name); - if (dcp->dc_tq != NULL) - (void) taskq_dispatch(dcp->dc_tq, - dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP); - else - dmu_objset_find_dp_impl(child_dcp); - } - zap_cursor_fini(&zc); - } - - /* - * Iterate over all snapshots. - */ - if (dcp->dc_flags & DS_FIND_SNAPSHOTS) { - dsl_dataset_t *ds; - err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); - - if (err == 0) { - uint64_t snapobj; - - snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; - dsl_dataset_rele(ds, FTAG); - - for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); - ASSERT3U(attr->za_num_integers, ==, 1); - - err = dsl_dataset_hold_obj(dp, - attr->za_first_integer, FTAG, &ds); - if (err != 0) - break; - err = dcp->dc_func(dp, ds, dcp->dc_arg); - dsl_dataset_rele(ds, FTAG); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - } - } - - kmem_free(attr, sizeof (zap_attribute_t)); - - if (err != 0) { - dsl_dir_rele(dd, FTAG); - goto out; - } - - /* - * Apply to self. - */ - err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); - - /* - * Note: we hold the dir while calling dsl_dataset_hold_obj() so - * that the dir will remain cached, and we won't have to re-instantiate - * it (which could be expensive due to finding its name via - * zap_value_search()). - */ - dsl_dir_rele(dd, FTAG); - if (err != 0) - goto out; - err = dcp->dc_func(dp, ds, dcp->dc_arg); - dsl_dataset_rele(ds, FTAG); - -out: - if (err != 0) { - mutex_enter(dcp->dc_error_lock); - /* only keep first error */ - if (*dcp->dc_error == 0) - *dcp->dc_error = err; - mutex_exit(dcp->dc_error_lock); - } - - if (dcp->dc_ddname != NULL) - spa_strfree(dcp->dc_ddname); - kmem_free(dcp, sizeof (*dcp)); -} - -static void -dmu_objset_find_dp_cb(void *arg) -{ - dmu_objset_find_ctx_t *dcp = arg; - dsl_pool_t *dp = dcp->dc_dp; - - /* - * We need to get a pool_config_lock here, as there are several - * asssert(pool_config_held) down the stack. Getting a lock via - * dsl_pool_config_enter is risky, as it might be stalled by a - * pending writer. This would deadlock, as the write lock can - * only be granted when our parent thread gives up the lock. - * The _prio interface gives us priority over a pending writer. - */ - dsl_pool_config_enter_prio(dp, FTAG); - - dmu_objset_find_dp_impl(dcp); - - dsl_pool_config_exit(dp, FTAG); -} - -/* - * Find objsets under and including ddobj, call func(ds) on each. - * The order for the enumeration is completely undefined. - * func is called with dsl_pool_config held. 
- */ -int -dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, - int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) -{ - int error = 0; - taskq_t *tq = NULL; - int ntasks; - dmu_objset_find_ctx_t *dcp; - kmutex_t err_lock; - - mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL); - dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP); - dcp->dc_tq = NULL; - dcp->dc_dp = dp; - dcp->dc_ddobj = ddobj; - dcp->dc_ddname = NULL; - dcp->dc_func = func; - dcp->dc_arg = arg; - dcp->dc_flags = flags; - dcp->dc_error_lock = &err_lock; - dcp->dc_error = &error; - - if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) { - /* - * In case a write lock is held we can't make use of - * parallelism, as down the stack of the worker threads - * the lock is asserted via dsl_pool_config_held. - * In case of a read lock this is solved by getting a read - * lock in each worker thread, which isn't possible in case - * of a writer lock. So we fall back to the synchronous path - * here. - * In the future it might be possible to get some magic into - * dsl_pool_config_held in a way that it returns true for - * the worker threads so that a single lock held from this - * thread suffices. For now, stay single threaded. - */ - dmu_objset_find_dp_impl(dcp); - mutex_destroy(&err_lock); - - return (error); - } - - ntasks = dmu_find_threads; - if (ntasks == 0) - ntasks = vdev_count_leaves(dp->dp_spa) * 4; - tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks, - INT_MAX, 0); - if (tq == NULL) { - kmem_free(dcp, sizeof (*dcp)); - mutex_destroy(&err_lock); - - return (SET_ERROR(ENOMEM)); - } - dcp->dc_tq = tq; - - /* dcp will be freed by task */ - (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP); - - /* - * PORTING: this code relies on the property of taskq_wait to wait - * until no more tasks are queued and no more tasks are active. As - * we always queue new tasks from within other tasks, task_wait - * reliably waits for the full recursion to finish, even though we - * enqueue new tasks after taskq_wait has been called. - * On platforms other than illumos, taskq_wait may not have this - * property. - */ - taskq_wait(tq); - taskq_destroy(tq); - mutex_destroy(&err_lock); - - return (error); -} - -/* - * Find all objsets under name, and for each, call 'func(child_name, arg)'. - * The dp_config_rwlock must not be held when this is called, and it - * will not be held when the callback is called. - * Therefore this function should only be used when the pool is not changing - * (e.g. in syncing context), or the callback can deal with the possible races. - */ -static int -dmu_objset_find_impl(spa_t *spa, const char *name, - int func(const char *, void *), void *arg, int flags) -{ - dsl_dir_t *dd; - dsl_pool_t *dp = spa_get_dsl(spa); - dsl_dataset_t *ds; - zap_cursor_t zc; - zap_attribute_t *attr; - char *child; - uint64_t thisobj; - int err; - - dsl_pool_config_enter(dp, FTAG); - - err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); - if (err != 0) { - dsl_pool_config_exit(dp, FTAG); - return (err); - } - - /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ - if (dd->dd_myname[0] == '$') { - dsl_dir_rele(dd, FTAG); - dsl_pool_config_exit(dp, FTAG); - return (0); - } - - thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; - attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - - /* - * Iterate over all children. 
- */ - if (flags & DS_FIND_CHILDREN) { - for (zap_cursor_init(&zc, dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); - ASSERT3U(attr->za_num_integers, ==, 1); - - child = kmem_asprintf("%s/%s", name, attr->za_name); - dsl_pool_config_exit(dp, FTAG); - err = dmu_objset_find_impl(spa, child, - func, arg, flags); - dsl_pool_config_enter(dp, FTAG); - strfree(child); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - - if (err != 0) { - dsl_dir_rele(dd, FTAG); - dsl_pool_config_exit(dp, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); - return (err); - } - } - - /* - * Iterate over all snapshots. - */ - if (flags & DS_FIND_SNAPSHOTS) { - err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); - - if (err == 0) { - uint64_t snapobj; - - snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; - dsl_dataset_rele(ds, FTAG); - - for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); - zap_cursor_retrieve(&zc, attr) == 0; - (void) zap_cursor_advance(&zc)) { - ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); - ASSERT3U(attr->za_num_integers, ==, 1); - - child = kmem_asprintf("%s@%s", - name, attr->za_name); - dsl_pool_config_exit(dp, FTAG); - err = func(child, arg); - dsl_pool_config_enter(dp, FTAG); - strfree(child); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - } - } - - dsl_dir_rele(dd, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); - dsl_pool_config_exit(dp, FTAG); - - if (err != 0) - return (err); - - /* Apply to self. */ - return (func(name, arg)); -} - -/* - * See comment above dmu_objset_find_impl(). - */ -int -dmu_objset_find(char *name, int func(const char *, void *), void *arg, - int flags) -{ - spa_t *spa; - int error; - - error = spa_open(name, &spa, FTAG); - if (error != 0) - return (error); - error = dmu_objset_find_impl(spa, name, func, arg, flags); - spa_close(spa, FTAG); - return (error); -} - -void -dmu_objset_set_user(objset_t *os, void *user_ptr) -{ - ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); - os->os_user_ptr = user_ptr; -} - -void * -dmu_objset_get_user(objset_t *os) -{ - ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); - return (os->os_user_ptr); -} - -/* - * Determine name of filesystem, given name of snapshot. - * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes - */ -int -dmu_fsname(const char *snapname, char *buf) -{ - char *atp = strchr(snapname, '@'); - if (atp == NULL) - return (SET_ERROR(EINVAL)); - if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - (void) strlcpy(buf, snapname, atp - snapname + 1); - return (0); -} - -/* - * Call when we think we're going to write/free space in open context to track - * the amount of dirty data in the open txg, which is also the amount - * of memory that can not be evicted until this txg syncs. 
- */
-void
-dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = os->os_dsl_dataset;
-	int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
-
-	if (ds != NULL) {
-		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
-		dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
-	}
-}
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ /dev/null
@@ -1,3550 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
- * Copyright (c) 2012, Martin Matuska . All rights reserved.
- * Copyright 2014 HybridCluster. All rights reserved.
- * Copyright 2016 RackTop Systems.
- * Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2018, loli10K . All rights reserved.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#ifdef __FreeBSD__
-#include 
-#endif
-
-#ifdef __FreeBSD__
-#undef dump_write
-#define dump_write dmu_dump_write
-#endif
-
-/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
-int zfs_send_corrupt_data = B_FALSE;
-int zfs_send_queue_length = 16 * 1024 * 1024;
-int zfs_recv_queue_length = 16 * 1024 * 1024;
-/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
-int zfs_send_set_freerecords_bit = B_TRUE;
-
-#ifdef _KERNEL
-TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit);
-#endif
-
-static char *dmu_recv_tag = "dmu_recv_tag";
-const char *recv_clone_name = "%recv";
-
-/*
- * Use this to override the recordsize calculation for fast zfs send estimates.
- */ -uint64_t zfs_override_estimate_recordsize = 0; - -#define BP_SPAN(datablkszsec, indblkshift, level) \ - (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (indblkshift - SPA_BLKPTRSHIFT))) - -static void byteswap_record(dmu_replay_record_t *drr); - -struct send_thread_arg { - bqueue_t q; - dsl_dataset_t *ds; /* Dataset to traverse */ - uint64_t fromtxg; /* Traverse from this txg */ - int flags; /* flags to pass to traverse_dataset */ - int error_code; - boolean_t cancel; - zbookmark_phys_t resume; -}; - -struct send_block_record { - boolean_t eos_marker; /* Marks the end of the stream */ - blkptr_t bp; - zbookmark_phys_t zb; - uint8_t indblkshift; - uint16_t datablkszsec; - bqueue_node_t ln; -}; - -static int -dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) -{ - dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); - struct uio auio; - struct iovec aiov; - - /* - * The code does not rely on this (len being a multiple of 8). We keep - * this assertion because of the corresponding assertion in - * receive_read(). Keeping this assertion ensures that we do not - * inadvertently break backwards compatibility (causing the assertion - * in receive_read() to trigger on old software). - * - * Removing the assertions could be rolled into a new feature that uses - * data that isn't 8-byte aligned; if the assertions were removed, a - * feature flag would have to be added. - */ - - ASSERT0(len % 8); - - aiov.iov_base = buf; - aiov.iov_len = len; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_resid = len; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_offset = (off_t)-1; - auio.uio_td = dsp->dsa_td; -#ifdef _KERNEL - if (dsp->dsa_fp->f_type == DTYPE_VNODE) - bwillwrite(); - dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, - dsp->dsa_td); -#else - fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); - dsp->dsa_err = EOPNOTSUPP; -#endif - mutex_enter(&ds->ds_sendstream_lock); - *dsp->dsa_off += len; - mutex_exit(&ds->ds_sendstream_lock); - - return (dsp->dsa_err); -} - -/* - * For all record types except BEGIN, fill in the checksum (overlaid in - * drr_u.drr_checksum.drr_checksum). The checksum verifies everything - * up to the start of the checksum itself. - */ -static int -dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) -{ - ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - (void) fletcher_4_incremental_native(dsp->dsa_drr, - offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - &dsp->dsa_zc); - if (dsp->dsa_drr->drr_type == DRR_BEGIN) { - dsp->dsa_sent_begin = B_TRUE; - } else { - ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. - drr_checksum.drr_checksum)); - dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; - } - if (dsp->dsa_drr->drr_type == DRR_END) { - dsp->dsa_sent_end = B_TRUE; - } - (void) fletcher_4_incremental_native(&dsp->dsa_drr-> - drr_u.drr_checksum.drr_checksum, - sizeof (zio_cksum_t), &dsp->dsa_zc); - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) - return (SET_ERROR(EINTR)); - if (payload_len != 0) { - (void) fletcher_4_incremental_native(payload, payload_len, - &dsp->dsa_zc); - if (dump_bytes(dsp, payload, payload_len) != 0) - return (SET_ERROR(EINTR)); - } - return (0); -} - -/* - * Fill in the drr_free struct, or perform aggregation if the previous record is - * also a free record, and the two are adjacent. 
- * - * Note that we send free records even for a full send, because we want to be - * able to receive a full send as a clone, which requires a list of all the free - * and freeobject records that were generated on the source. - */ -static int -dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - uint64_t length) -{ - struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); - - /* - * When we receive a free record, dbuf_free_range() assumes - * that the receiving system doesn't have any dbufs in the range - * being freed. This is always true because there is a one-record - * constraint: we only send one WRITE record for any given - * object,offset. We know that the one-record constraint is - * true because we always send data in increasing order by - * object,offset. - * - * If the increasing-order constraint ever changes, we should find - * another way to assert that the one-record constraint is still - * satisfied. - */ - ASSERT(object > dsp->dsa_last_data_object || - (object == dsp->dsa_last_data_object && - offset > dsp->dsa_last_data_offset)); - - if (length != -1ULL && offset + length < offset) - length = -1ULL; - - /* - * If there is a pending op, but it's not PENDING_FREE, push it out, - * since free block aggregation can only be done for blocks of the - * same type (i.e., DRR_FREE records can only be aggregated with - * other DRR_FREE records. DRR_FREEOBJECTS records can only be - * aggregated with other DRR_FREEOBJECTS records. - */ - if (dsp->dsa_pending_op != PENDING_NONE && - dsp->dsa_pending_op != PENDING_FREE) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - - if (dsp->dsa_pending_op == PENDING_FREE) { - /* - * There should never be a PENDING_FREE if length is -1 - * (because dump_dnode is the only place where this - * function is called with a -1, and only after flushing - * any pending record). - */ - ASSERT(length != -1ULL); - /* - * Check to see whether this free block can be aggregated - * with pending one. - */ - if (drrf->drr_object == object && drrf->drr_offset + - drrf->drr_length == offset) { - drrf->drr_length += length; - return (0); - } else { - /* not a continuation. Push out pending record */ - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - } - /* create a FREE record and make it pending */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_FREE; - drrf->drr_object = object; - drrf->drr_offset = offset; - drrf->drr_length = length; - drrf->drr_toguid = dsp->dsa_toguid; - if (length == -1ULL) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - } else { - dsp->dsa_pending_op = PENDING_FREE; - } - - return (0); -} - -static int -dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, - uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, - void *data) -{ - uint64_t payload_size; - struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); - - /* - * We send data in increasing object, offset order. - * See comment in dump_free() for details. 
- */ - ASSERT(object > dsp->dsa_last_data_object || - (object == dsp->dsa_last_data_object && - offset > dsp->dsa_last_data_offset)); - dsp->dsa_last_data_object = object; - dsp->dsa_last_data_offset = offset + lsize - 1; - - /* - * If there is any kind of pending aggregation (currently either - * a grouping of free objects or free blocks), push it out to - * the stream, since aggregation can't be done across operations - * of different types. - */ - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - /* write a WRITE record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_WRITE; - drrw->drr_object = object; - drrw->drr_type = type; - drrw->drr_offset = offset; - drrw->drr_toguid = dsp->dsa_toguid; - drrw->drr_logical_size = lsize; - - /* only set the compression fields if the buf is compressed */ - if (lsize != psize) { - ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(!BP_SHOULD_BYTESWAP(bp)); - ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); - ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); - ASSERT3S(psize, >, 0); - ASSERT3S(lsize, >=, psize); - - drrw->drr_compressiontype = BP_GET_COMPRESS(bp); - drrw->drr_compressed_size = psize; - payload_size = drrw->drr_compressed_size; - } else { - payload_size = drrw->drr_logical_size; - } - - if (bp == NULL || BP_IS_EMBEDDED(bp)) { - /* - * There's no pre-computed checksum for partial-block - * writes or embedded BP's, so (like - * fletcher4-checkummed blocks) userland will have to - * compute a dedup-capable checksum itself. - */ - drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; - } else { - drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); - if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & - ZCHECKSUM_FLAG_DEDUP) - drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; - DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); - DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); - DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); - drrw->drr_key.ddk_cksum = bp->blk_cksum; - } - - if (dump_record(dsp, data, payload_size) != 0) - return (SET_ERROR(EINTR)); - return (0); -} - -static int -dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - int blksz, const blkptr_t *bp) -{ - char buf[BPE_PAYLOAD_SIZE]; - struct drr_write_embedded *drrw = - &(dsp->dsa_drr->drr_u.drr_write_embedded); - - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) - return (EINTR); - dsp->dsa_pending_op = PENDING_NONE; - } - - ASSERT(BP_IS_EMBEDDED(bp)); - - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; - drrw->drr_object = object; - drrw->drr_offset = offset; - drrw->drr_length = blksz; - drrw->drr_toguid = dsp->dsa_toguid; - drrw->drr_compression = BP_GET_COMPRESS(bp); - drrw->drr_etype = BPE_GET_ETYPE(bp); - drrw->drr_lsize = BPE_GET_LSIZE(bp); - drrw->drr_psize = BPE_GET_PSIZE(bp); - - decode_embedded_bp_compressed(bp, buf); - - if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) - return (EINTR); - return (0); -} - -static int -dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) -{ - struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); - - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - - /* write a SPILL record */ - bzero(dsp->dsa_drr, sizeof 
(dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_SPILL; - drrs->drr_object = object; - drrs->drr_length = blksz; - drrs->drr_toguid = dsp->dsa_toguid; - - if (dump_record(dsp, data, blksz) != 0) - return (SET_ERROR(EINTR)); - return (0); -} - -static int -dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) -{ - struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); - - /* - * If there is a pending op, but it's not PENDING_FREEOBJECTS, - * push it out, since free block aggregation can only be done for - * blocks of the same type (i.e., DRR_FREE records can only be - * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records - * can only be aggregated with other DRR_FREEOBJECTS records. - */ - if (dsp->dsa_pending_op != PENDING_NONE && - dsp->dsa_pending_op != PENDING_FREEOBJECTS) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { - /* - * See whether this free object array can be aggregated - * with pending one - */ - if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { - drrfo->drr_numobjs += numobjs; - return (0); - } else { - /* can't be aggregated. Push out pending record */ - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - } - - /* write a FREEOBJECTS record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; - drrfo->drr_firstobj = firstobj; - drrfo->drr_numobjs = numobjs; - drrfo->drr_toguid = dsp->dsa_toguid; - - dsp->dsa_pending_op = PENDING_FREEOBJECTS; - - return (0); -} - -static int -dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) -{ - struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); - - if (object < dsp->dsa_resume_object) { - /* - * Note: when resuming, we will visit all the dnodes in - * the block of dnodes that we are resuming from. In - * this case it's unnecessary to send the dnodes prior to - * the one we are resuming from. We should be at most one - * block's worth of dnodes behind the resume point. - */ - ASSERT3U(dsp->dsa_resume_object - object, <, - 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); - return (0); - } - - if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) - return (dump_freeobjects(dsp, object, 1)); - - if (dsp->dsa_pending_op != PENDING_NONE) { - if (dump_record(dsp, NULL, 0) != 0) - return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; - } - - /* write an OBJECT record */ - bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); - dsp->dsa_drr->drr_type = DRR_OBJECT; - drro->drr_object = object; - drro->drr_type = dnp->dn_type; - drro->drr_bonustype = dnp->dn_bonustype; - drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - drro->drr_bonuslen = dnp->dn_bonuslen; - drro->drr_dn_slots = dnp->dn_extra_slots + 1; - drro->drr_checksumtype = dnp->dn_checksum; - drro->drr_compress = dnp->dn_compress; - drro->drr_toguid = dsp->dsa_toguid; - - if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) - drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; - - if (dump_record(dsp, DN_BONUS(dnp), - P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { - return (SET_ERROR(EINTR)); - } - - /* Free anything past the end of the file. 
*/ - if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) - return (SET_ERROR(EINTR)); - if (dsp->dsa_err != 0) - return (SET_ERROR(EINTR)); - return (0); -} - -static boolean_t -backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) -{ - if (!BP_IS_EMBEDDED(bp)) - return (B_FALSE); - - /* - * Compression function must be legacy, or explicitly enabled. - */ - if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && - !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) - return (B_FALSE); - - /* - * Embed type must be explicitly enabled. - */ - switch (BPE_GET_ETYPE(bp)) { - case BP_EMBEDDED_TYPE_DATA: - if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) - return (B_TRUE); - break; - default: - return (B_FALSE); - } - return (B_FALSE); -} - -/* - * This is the callback function to traverse_dataset that acts as the worker - * thread for dmu_send_impl. - */ -/*ARGSUSED*/ -static int -send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) -{ - struct send_thread_arg *sta = arg; - struct send_block_record *record; - uint64_t record_size; - int err = 0; - - ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || - zb->zb_object >= sta->resume.zb_object); - - if (sta->cancel) - return (SET_ERROR(EINTR)); - - if (bp == NULL) { - ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); - return (0); - } else if (zb->zb_level < 0) { - return (0); - } - - record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); - record->eos_marker = B_FALSE; - record->bp = *bp; - record->zb = *zb; - record->indblkshift = dnp->dn_indblkshift; - record->datablkszsec = dnp->dn_datablkszsec; - record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - bqueue_enqueue(&sta->q, record, record_size); - - return (err); -} - -/* - * This function kicks off the traverse_dataset. It also handles setting the - * error code of the thread in case something goes wrong, and pushes the End of - * Stream record when the traverse_dataset call has finished. If there is no - * dataset to traverse, the thread immediately pushes End of Stream marker. - */ -static void -send_traverse_thread(void *arg) -{ - struct send_thread_arg *st_arg = arg; - int err; - struct send_block_record *data; - - if (st_arg->ds != NULL) { - err = traverse_dataset_resume(st_arg->ds, - st_arg->fromtxg, &st_arg->resume, - st_arg->flags, send_cb, st_arg); - - if (err != EINTR) - st_arg->error_code = err; - } - data = kmem_zalloc(sizeof (*data), KM_SLEEP); - data->eos_marker = B_TRUE; - bqueue_enqueue(&st_arg->q, data, 1); - thread_exit(); -} - -/* - * This function actually handles figuring out what kind of record needs to be - * dumped, reading the data (which has hopefully been prefetched), and calling - * the appropriate helper function. - */ -static int -do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) -{ - dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); - const blkptr_t *bp = &data->bp; - const zbookmark_phys_t *zb = &data->zb; - uint8_t indblkshift = data->indblkshift; - uint16_t dblkszsec = data->datablkszsec; - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; - int err = 0; - - ASSERT3U(zb->zb_level, >=, 0); - - ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || - zb->zb_object >= dsa->dsa_resume_object); - - if (zb->zb_object != DMU_META_DNODE_OBJECT && - DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { - return (0); - } else if (BP_IS_HOLE(bp) && - zb->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); - uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); - } else if (BP_IS_HOLE(bp)) { - uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); - uint64_t offset = zb->zb_blkid * span; - err = dump_free(dsa, zb->zb_object, offset, span); - } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { - return (0); - } else if (type == DMU_OT_DNODE) { - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - - ASSERT0(zb->zb_level); - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - dnode_phys_t *blk = abuf->b_data; - uint64_t dnobj = zb->zb_blkid * epb; - for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { - err = dump_dnode(dsa, dnobj + i, blk + i); - if (err != 0) - break; - } - arc_buf_destroy(abuf, &abuf); - } else if (type == DMU_OT_SA) { - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - int blksz = BP_GET_LSIZE(bp); - - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) - return (SET_ERROR(EIO)); - - err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); - arc_buf_destroy(abuf, &abuf); - } else if (backup_do_embed(dsa, bp)) { - /* it's an embedded level-0 block of a regular object */ - int blksz = dblkszsec << SPA_MINBLOCKSHIFT; - ASSERT0(zb->zb_level); - err = dump_write_embedded(dsa, zb->zb_object, - zb->zb_blkid * blksz, blksz, bp); - } else { - /* it's a level-0 block of a regular object */ - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf; - int blksz = dblkszsec << SPA_MINBLOCKSHIFT; - uint64_t offset; - - /* - * If we have large blocks stored on disk but the send flags - * don't allow us to send large blocks, we split the data from - * the arc buf into chunks. 
- */ - boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && - !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); - /* - * We should only request compressed data from the ARC if all - * the following are true: - * - stream compression was requested - * - we aren't splitting large blocks into smaller chunks - * - the data won't need to be byteswapped before sending - * - this isn't an embedded block - * - this isn't metadata (if receiving on a different endian - * system it can be byteswapped more easily) - */ - boolean_t request_compressed = - (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && - !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && - !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); - - ASSERT0(zb->zb_level); - ASSERT(zb->zb_object > dsa->dsa_resume_object || - (zb->zb_object == dsa->dsa_resume_object && - zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); - - ASSERT0(zb->zb_level); - ASSERT(zb->zb_object > dsa->dsa_resume_object || - (zb->zb_object == dsa->dsa_resume_object && - zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); - - ASSERT3U(blksz, ==, BP_GET_LSIZE(bp)); - - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; - if (request_compressed) - zioflags |= ZIO_FLAG_RAW; - if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { - if (zfs_send_corrupt_data) { - /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, - blksz); - uint64_t *ptr; - for (ptr = abuf->b_data; - (char *)ptr < (char *)abuf->b_data + blksz; - ptr++) - *ptr = 0x2f5baddb10cULL; - } else { - return (SET_ERROR(EIO)); - } - } - - offset = zb->zb_blkid * blksz; - - if (split_large_blocks) { - ASSERT3U(arc_get_compression(abuf), ==, - ZIO_COMPRESS_OFF); - char *buf = abuf->b_data; - while (blksz > 0 && err == 0) { - int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); - err = dump_write(dsa, type, zb->zb_object, - offset, n, n, NULL, buf); - offset += n; - buf += n; - blksz -= n; - } - } else { - err = dump_write(dsa, type, zb->zb_object, offset, - blksz, arc_buf_size(abuf), bp, abuf->b_data); - } - arc_buf_destroy(abuf, &abuf); - } - - ASSERT(err == 0 || err == EINTR); - return (err); -} - -/* - * Pop the new data off the queue, and free the old data. - */ -static struct send_block_record * -get_next_record(bqueue_t *bq, struct send_block_record *data) -{ - struct send_block_record *tmp = bqueue_dequeue(bq); - kmem_free(data, sizeof (*data)); - return (tmp); -} - -/* - * Actually do the bulk of the work in a zfs send. - * - * Note: Releases dp using the specified tag. 
- */ -static int -dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, - boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - int outfd, uint64_t resumeobj, uint64_t resumeoff, -#ifdef illumos - vnode_t *vp, offset_t *off) -#else - struct file *fp, offset_t *off) -#endif -{ - objset_t *os; - dmu_replay_record_t *drr; - dmu_sendarg_t *dsp; - int err; - uint64_t fromtxg = 0; - uint64_t featureflags = 0; - struct send_thread_arg to_arg = { 0 }; - - err = dmu_objset_from_ds(to_ds, &os); - if (err != 0) { - dsl_pool_rele(dp, tag); - return (err); - } - - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, - DMU_SUBSTREAM); - -#ifdef _KERNEL - if (dmu_objset_type(os) == DMU_OST_ZFS) { - uint64_t version; - if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - dsl_pool_rele(dp, tag); - return (SET_ERROR(EINVAL)); - } - if (version >= ZPL_VERSION_SA) { - featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; - } - } -#endif - - if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) - featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; - if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) - featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; - if (embedok && - spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { - featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - featureflags |= DMU_BACKUP_FEATURE_LZ4; - } - if (compressok) { - featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; - } - if ((featureflags & - (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) != - 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { - featureflags |= DMU_BACKUP_FEATURE_LZ4; - } - - if (resumeobj != 0 || resumeoff != 0) { - featureflags |= DMU_BACKUP_FEATURE_RESUMING; - } - - DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, - featureflags); - - drr->drr_u.drr_begin.drr_creation_time = - dsl_dataset_phys(to_ds)->ds_creation_time; - drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); - if (is_clone) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; - if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - if (zfs_send_set_freerecords_bit) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; - - if (ancestor_zb != NULL) { - drr->drr_u.drr_begin.drr_fromguid = - ancestor_zb->zbm_guid; - fromtxg = ancestor_zb->zbm_creation_txg; - } - dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); - if (!to_ds->ds_is_snapshot) { - (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", - sizeof (drr->drr_u.drr_begin.drr_toname)); - } - - dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); - - dsp->dsa_drr = drr; - dsp->dsa_outfd = outfd; - dsp->dsa_proc = curproc; - dsp->dsa_td = curthread; - dsp->dsa_fp = fp; - dsp->dsa_os = os; - dsp->dsa_off = off; - dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; - dsp->dsa_pending_op = PENDING_NONE; - dsp->dsa_featureflags = featureflags; - dsp->dsa_resume_object = resumeobj; - dsp->dsa_resume_offset = resumeoff; - - mutex_enter(&to_ds->ds_sendstream_lock); - list_insert_head(&to_ds->ds_sendstreams, dsp); - mutex_exit(&to_ds->ds_sendstream_lock); - - 
dsl_dataset_long_hold(to_ds, FTAG); - dsl_pool_rele(dp, tag); - - void *payload = NULL; - size_t payload_len = 0; - if (resumeobj != 0 || resumeoff != 0) { - dmu_object_info_t to_doi; - err = dmu_object_info(os, resumeobj, &to_doi); - if (err != 0) - goto out; - SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, - resumeoff / to_doi.doi_data_block_size); - - nvlist_t *nvl = fnvlist_alloc(); - fnvlist_add_uint64(nvl, "resume_object", resumeobj); - fnvlist_add_uint64(nvl, "resume_offset", resumeoff); - payload = fnvlist_pack(nvl, &payload_len); - drr->drr_payloadlen = payload_len; - fnvlist_free(nvl); - } - - err = dump_record(dsp, payload, payload_len); - fnvlist_pack_free(payload, payload_len); - if (err != 0) { - err = dsp->dsa_err; - goto out; - } - - err = bqueue_init(&to_arg.q, zfs_send_queue_length, - offsetof(struct send_block_record, ln)); - to_arg.error_code = 0; - to_arg.cancel = B_FALSE; - to_arg.ds = to_ds; - to_arg.fromtxg = fromtxg; - to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; - (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0, - TS_RUN, minclsyspri); - - struct send_block_record *to_data; - to_data = bqueue_dequeue(&to_arg.q); - - while (!to_data->eos_marker && err == 0) { - err = do_dump(dsp, to_data); - to_data = get_next_record(&to_arg.q, to_data); - if (issig(JUSTLOOKING) && issig(FORREAL)) - err = EINTR; - } - - if (err != 0) { - to_arg.cancel = B_TRUE; - while (!to_data->eos_marker) { - to_data = get_next_record(&to_arg.q, to_data); - } - } - kmem_free(to_data, sizeof (*to_data)); - - bqueue_destroy(&to_arg.q); - - if (err == 0 && to_arg.error_code != 0) - err = to_arg.error_code; - - if (err != 0) - goto out; - - if (dsp->dsa_pending_op != PENDING_NONE) - if (dump_record(dsp, NULL, 0) != 0) - err = SET_ERROR(EINTR); - - if (err != 0) { - if (err == EINTR && dsp->dsa_err != 0) - err = dsp->dsa_err; - goto out; - } - - bzero(drr, sizeof (dmu_replay_record_t)); - drr->drr_type = DRR_END; - drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; - drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; - - if (dump_record(dsp, NULL, 0) != 0) - err = dsp->dsa_err; - -out: - mutex_enter(&to_ds->ds_sendstream_lock); - list_remove(&to_ds->ds_sendstreams, dsp); - mutex_exit(&to_ds->ds_sendstream_lock); - - VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); - - kmem_free(drr, sizeof (dmu_replay_record_t)); - kmem_free(dsp, sizeof (dmu_sendarg_t)); - - dsl_dataset_long_rele(to_ds, FTAG); - - return (err); -} - -int -dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, -#ifdef illumos - int outfd, vnode_t *vp, offset_t *off) -#else - int outfd, struct file *fp, offset_t *off) -#endif -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - dsl_dataset_t *fromds = NULL; - int err; - - err = dsl_pool_hold(pool, FTAG, &dp); - if (err != 0) - return (err); - - err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - if (fromsnap != 0) { - zfs_bookmark_phys_t zb; - boolean_t is_clone; - - err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); - if (err != 0) { - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (err); - } - if (!dsl_dataset_is_before(ds, fromds, 0)) - err = SET_ERROR(EXDEV); - zb.zbm_creation_time = - dsl_dataset_phys(fromds)->ds_creation_time; - zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; - zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; - is_clone = (fromds->ds_dir != 
ds->ds_dir); - dsl_dataset_rele(fromds, FTAG); - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); - } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, outfd, 0, 0, fp, off); - } - dsl_dataset_rele(ds, FTAG); - return (err); -} - -int -dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, boolean_t compressok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, -#ifdef illumos - vnode_t *vp, offset_t *off) -#else - struct file *fp, offset_t *off) -#endif -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - boolean_t owned = B_FALSE; - - if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) - return (SET_ERROR(EINVAL)); - - err = dsl_pool_hold(tosnap, FTAG, &dp); - if (err != 0) - return (err); - - if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { - /* - * We are sending a filesystem or volume. Ensure - * that it doesn't change by owning the dataset. - */ - err = dsl_dataset_own(dp, tosnap, FTAG, &ds); - owned = B_TRUE; - } else { - err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); - } - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - if (fromsnap != NULL) { - zfs_bookmark_phys_t zb; - boolean_t is_clone = B_FALSE; - int fsnamelen = strchr(tosnap, '@') - tosnap; - - /* - * If the fromsnap is in a different filesystem, then - * mark the send stream as a clone. - */ - if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || - (fromsnap[fsnamelen] != '@' && - fromsnap[fsnamelen] != '#')) { - is_clone = B_TRUE; - } - - if (strchr(fromsnap, '@')) { - dsl_dataset_t *fromds; - err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); - if (err == 0) { - if (!dsl_dataset_is_before(ds, fromds, 0)) - err = SET_ERROR(EXDEV); - zb.zbm_creation_time = - dsl_dataset_phys(fromds)->ds_creation_time; - zb.zbm_creation_txg = - dsl_dataset_phys(fromds)->ds_creation_txg; - zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; - is_clone = (ds->ds_dir != fromds->ds_dir); - dsl_dataset_rele(fromds, FTAG); - } - } else { - err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); - } - if (err != 0) { - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (err); - } - err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, - outfd, resumeobj, resumeoff, fp, off); - } else { - err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, - outfd, resumeobj, resumeoff, fp, off); - } - if (owned) - dsl_dataset_disown(ds, FTAG); - else - dsl_dataset_rele(ds, FTAG); - return (err); -} - -static int -dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, - uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) -{ - int err = 0; - uint64_t size; - /* - * Assume that space (both on-disk and in-stream) is dominated by - * data. We will adjust for indirect blocks and the copies property, - * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). - */ - uint64_t recordsize; - uint64_t record_count; - objset_t *os; - VERIFY0(dmu_objset_from_ds(ds, &os)); - - /* Assume all (uncompressed) blocks are recordsize. 
*/ - if (zfs_override_estimate_recordsize != 0) { - recordsize = zfs_override_estimate_recordsize; - } else if (os->os_phys->os_type == DMU_OST_ZVOL) { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); - } else { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); - } - if (err != 0) - return (err); - record_count = uncompressed / recordsize; - - /* - * If we're estimating a send size for a compressed stream, use the - * compressed data size to estimate the stream size. Otherwise, use the - * uncompressed data size. - */ - size = stream_compressed ? compressed : uncompressed; - - /* - * Subtract out approximate space used by indirect blocks. - * Assume most space is used by data blocks (non-indirect, non-dnode). - * Assume no ditto blocks or internal fragmentation. - * - * Therefore, space used by indirect blocks is sizeof(blkptr_t) per - * block. - */ - size -= record_count * sizeof (blkptr_t); - - /* Add in the space for the record associated with each block. */ - size += record_count * sizeof (dmu_replay_record_t); - - *sizep = size; - - return (0); -} - -int -dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, - boolean_t stream_compressed, uint64_t *sizep) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - int err; - uint64_t uncomp, comp; - - ASSERT(dsl_pool_config_held(dp)); - - /* tosnap must be a snapshot */ - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* fromsnap, if provided, must be a snapshot */ - if (fromds != NULL && !fromds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* - * fromsnap must be an earlier snapshot from the same fs as tosnap, - * or the origin's fs. - */ - if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) - return (SET_ERROR(EXDEV)); - - /* Get compressed and uncompressed size estimates of changed data. */ - if (fromds == NULL) { - uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; - comp = dsl_dataset_phys(ds)->ds_compressed_bytes; - } else { - uint64_t used; - err = dsl_dataset_space_written(fromds, ds, - &used, &comp, &uncomp); - if (err != 0) - return (err); - } - - err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, - stream_compressed, sizep); - /* - * Add the size of the BEGIN and END records to the estimate. - */ - *sizep += 2 * sizeof (dmu_replay_record_t); - return (err); -} - -struct calculate_send_arg { - uint64_t uncompressed; - uint64_t compressed; -}; - -/* - * Simple callback used to traverse the blocks of a snapshot and sum their - * uncompressed and compressed sizes. - */ -/* ARGSUSED */ -static int -dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct calculate_send_arg *space = arg; - if (bp != NULL && !BP_IS_HOLE(bp)) { - space->uncompressed += BP_GET_UCSIZE(bp); - space->compressed += BP_GET_PSIZE(bp); - } - return (0); -} - -/* - * Given a desination snapshot and a TXG, calculate the approximate size of a - * send stream sent from that TXG. from_txg may be zero, indicating that the - * whole snapshot will be sent. 
- */ -int -dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, - boolean_t stream_compressed, uint64_t *sizep) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - int err; - struct calculate_send_arg size = { 0 }; - - ASSERT(dsl_pool_config_held(dp)); - - /* tosnap must be a snapshot */ - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* verify that from_txg is before the provided snapshot was taken */ - if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { - return (SET_ERROR(EXDEV)); - } - - /* - * traverse the blocks of the snapshot with birth times after - * from_txg, summing their uncompressed size - */ - err = traverse_dataset(ds, from_txg, TRAVERSE_POST, - dmu_calculate_send_traversal, &size); - if (err) - return (err); - - err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, - size.compressed, stream_compressed, sizep); - return (err); -} - -typedef struct dmu_recv_begin_arg { - const char *drba_origin; - dmu_recv_cookie_t *drba_cookie; - cred_t *drba_cred; - uint64_t drba_snapobj; -} dmu_recv_begin_arg_t; - -static int -recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, - uint64_t fromguid) -{ - uint64_t val; - uint64_t children; - int error; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* Temporary clone name must not exist. */ - error = zap_lookup(dp->dp_meta_objset, - dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, - 8, 1, &val); - if (error != ENOENT) - return (error == 0 ? SET_ERROR(EBUSY) : error); - - /* Resume state must not be set. */ - if (dsl_dataset_has_resume_receive_state(ds)) - return (SET_ERROR(EBUSY)); - - /* New snapshot name must not exist. */ - error = zap_lookup(dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, - drba->drba_cookie->drc_tosnap, 8, 1, &val); - if (error != ENOENT) - return (error == 0 ? SET_ERROR(EEXIST) : error); - - /* must not have children if receiving a ZVOL */ - error = zap_count(dp->dp_meta_objset, - dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children); - if (error != 0) - return (error); - if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS && - children > 0) - return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); - - /* - * Check snapshot limit before receiving. We'll recheck again at the - * end, but might as well abort before receiving if we're already over - * the limit. - * - * Note that we do not check the file system limit with - * dsl_dir_fscount_check because the temporary %clones don't count - * against that limit. - */ - error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, - NULL, drba->drba_cred); - if (error != 0) - return (error); - - if (fromguid != 0) { - dsl_dataset_t *snap; - uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - - /* Find snapshot in this dir that matches fromguid. */ - while (obj != 0) { - error = dsl_dataset_hold_obj(dp, obj, FTAG, - &snap); - if (error != 0) - return (SET_ERROR(ENODEV)); - if (snap->ds_dir != ds->ds_dir) { - dsl_dataset_rele(snap, FTAG); - return (SET_ERROR(ENODEV)); - } - if (dsl_dataset_phys(snap)->ds_guid == fromguid) - break; - obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); - } - if (obj == 0) - return (SET_ERROR(ENODEV)); - - if (drba->drba_cookie->drc_force) { - drba->drba_snapobj = obj; - } else { - /* - * If we are not forcing, there must be no - * changes since fromsnap. 
- */ - if (dsl_dataset_modified_since_snap(ds, snap)) { - dsl_dataset_rele(snap, FTAG); - return (SET_ERROR(ETXTBSY)); - } - drba->drba_snapobj = ds->ds_prev->ds_object; - } - - dsl_dataset_rele(snap, FTAG); - } else { - /* if full, then must be forced */ - if (!drba->drba_cookie->drc_force) - return (SET_ERROR(EEXIST)); - /* start from $ORIGIN@$ORIGIN, if supported */ - drba->drba_snapobj = dp->dp_origin_snap != NULL ? - dp->dp_origin_snap->ds_object : 0; - } - - return (0); - -} - -static int -dmu_recv_begin_check(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - uint64_t fromguid = drrb->drr_fromguid; - int flags = drrb->drr_flags; - int error; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - dsl_dataset_t *ds; - const char *tofs = drba->drba_cookie->drc_tofs; - - /* already checked */ - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); - - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM || - drrb->drr_type >= DMU_OST_NUMTYPES || - ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) - return (SET_ERROR(EINVAL)); - - /* Verify pool version supports SA if SA_SPILL feature set */ - if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) - return (SET_ERROR(ENOTSUP)); - - if (drba->drba_cookie->drc_resumable && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plain WRITE record, so the pool must have the - * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED - * records. Same with WRITE_EMBEDDED records that use LZ4 compression. - */ - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate large blocks - * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. - */ - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) - return (SET_ERROR(ENOTSUP)); - - error = dsl_dataset_hold(dp, tofs, FTAG, &ds); - if (error == 0) { - /* target fs already exists; recv into temp clone */ - - /* Can't recv a clone into an existing fs */ - if (flags & DRR_FLAG_CLONE || drba->drba_origin) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = recv_begin_check_existing_impl(drba, ds, fromguid); - dsl_dataset_rele(ds, FTAG); - } else if (error == ENOENT) { - /* target fs does not exist; must be a full backup or clone */ - char buf[ZFS_MAX_DATASET_NAME_LEN]; - objset_t *os; - - /* - * If it's a non-clone incremental, we are missing the - * target fs, so fail the recv. 
- */ - if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || - drba->drba_origin)) - return (SET_ERROR(ENOENT)); - - /* - * If we're receiving a full send as a clone, and it doesn't - * contain all the necessary free records and freeobject - * records, reject it. - */ - if (fromguid == 0 && drba->drba_origin && - !(flags & DRR_FLAG_FREERECORDS)) - return (SET_ERROR(EINVAL)); - - /* Open the parent of tofs */ - ASSERT3U(strlen(tofs), <, sizeof (buf)); - (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); - error = dsl_dataset_hold(dp, buf, FTAG, &ds); - if (error != 0) - return (error); - - /* - * Check filesystem and snapshot limits before receiving. We'll - * recheck snapshot limits again at the end (we create the - * filesystems and increment those counts during begin_sync). - */ - error = dsl_fs_ss_limit_check(ds->ds_dir, 1, - ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - error = dsl_fs_ss_limit_check(ds->ds_dir, 1, - ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - /* can't recv below anything but filesystems (eg. no ZVOLs) */ - error = dmu_objset_from_ds(ds, &os); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); - } - - if (drba->drba_origin != NULL) { - dsl_dataset_t *origin; - error = dsl_dataset_hold(dp, drba->drba_origin, - FTAG, &origin); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - if (!origin->ds_is_snapshot) { - dsl_dataset_rele(origin, FTAG); - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - if (dsl_dataset_phys(origin)->ds_guid != fromguid && - fromguid != 0) { - dsl_dataset_rele(origin, FTAG); - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENODEV)); - } - dsl_dataset_rele(origin, FTAG); - } - - dsl_dataset_rele(ds, FTAG); - error = 0; - } - return (error); -} - -static void -dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - const char *tofs = drba->drba_cookie->drc_tofs; - dsl_dataset_t *ds, *newds; - uint64_t dsobj; - int error; - uint64_t crflags = 0; - - if (drrb->drr_flags & DRR_FLAG_CI_DATA) - crflags |= DS_FLAG_CI_DATASET; - - error = dsl_dataset_hold(dp, tofs, FTAG, &ds); - if (error == 0) { - /* create temporary clone */ - dsl_dataset_t *snap = NULL; - if (drba->drba_snapobj != 0) { - VERIFY0(dsl_dataset_hold_obj(dp, - drba->drba_snapobj, FTAG, &snap)); - } - dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, - snap, crflags, drba->drba_cred, tx); - if (drba->drba_snapobj != 0) - dsl_dataset_rele(snap, FTAG); - dsl_dataset_rele(ds, FTAG); - } else { - dsl_dir_t *dd; - const char *tail; - dsl_dataset_t *origin = NULL; - - VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); - - if (drba->drba_origin != NULL) { - VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, - FTAG, &origin)); - } - - /* Create new dataset. 
*/ - dsobj = dsl_dataset_create_sync(dd, - strrchr(tofs, '/') + 1, - origin, crflags, drba->drba_cred, tx); - if (origin != NULL) - dsl_dataset_rele(origin, FTAG); - dsl_dir_rele(dd, FTAG); - drba->drba_cookie->drc_newfs = B_TRUE; - } - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); - - if (drba->drba_cookie->drc_resumable) { - dsl_dataset_zapify(newds, tx); - if (drrb->drr_fromguid != 0) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, - 8, 1, &drrb->drr_fromguid, tx)); - } - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, - 8, 1, &drrb->drr_toguid, tx)); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, - 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); - uint64_t one = 1; - uint64_t zero = 0; - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, - 8, 1, &one, tx)); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, - 8, 1, &zero, tx)); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, - 8, 1, &zero, tx)); - if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_LARGE_BLOCKS) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, - 8, 1, &one, tx)); - } - if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_EMBED_DATA) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, - 8, 1, &one, tx)); - } - if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_COMPRESSED) { - VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, - 8, 1, &one, tx)); - } - } - - dmu_buf_will_dirty(newds->ds_dbuf, tx); - dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; - - /* - * If we actually created a non-clone, we need to create the - * objset in our new dataset. - */ - rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); - if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { - (void) dmu_objset_create_impl(dp->dp_spa, - newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); - } - rrw_exit(&newds->ds_bp_rwlock, FTAG); - - drba->drba_cookie->drc_ds = newds; - - spa_history_log_internal_ds(newds, "receive", tx, ""); -} - -static int -dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - struct drr_begin *drrb = drba->drba_cookie->drc_drrb; - int error; - uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); - dsl_dataset_t *ds; - const char *tofs = drba->drba_cookie->drc_tofs; - - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - - /* already checked */ - ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); - - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM || - drrb->drr_type >= DMU_OST_NUMTYPES) - return (SET_ERROR(EINVAL)); - - /* Verify pool version supports SA if SA_SPILL feature set */ - if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - spa_version(dp->dp_spa) < SPA_VERSION_SA) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate a WRITE_EMBEDDED - * record to a plain WRITE record, so the pool must have the - * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED - * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 
- */ - if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - return (SET_ERROR(ENOTSUP)); - - /* - * The receiving code doesn't know how to translate large blocks - * to smaller ones, so the pool must have the LARGE_BLOCKS - * feature enabled if the stream has LARGE_BLOCKS. Same with - * large dnodes. - */ - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && - !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) - return (SET_ERROR(ENOTSUP)); - - (void) snprintf(recvname, sizeof (recvname), "%s/%s", - tofs, recv_clone_name); - - if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { - /* %recv does not exist; continue in tofs */ - error = dsl_dataset_hold(dp, tofs, FTAG, &ds); - if (error != 0) - return (error); - } - - /* check that ds is marked inconsistent */ - if (!DS_IS_INCONSISTENT(ds)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* check that there is resuming data, and that the toguid matches */ - if (!dsl_dataset_is_zapified(ds)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - uint64_t val; - error = zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); - if (error != 0 || drrb->drr_toguid != val) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* - * Check if the receive is still running. If so, it will be owned. - * Note that nothing else can own the dataset (e.g. after the receive - * fails) because it will be marked inconsistent. - */ - if (dsl_dataset_has_owner(ds)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EBUSY)); - } - - /* There should not be any snapshots of this fs yet. */ - if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* - * Note: resume point will be checked when we process the first WRITE - * record. 
- */ - - /* check that the origin matches */ - val = 0; - (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); - if (drrb->drr_fromguid != val) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) -{ - dmu_recv_begin_arg_t *drba = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - const char *tofs = drba->drba_cookie->drc_tofs; - dsl_dataset_t *ds; - uint64_t dsobj; - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - - (void) snprintf(recvname, sizeof (recvname), "%s/%s", - tofs, recv_clone_name); - - if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { - /* %recv does not exist; continue in tofs */ - VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); - drba->drba_cookie->drc_newfs = B_TRUE; - } - - /* clear the inconsistent flag so that we can own it */ - ASSERT(DS_IS_INCONSISTENT(ds)); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; - dsobj = ds->ds_object; - dsl_dataset_rele(ds, FTAG); - - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; - - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - drba->drba_cookie->drc_ds = ds; - - spa_history_log_internal_ds(ds, "resume receive", tx, ""); -} - -/* - * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() - * succeeds; otherwise we will leak the holds on the datasets. - */ -int -dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, - boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) -{ - dmu_recv_begin_arg_t drba = { 0 }; - - bzero(drc, sizeof (dmu_recv_cookie_t)); - drc->drc_drr_begin = drr_begin; - drc->drc_drrb = &drr_begin->drr_u.drr_begin; - drc->drc_tosnap = tosnap; - drc->drc_tofs = tofs; - drc->drc_force = force; - drc->drc_resumable = resumable; - drc->drc_cred = CRED(); - drc->drc_clone = (origin != NULL); - - if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { - drc->drc_byteswap = B_TRUE; - (void) fletcher_4_incremental_byteswap(drr_begin, - sizeof (dmu_replay_record_t), &drc->drc_cksum); - byteswap_record(drr_begin); - } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { - (void) fletcher_4_incremental_native(drr_begin, - sizeof (dmu_replay_record_t), &drc->drc_cksum); - } else { - return (SET_ERROR(EINVAL)); - } - - drba.drba_origin = origin; - drba.drba_cookie = drc; - drba.drba_cred = CRED(); - - if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_RESUMING) { - return (dsl_sync_task(tofs, - dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL)); - } else { - return (dsl_sync_task(tofs, - dmu_recv_begin_check, dmu_recv_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL)); - } -} - -struct receive_record_arg { - dmu_replay_record_t header; - void *payload; /* Pointer to a buffer containing the payload */ - /* - * If the record is a write, pointer to the arc_buf_t containing the - * payload. 
- */ - arc_buf_t *write_buf; - int payload_size; - uint64_t bytes_read; /* bytes read from stream when record created */ - boolean_t eos_marker; /* Marks the end of the stream */ - bqueue_node_t node; -}; - -struct receive_writer_arg { - objset_t *os; - boolean_t byteswap; - bqueue_t q; - - /* - * These three args are used to signal to the main thread that we're - * done. - */ - kmutex_t mutex; - kcondvar_t cv; - boolean_t done; - - int err; - /* A map from guid to dataset to help handle dedup'd streams. */ - avl_tree_t *guid_to_ds_map; - boolean_t resumable; - uint64_t last_object; - uint64_t last_offset; - uint64_t max_object; /* highest object ID referenced in stream */ - uint64_t bytes_read; /* bytes read when current record created */ -}; - -struct objlist { - list_t list; /* List of struct receive_objnode. */ - /* - * Last object looked up. Used to assert that objects are being looked - * up in ascending order. - */ - uint64_t last_lookup; -}; - -struct receive_objnode { - list_node_t node; - uint64_t object; -}; - -struct receive_arg { - objset_t *os; - kthread_t *td; - struct file *fp; - uint64_t voff; /* The current offset in the stream */ - uint64_t bytes_read; - /* - * A record that has had its payload read in, but hasn't yet been handed - * off to the worker thread. - */ - struct receive_record_arg *rrd; - /* A record that has had its header read in, but not its payload. */ - struct receive_record_arg *next_rrd; - zio_cksum_t cksum; - zio_cksum_t prev_cksum; - int err; - boolean_t byteswap; - /* Sorted list of objects not to issue prefetches for. */ - struct objlist ignore_objlist; -}; - -typedef struct guid_map_entry { - uint64_t guid; - dsl_dataset_t *gme_ds; - avl_node_t avlnode; -} guid_map_entry_t; - -static int -guid_compare(const void *arg1, const void *arg2) -{ - const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; - const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; - - return (AVL_CMP(gmep1->guid, gmep2->guid)); -} - -static void -free_guid_map_onexit(void *arg) -{ - avl_tree_t *ca = arg; - void *cookie = NULL; - guid_map_entry_t *gmep; - - while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { - dsl_dataset_long_rele(gmep->gme_ds, gmep); - dsl_dataset_rele(gmep->gme_ds, gmep); - kmem_free(gmep, sizeof (guid_map_entry_t)); - } - avl_destroy(ca); - kmem_free(ca, sizeof (avl_tree_t)); -} - -static int -restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid) -{ - struct uio auio; - struct iovec aiov; - int error; - - aiov.iov_base = buf; - aiov.iov_len = len; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_resid = len; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_READ; - auio.uio_offset = off; - auio.uio_td = ra->td; -#ifdef _KERNEL - error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); -#else - fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); - error = EOPNOTSUPP; -#endif - *resid = auio.uio_resid; - return (error); -} - -static int -receive_read(struct receive_arg *ra, int len, void *buf) -{ - int done = 0; - - /* - * The code doesn't rely on this (lengths being multiples of 8). See - * comment in dump_bytes. - */ - ASSERT0(len % 8); - - while (done < len) { - ssize_t resid; - - ra->err = restore_bytes(ra, buf + done, - len - done, ra->voff, &resid); - - if (resid == len - done) { - /* - * Note: ECKSUM indicates that the receive - * was interrupted and can potentially be resumed. 
- */ - ra->err = SET_ERROR(ECKSUM); - } - ra->voff += len - done - resid; - done = len - resid; - if (ra->err != 0) - return (ra->err); - } - - ra->bytes_read += len; - - ASSERT3U(done, ==, len); - return (0); -} - -noinline static void -byteswap_record(dmu_replay_record_t *drr) -{ -#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) -#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) - drr->drr_type = BSWAP_32(drr->drr_type); - drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); - - switch (drr->drr_type) { - case DRR_BEGIN: - DO64(drr_begin.drr_magic); - DO64(drr_begin.drr_versioninfo); - DO64(drr_begin.drr_creation_time); - DO32(drr_begin.drr_type); - DO32(drr_begin.drr_flags); - DO64(drr_begin.drr_toguid); - DO64(drr_begin.drr_fromguid); - break; - case DRR_OBJECT: - DO64(drr_object.drr_object); - DO32(drr_object.drr_type); - DO32(drr_object.drr_bonustype); - DO32(drr_object.drr_blksz); - DO32(drr_object.drr_bonuslen); - DO64(drr_object.drr_toguid); - break; - case DRR_FREEOBJECTS: - DO64(drr_freeobjects.drr_firstobj); - DO64(drr_freeobjects.drr_numobjs); - DO64(drr_freeobjects.drr_toguid); - break; - case DRR_WRITE: - DO64(drr_write.drr_object); - DO32(drr_write.drr_type); - DO64(drr_write.drr_offset); - DO64(drr_write.drr_logical_size); - DO64(drr_write.drr_toguid); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); - DO64(drr_write.drr_key.ddk_prop); - DO64(drr_write.drr_compressed_size); - break; - case DRR_WRITE_BYREF: - DO64(drr_write_byref.drr_object); - DO64(drr_write_byref.drr_offset); - DO64(drr_write_byref.drr_length); - DO64(drr_write_byref.drr_toguid); - DO64(drr_write_byref.drr_refguid); - DO64(drr_write_byref.drr_refobject); - DO64(drr_write_byref.drr_refoffset); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. - drr_key.ddk_cksum); - DO64(drr_write_byref.drr_key.ddk_prop); - break; - case DRR_WRITE_EMBEDDED: - DO64(drr_write_embedded.drr_object); - DO64(drr_write_embedded.drr_offset); - DO64(drr_write_embedded.drr_length); - DO64(drr_write_embedded.drr_toguid); - DO32(drr_write_embedded.drr_lsize); - DO32(drr_write_embedded.drr_psize); - break; - case DRR_FREE: - DO64(drr_free.drr_object); - DO64(drr_free.drr_offset); - DO64(drr_free.drr_length); - DO64(drr_free.drr_toguid); - break; - case DRR_SPILL: - DO64(drr_spill.drr_object); - DO64(drr_spill.drr_length); - DO64(drr_spill.drr_toguid); - break; - case DRR_END: - DO64(drr_end.drr_toguid); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); - break; - } - - if (drr->drr_type != DRR_BEGIN) { - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); - } - -#undef DO64 -#undef DO32 -} - -static inline uint8_t -deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) -{ - if (bonus_type == DMU_OT_SA) { - return (1); - } else { - return (1 + - ((DN_OLD_MAX_BONUSLEN - - MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); - } -} - -static void -save_resume_state(struct receive_writer_arg *rwa, - uint64_t object, uint64_t offset, dmu_tx_t *tx) -{ - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; - - if (!rwa->resumable) - return; - - /* - * We use ds_resume_bytes[] != 0 to indicate that we need to - * update this on disk, so it must not be 0. - */ - ASSERT(rwa->bytes_read != 0); - - /* - * We only resume from write records, which have a valid - * (non-meta-dnode) object number. - */ - ASSERT(object != 0); - - /* - * For resuming to work correctly, we must receive records in order, - * sorted by object,offset. This is checked by the callers, but - * assert it here for good measure. 
- */ - ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); - ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || - offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); - ASSERT3U(rwa->bytes_read, >=, - rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); - - rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; - rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; - rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; -} - -noinline static int -receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, - void *data) -{ - dmu_object_info_t doi; - dmu_tx_t *tx; - uint64_t object; - int err; - uint8_t dn_slots = drro->drr_dn_slots != 0 ? - drro->drr_dn_slots : DNODE_MIN_SLOTS; - - if (drro->drr_type == DMU_OT_NONE || - !DMU_OT_IS_VALID(drro->drr_type) || - !DMU_OT_IS_VALID(drro->drr_bonustype) || - drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || - drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || - P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || - drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || - drro->drr_bonuslen > - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || - dn_slots > - (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { - return (SET_ERROR(EINVAL)); - } - - err = dmu_object_info(rwa->os, drro->drr_object, &doi); - - if (err != 0 && err != ENOENT && err != EEXIST) - return (SET_ERROR(EINVAL)); - - if (drro->drr_object > rwa->max_object) - rwa->max_object = drro->drr_object; - - /* - * If we are losing blkptrs or changing the block size this must - * be a new file instance. We must clear out the previous file - * contents before we can change this type of metadata in the dnode. - */ - if (err == 0) { - int nblkptr; - - object = drro->drr_object; - - nblkptr = deduce_nblkptr(drro->drr_bonustype, - drro->drr_bonuslen); - - if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { - err = dmu_free_long_range(rwa->os, drro->drr_object, - 0, DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - } - } else if (err == EEXIST) { - /* - * The object requested is currently an interior slot of a - * multi-slot dnode. This will be resolved when the next txg - * is synced out, since the send stream will have told us - * to free this slot when we freed the associated dnode - * earlier in the stream. - */ - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = drro->drr_object; - } else { - /* object is free and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; - } - - /* - * If this is a multi-slot dnode there is a chance that this - * object will expand into a slot that is already used by - * another object from the previous snapshot. We must free - * these objects before we attempt to allocate the new dnode. 
- */ - if (dn_slots > 1) { - boolean_t need_sync = B_FALSE; - - for (uint64_t slot = drro->drr_object + 1; - slot < drro->drr_object + dn_slots; - slot++) { - dmu_object_info_t slot_doi; - - err = dmu_object_info(rwa->os, slot, &slot_doi); - if (err == ENOENT || err == EEXIST) - continue; - else if (err != 0) - return (err); - - err = dmu_free_long_object(rwa->os, slot); - - if (err != 0) - return (err); - - need_sync = B_TRUE; - } - - if (need_sync) - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - } - - tx = dmu_tx_create(rwa->os); - dmu_tx_hold_bonus(tx, object); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - if (object == DMU_NEW_OBJECT) { - /* currently free, want to be allocated */ - err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, - dn_slots << DNODE_SHIFT, tx); - } else if (drro->drr_type != doi.doi_type || - drro->drr_blksz != doi.doi_data_block_size || - drro->drr_bonustype != doi.doi_bonus_type || - drro->drr_bonuslen != doi.doi_bonus_size || - drro->drr_dn_slots != (doi.doi_dnodesize >> DNODE_SHIFT)) { - /* currently allocated, but with different properties */ - err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, - drro->drr_dn_slots << DNODE_SHIFT, tx); - } - if (err != 0) { - dmu_tx_commit(tx); - return (SET_ERROR(EINVAL)); - } - - dmu_object_set_checksum(rwa->os, drro->drr_object, - drro->drr_checksumtype, tx); - dmu_object_set_compress(rwa->os, drro->drr_object, - drro->drr_compress, tx); - - if (data != NULL) { - dmu_buf_t *db; - - VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - - ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - bcopy(data, db->db_data, drro->drr_bonuslen); - if (rwa->byteswap) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drro->drr_bonustype); - dmu_ot_byteswap[byteswap].ob_func(db->db_data, - drro->drr_bonuslen); - } - dmu_buf_rele(db, FTAG); - } - dmu_tx_commit(tx); - - return (0); -} - -/* ARGSUSED */ -noinline static int -receive_freeobjects(struct receive_writer_arg *rwa, - struct drr_freeobjects *drrfo) -{ - uint64_t obj; - int next_err = 0; - - if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) - return (SET_ERROR(EINVAL)); - - for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; - obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; - next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { - dmu_object_info_t doi; - int err; - - err = dmu_object_info(rwa->os, obj, NULL); - if (err == ENOENT) - continue; - else if (err != 0) - return (err); - - err = dmu_free_long_object(rwa->os, obj); - if (err != 0) - return (err); - - if (obj > rwa->max_object) - rwa->max_object = obj; - } - if (next_err != ESRCH) - return (next_err); - return (0); -} - -noinline static int -receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, - arc_buf_t *abuf) -{ - dmu_tx_t *tx; - int err; - - if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || - !DMU_OT_IS_VALID(drrw->drr_type)) - return (SET_ERROR(EINVAL)); - - /* - * For resuming to work, records must be in increasing order - * by (object, offset). 
- */ - if (drrw->drr_object < rwa->last_object || - (drrw->drr_object == rwa->last_object && - drrw->drr_offset < rwa->last_offset)) { - return (SET_ERROR(EINVAL)); - } - rwa->last_object = drrw->drr_object; - rwa->last_offset = drrw->drr_offset; - - if (rwa->last_object > rwa->max_object) - rwa->max_object = rwa->last_object; - - if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_logical_size); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - if (rwa->byteswap) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrw->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_WRITE_PAYLOAD_SIZE(drrw)); - } - - /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ - dmu_buf_t *bonus; - if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) - return (SET_ERROR(EINVAL)); - dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); - - /* - * Note: If the receive fails, we want the resume stream to start - * with the same record that we last successfully received (as opposed - * to the next record), so that we can verify that we are - * resuming from the correct location. - */ - save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); - dmu_tx_commit(tx); - dmu_buf_rele(bonus, FTAG); - - return (0); -} - -/* - * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed - * streams to refer to a copy of the data that is already on the - * system because it came in earlier in the stream. This function - * finds the earlier copy of the data, and uses that copy instead of - * data from the stream to fulfill this write. - */ -static int -receive_write_byref(struct receive_writer_arg *rwa, - struct drr_write_byref *drrwbr) -{ - dmu_tx_t *tx; - int err; - guid_map_entry_t gmesrch; - guid_map_entry_t *gmep; - avl_index_t where; - objset_t *ref_os = NULL; - dmu_buf_t *dbp; - - if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) - return (SET_ERROR(EINVAL)); - - /* - * If the GUID of the referenced dataset is different from the - * GUID of the target dataset, find the referenced dataset. - */ - if (drrwbr->drr_toguid != drrwbr->drr_refguid) { - gmesrch.guid = drrwbr->drr_refguid; - if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, - &where)) == NULL) { - return (SET_ERROR(EINVAL)); - } - if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) - return (SET_ERROR(EINVAL)); - } else { - ref_os = rwa->os; - } - - if (drrwbr->drr_object > rwa->max_object) - rwa->max_object = drrwbr->drr_object; - - err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, - drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); - if (err != 0) - return (err); - - tx = dmu_tx_create(rwa->os); - - dmu_tx_hold_write(tx, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - dmu_write(rwa->os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); - dmu_buf_rele(dbp, FTAG); - - /* See comment in restore_write. 
*/ - save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); - dmu_tx_commit(tx); - return (0); -} - -static int -receive_write_embedded(struct receive_writer_arg *rwa, - struct drr_write_embedded *drrwe, void *data) -{ - dmu_tx_t *tx; - int err; - - if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) - return (EINVAL); - - if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) - return (EINVAL); - - if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) - return (EINVAL); - if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) - return (EINVAL); - - if (drrwe->drr_object > rwa->max_object) - rwa->max_object = drrwe->drr_object; - - tx = dmu_tx_create(rwa->os); - - dmu_tx_hold_write(tx, drrwe->drr_object, - drrwe->drr_offset, drrwe->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - return (err); - } - - dmu_write_embedded(rwa->os, drrwe->drr_object, - drrwe->drr_offset, data, drrwe->drr_etype, - drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, - rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); - - /* See comment in restore_write. */ - save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); - dmu_tx_commit(tx); - return (0); -} - -static int -receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, - void *data) -{ - dmu_tx_t *tx; - dmu_buf_t *db, *db_spill; - int err; - - if (drrs->drr_length < SPA_MINBLOCKSIZE || - drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) - return (SET_ERROR(EINVAL)); - - if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - if (drrs->drr_object > rwa->max_object) - rwa->max_object = drrs->drr_object; - - VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); - if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { - dmu_buf_rele(db, FTAG); - return (err); - } - - tx = dmu_tx_create(rwa->os); - - dmu_tx_hold_spill(tx, db->db_object); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_buf_rele(db, FTAG); - dmu_buf_rele(db_spill, FTAG); - dmu_tx_abort(tx); - return (err); - } - dmu_buf_will_dirty(db_spill, tx); - - if (db_spill->db_size < drrs->drr_length) - VERIFY(0 == dbuf_spill_set_blksz(db_spill, - drrs->drr_length, tx)); - bcopy(data, db_spill->db_data, drrs->drr_length); - - dmu_buf_rele(db, FTAG); - dmu_buf_rele(db_spill, FTAG); - - dmu_tx_commit(tx); - return (0); -} - -/* ARGSUSED */ -noinline static int -receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) -{ - int err; - - if (drrf->drr_length != -1ULL && - drrf->drr_offset + drrf->drr_length < drrf->drr_offset) - return (SET_ERROR(EINVAL)); - - if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) - return (SET_ERROR(EINVAL)); - - if (drrf->drr_object > rwa->max_object) - rwa->max_object = drrf->drr_object; - - err = dmu_free_long_range(rwa->os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length); - - return (err); -} - -/* used to destroy the drc_ds on error */ -static void -dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) -{ - if (drc->drc_resumable) { - /* wait for our resume state to be written to disk */ - txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - } else { - char name[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(drc->drc_ds, name); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - (void) dsl_destroy_head(name); - } -} - -static void -receive_cksum(struct receive_arg *ra, int len, void *buf) -{ - if (ra->byteswap) { - (void) fletcher_4_incremental_byteswap(buf, len, 
&ra->cksum); - } else { - (void) fletcher_4_incremental_native(buf, len, &ra->cksum); - } -} - -/* - * Read the payload into a buffer of size len, and update the current record's - * payload field. - * Allocate ra->next_rrd and read the next record's header into - * ra->next_rrd->header. - * Verify checksum of payload and next record. - */ -static int -receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) -{ - int err; - - if (len != 0) { - ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); - err = receive_read(ra, len, buf); - if (err != 0) - return (err); - receive_cksum(ra, len, buf); - - /* note: rrd is NULL when reading the begin record's payload */ - if (ra->rrd != NULL) { - ra->rrd->payload = buf; - ra->rrd->payload_size = len; - ra->rrd->bytes_read = ra->bytes_read; - } - } - - ra->prev_cksum = ra->cksum; - - ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); - err = receive_read(ra, sizeof (ra->next_rrd->header), - &ra->next_rrd->header); - ra->next_rrd->bytes_read = ra->bytes_read; - if (err != 0) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; - return (err); - } - if (ra->next_rrd->header.drr_type == DRR_BEGIN) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; - return (SET_ERROR(EINVAL)); - } - - /* - * Note: checksum is of everything up to but not including the - * checksum itself. - */ - ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - receive_cksum(ra, - offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - &ra->next_rrd->header); - - zio_cksum_t cksum_orig = - ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; - zio_cksum_t *cksump = - &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; - - if (ra->byteswap) - byteswap_record(&ra->next_rrd->header); - - if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && - !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { - kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); - ra->next_rrd = NULL; - return (SET_ERROR(ECKSUM)); - } - - receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); - - return (0); -} - -static void -objlist_create(struct objlist *list) -{ - list_create(&list->list, sizeof (struct receive_objnode), - offsetof(struct receive_objnode, node)); - list->last_lookup = 0; -} - -static void -objlist_destroy(struct objlist *list) -{ - for (struct receive_objnode *n = list_remove_head(&list->list); - n != NULL; n = list_remove_head(&list->list)) { - kmem_free(n, sizeof (*n)); - } - list_destroy(&list->list); -} - -/* - * This function looks through the objlist to see if the specified object number - * is contained in the objlist. In the process, it will remove all object - * numbers in the list that are smaller than the specified object number. Thus, - * any lookup of an object number smaller than a previously looked up object - * number will always return false; therefore, all lookups should be done in - * ascending order. - */ -static boolean_t -objlist_exists(struct objlist *list, uint64_t object) -{ - struct receive_objnode *node = list_head(&list->list); - ASSERT3U(object, >=, list->last_lookup); - list->last_lookup = object; - while (node != NULL && node->object < object) { - VERIFY3P(node, ==, list_remove_head(&list->list)); - kmem_free(node, sizeof (*node)); - node = list_head(&list->list); - } - return (node != NULL && node->object == object); -} - -/* - * The objlist is a list of object numbers stored in ascending order. 
However, - * the insertion of new object numbers does not seek out the correct location to - * store a new object number; instead, it appends it to the list for simplicity. - * Thus, any users must take care to only insert new object numbers in ascending - * order. - */ -static void -objlist_insert(struct objlist *list, uint64_t object) -{ - struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); - node->object = object; -#ifdef ZFS_DEBUG - struct receive_objnode *last_object = list_tail(&list->list); - uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); - ASSERT3U(node->object, >, last_objnum); -#endif - list_insert_tail(&list->list, node); -} - -/* - * Issue the prefetch reads for any necessary indirect blocks. - * - * We use the object ignore list to tell us whether or not to issue prefetches - * for a given object. We do this for both correctness (in case the blocksize - * of an object has changed) and performance (if the object doesn't exist, don't - * needlessly try to issue prefetches). We also trim the list as we go through - * the stream to prevent it from growing to an unbounded size. - * - * The object numbers within will always be in sorted order, and any write - * records we see will also be in sorted order, but they're not sorted with - * respect to each other (i.e. we can get several object records before - * receiving each object's write records). As a result, once we've reached a - * given object number, we can safely remove any reference to lower object - * numbers in the ignore list. In practice, we receive up to 32 object records - * before receiving write records, so the list can have up to 32 nodes in it. - */ -/* ARGSUSED */ -static void -receive_read_prefetch(struct receive_arg *ra, - uint64_t object, uint64_t offset, uint64_t length) -{ - if (!objlist_exists(&ra->ignore_objlist, object)) { - dmu_prefetch(ra->os, object, 1, offset, length, - ZIO_PRIORITY_SYNC_READ); - } -} - -/* - * Read records off the stream, issuing any necessary prefetches. - */ -static int -receive_read_record(struct receive_arg *ra) -{ - int err; - - switch (ra->rrd->header.drr_type) { - case DRR_OBJECT: - { - struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; - uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); - void *buf = kmem_zalloc(size, KM_SLEEP); - dmu_object_info_t doi; - err = receive_read_payload_and_next_header(ra, size, buf); - if (err != 0) { - kmem_free(buf, size); - return (err); - } - err = dmu_object_info(ra->os, drro->drr_object, &doi); - /* - * See receive_read_prefetch for an explanation why we're - * storing this object in the ignore_obj_list. 
- */ - if (err == ENOENT || - (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { - objlist_insert(&ra->ignore_objlist, drro->drr_object); - err = 0; - } - return (err); - } - case DRR_FREEOBJECTS: - { - err = receive_read_payload_and_next_header(ra, 0, NULL); - return (err); - } - case DRR_WRITE: - { - struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; - arc_buf_t *abuf; - boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); - if (DRR_WRITE_COMPRESSED(drrw)) { - ASSERT3U(drrw->drr_compressed_size, >, 0); - ASSERT3U(drrw->drr_logical_size, >=, - drrw->drr_compressed_size); - ASSERT(!is_meta); - abuf = arc_loan_compressed_buf( - dmu_objset_spa(ra->os), - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype); - } else { - abuf = arc_loan_buf(dmu_objset_spa(ra->os), - is_meta, drrw->drr_logical_size); - } - - err = receive_read_payload_and_next_header(ra, - DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); - if (err != 0) { - dmu_return_arcbuf(abuf); - return (err); - } - ra->rrd->write_buf = abuf; - receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, - drrw->drr_logical_size); - return (err); - } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwb = - &ra->rrd->header.drr_u.drr_write_byref; - err = receive_read_payload_and_next_header(ra, 0, NULL); - receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, - drrwb->drr_length); - return (err); - } - case DRR_WRITE_EMBEDDED: - { - struct drr_write_embedded *drrwe = - &ra->rrd->header.drr_u.drr_write_embedded; - uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); - void *buf = kmem_zalloc(size, KM_SLEEP); - - err = receive_read_payload_and_next_header(ra, size, buf); - if (err != 0) { - kmem_free(buf, size); - return (err); - } - - receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, - drrwe->drr_length); - return (err); - } - case DRR_FREE: - { - /* - * It might be beneficial to prefetch indirect blocks here, but - * we don't really have the data to decide for sure. - */ - err = receive_read_payload_and_next_header(ra, 0, NULL); - return (err); - } - case DRR_END: - { - struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; - if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) - return (SET_ERROR(ECKSUM)); - return (0); - } - case DRR_SPILL: - { - struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; - void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); - err = receive_read_payload_and_next_header(ra, drrs->drr_length, - buf); - if (err != 0) - kmem_free(buf, drrs->drr_length); - return (err); - } - default: - return (SET_ERROR(EINVAL)); - } -} - -/* - * Commit the records to the pool. - */ -static int -receive_process_record(struct receive_writer_arg *rwa, - struct receive_record_arg *rrd) -{ - int err; - - /* Processing in order, therefore bytes_read should be increasing. 
*/ - ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); - rwa->bytes_read = rrd->bytes_read; - - switch (rrd->header.drr_type) { - case DRR_OBJECT: - { - struct drr_object *drro = &rrd->header.drr_u.drr_object; - err = receive_object(rwa, drro, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - return (err); - } - case DRR_FREEOBJECTS: - { - struct drr_freeobjects *drrfo = - &rrd->header.drr_u.drr_freeobjects; - return (receive_freeobjects(rwa, drrfo)); - } - case DRR_WRITE: - { - struct drr_write *drrw = &rrd->header.drr_u.drr_write; - err = receive_write(rwa, drrw, rrd->write_buf); - /* if receive_write() is successful, it consumes the arc_buf */ - if (err != 0) - dmu_return_arcbuf(rrd->write_buf); - rrd->write_buf = NULL; - rrd->payload = NULL; - return (err); - } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwbr = - &rrd->header.drr_u.drr_write_byref; - return (receive_write_byref(rwa, drrwbr)); - } - case DRR_WRITE_EMBEDDED: - { - struct drr_write_embedded *drrwe = - &rrd->header.drr_u.drr_write_embedded; - err = receive_write_embedded(rwa, drrwe, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - return (err); - } - case DRR_FREE: - { - struct drr_free *drrf = &rrd->header.drr_u.drr_free; - return (receive_free(rwa, drrf)); - } - case DRR_SPILL: - { - struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; - err = receive_spill(rwa, drrs, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - return (err); - } - default: - return (SET_ERROR(EINVAL)); - } -} - -/* - * dmu_recv_stream's worker thread; pull records off the queue, and then call - * receive_process_record When we're done, signal the main thread and exit. - */ -static void -receive_writer_thread(void *arg) -{ - struct receive_writer_arg *rwa = arg; - struct receive_record_arg *rrd; - for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; - rrd = bqueue_dequeue(&rwa->q)) { - /* - * If there's an error, the main thread will stop putting things - * on the queue, but we need to clear everything in it before we - * can exit. - */ - if (rwa->err == 0) { - rwa->err = receive_process_record(rwa, rrd); - } else if (rrd->write_buf != NULL) { - dmu_return_arcbuf(rrd->write_buf); - rrd->write_buf = NULL; - rrd->payload = NULL; - } else if (rrd->payload != NULL) { - kmem_free(rrd->payload, rrd->payload_size); - rrd->payload = NULL; - } - kmem_free(rrd, sizeof (*rrd)); - } - kmem_free(rrd, sizeof (*rrd)); - mutex_enter(&rwa->mutex); - rwa->done = B_TRUE; - cv_signal(&rwa->cv); - mutex_exit(&rwa->mutex); - thread_exit(); -} - -static int -resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) -{ - uint64_t val; - objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; - uint64_t dsobj = dmu_objset_id(ra->os); - uint64_t resume_obj, resume_off; - - if (nvlist_lookup_uint64(begin_nvl, - "resume_object", &resume_obj) != 0 || - nvlist_lookup_uint64(begin_nvl, - "resume_offset", &resume_off) != 0) { - return (SET_ERROR(EINVAL)); - } - VERIFY0(zap_lookup(mos, dsobj, - DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); - if (resume_obj != val) - return (SET_ERROR(EINVAL)); - VERIFY0(zap_lookup(mos, dsobj, - DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); - if (resume_off != val) - return (SET_ERROR(EINVAL)); - - return (0); -} - -/* - * Read in the stream's records, one by one, and apply them to the pool. 
There - * are two threads involved; the thread that calls this function will spin up a - * worker thread, read the records off the stream one by one, and issue - * prefetches for any necessary indirect blocks. It will then push the records - * onto an internal blocking queue. The worker thread will pull the records off - * the queue, and actually write the data into the DMU. This way, the worker - * thread doesn't have to wait for reads to complete, since everything it needs - * (the indirect blocks) will be prefetched. - * - * NB: callers *must* call dmu_recv_end() if this succeeds. - */ -int -dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, - int cleanup_fd, uint64_t *action_handlep) -{ - int err = 0; - struct receive_arg ra = { 0 }; - struct receive_writer_arg rwa = { 0 }; - int featureflags; - nvlist_t *begin_nvl = NULL; - - ra.byteswap = drc->drc_byteswap; - ra.cksum = drc->drc_cksum; - ra.td = curthread; - ra.fp = fp; - ra.voff = *voffp; - - if (dsl_dataset_is_zapified(drc->drc_ds)) { - (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, - drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, - sizeof (ra.bytes_read), 1, &ra.bytes_read); - } - - objlist_create(&ra.ignore_objlist); - - /* these were verified in dmu_recv_begin */ - ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, - DMU_SUBSTREAM); - ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); - - /* - * Open the objset we are modifying. - */ - VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os)); - - ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); - - featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); - - /* if this stream is dedup'ed, set up the avl tree for guid mapping */ - if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { - minor_t minor; - - if (cleanup_fd == -1) { - ra.err = SET_ERROR(EBADF); - goto out; - } - ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (ra.err != 0) { - cleanup_fd = -1; - goto out; - } - - if (*action_handlep == 0) { - rwa.guid_to_ds_map = - kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(rwa.guid_to_ds_map, guid_compare, - sizeof (guid_map_entry_t), - offsetof(guid_map_entry_t, avlnode)); - err = zfs_onexit_add_cb(minor, - free_guid_map_onexit, rwa.guid_to_ds_map, - action_handlep); - if (ra.err != 0) - goto out; - } else { - err = zfs_onexit_cb_data(minor, *action_handlep, - (void **)&rwa.guid_to_ds_map); - if (ra.err != 0) - goto out; - } - - drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; - } - - uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; - void *payload = NULL; - if (payloadlen != 0) - payload = kmem_alloc(payloadlen, KM_SLEEP); - - err = receive_read_payload_and_next_header(&ra, payloadlen, payload); - if (err != 0) { - if (payloadlen != 0) - kmem_free(payload, payloadlen); - goto out; - } - if (payloadlen != 0) { - err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); - kmem_free(payload, payloadlen); - if (err != 0) - goto out; - } - - if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { - err = resume_check(&ra, begin_nvl); - if (err != 0) - goto out; - } - - (void) bqueue_init(&rwa.q, zfs_recv_queue_length, - offsetof(struct receive_record_arg, node)); - cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL); - mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); - rwa.os = ra.os; - rwa.byteswap = drc->drc_byteswap; - rwa.resumable = drc->drc_resumable; - - (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0, - TS_RUN, minclsyspri); - /* - * We're reading rwa.err without 
locks, which is safe since we are the - * only reader, and the worker thread is the only writer. It's ok if we - * miss a write for an iteration or two of the loop, since the writer - * thread will keep freeing records we send it until we send it an eos - * marker. - * - * We can leave this loop in 3 ways: First, if rwa.err is - * non-zero. In that case, the writer thread will free the rrd we just - * pushed. Second, if we're interrupted; in that case, either it's the - * first loop and ra.rrd was never allocated, or it's later, and ra.rrd - * has been handed off to the writer thread who will free it. Finally, - * if receive_read_record fails or we're at the end of the stream, then - * we free ra.rrd and exit. - */ - while (rwa.err == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - err = SET_ERROR(EINTR); - break; - } - - ASSERT3P(ra.rrd, ==, NULL); - ra.rrd = ra.next_rrd; - ra.next_rrd = NULL; - /* Allocates and loads header into ra.next_rrd */ - err = receive_read_record(&ra); - - if (ra.rrd->header.drr_type == DRR_END || err != 0) { - kmem_free(ra.rrd, sizeof (*ra.rrd)); - ra.rrd = NULL; - break; - } - - bqueue_enqueue(&rwa.q, ra.rrd, - sizeof (struct receive_record_arg) + ra.rrd->payload_size); - ra.rrd = NULL; - } - if (ra.next_rrd == NULL) - ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); - ra.next_rrd->eos_marker = B_TRUE; - bqueue_enqueue(&rwa.q, ra.next_rrd, 1); - - mutex_enter(&rwa.mutex); - while (!rwa.done) { - cv_wait(&rwa.cv, &rwa.mutex); - } - mutex_exit(&rwa.mutex); - - /* - * If we are receiving a full stream as a clone, all object IDs which - * are greater than the maximum ID referenced in the stream are - * by definition unused and must be freed. Note that it's possible that - * we've resumed this send and the first record we received was the END - * record. In that case, max_object would be 0, but we shouldn't start - * freeing all objects from there; instead we should start from the - * resumeobj. - */ - if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { - uint64_t obj; - if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0) - obj = 0; - if (rwa.max_object > obj) - obj = rwa.max_object; - obj++; - int free_err = 0; - int next_err = 0; - - while (next_err == 0) { - free_err = dmu_free_long_object(rwa.os, obj); - if (free_err != 0 && free_err != ENOENT) - break; - - next_err = dmu_object_next(rwa.os, &obj, FALSE, 0); - } - - if (err == 0) { - if (free_err != 0 && free_err != ENOENT) - err = free_err; - else if (next_err != ESRCH) - err = next_err; - } - } - - cv_destroy(&rwa.cv); - mutex_destroy(&rwa.mutex); - bqueue_destroy(&rwa.q); - if (err == 0) - err = rwa.err; - -out: - nvlist_free(begin_nvl); - if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) - zfs_onexit_fd_rele(cleanup_fd); - - if (err != 0) { - /* - * Clean up references. If receive is not resumable, - * destroy what we created, so we don't leave it in - * the inconsistent state. - */ - dmu_recv_cleanup_ds(drc); - } - - *voffp = ra.voff; - objlist_destroy(&ra.ignore_objlist); - return (err); -} - -static int -dmu_recv_end_check(void *arg, dmu_tx_t *tx) -{ - dmu_recv_cookie_t *drc = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - int error; - - ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); - - if (!drc->drc_newfs) { - dsl_dataset_t *origin_head; - - error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); - if (error != 0) - return (error); - if (drc->drc_force) { - /* - * We will destroy any snapshots in tofs (i.e. 
before - * origin_head) that are after the origin (which is - * the snap before drc_ds, because drc_ds can not - * have any snaps of its own). - */ - uint64_t obj; - - obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; - while (obj != - dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { - dsl_dataset_t *snap; - error = dsl_dataset_hold_obj(dp, obj, FTAG, - &snap); - if (error != 0) - break; - if (snap->ds_dir != origin_head->ds_dir) - error = SET_ERROR(EINVAL); - if (error == 0) { - error = dsl_destroy_snapshot_check_impl( - snap, B_FALSE); - } - obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); - if (error != 0) - break; - } - if (error != 0) { - dsl_dataset_rele(origin_head, FTAG); - return (error); - } - } - error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, - origin_head, drc->drc_force, drc->drc_owner, tx); - if (error != 0) { - dsl_dataset_rele(origin_head, FTAG); - return (error); - } - error = dsl_dataset_snapshot_check_impl(origin_head, - drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); - dsl_dataset_rele(origin_head, FTAG); - if (error != 0) - return (error); - - error = dsl_destroy_head_check_impl(drc->drc_ds, 1); - } else { - error = dsl_dataset_snapshot_check_impl(drc->drc_ds, - drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); - } - return (error); -} - -static void -dmu_recv_end_sync(void *arg, dmu_tx_t *tx) -{ - dmu_recv_cookie_t *drc = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - - spa_history_log_internal_ds(drc->drc_ds, "finish receiving", - tx, "snap=%s", drc->drc_tosnap); - - if (!drc->drc_newfs) { - dsl_dataset_t *origin_head; - - VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, - &origin_head)); - - if (drc->drc_force) { - /* - * Destroy any snapshots of drc_tofs (origin_head) - * after the origin (the snap before drc_ds). 
- */ - uint64_t obj; - - obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; - while (obj != - dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { - dsl_dataset_t *snap; - VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, - &snap)); - ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); - obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - dsl_destroy_snapshot_sync_impl(snap, - B_FALSE, tx); - dsl_dataset_rele(snap, FTAG); - } - } - VERIFY3P(drc->drc_ds->ds_prev, ==, - origin_head->ds_prev); - - dsl_dataset_clone_swap_sync_impl(drc->drc_ds, - origin_head, tx); - dsl_dataset_snapshot_sync_impl(origin_head, - drc->drc_tosnap, tx); - - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); - dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = - drc->drc_drrb->drr_creation_time; - dsl_dataset_phys(origin_head->ds_prev)->ds_guid = - drc->drc_drrb->drr_toguid; - dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= - ~DS_FLAG_INCONSISTENT; - - dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - dsl_dataset_phys(origin_head)->ds_flags &= - ~DS_FLAG_INCONSISTENT; - - drc->drc_newsnapobj = - dsl_dataset_phys(origin_head)->ds_prev_snap_obj; - - dsl_dataset_rele(origin_head, FTAG); - dsl_destroy_head_sync_impl(drc->drc_ds, tx); - - if (drc->drc_owner != NULL) - VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); - } else { - dsl_dataset_t *ds = drc->drc_ds; - - dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); - - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - dsl_dataset_phys(ds->ds_prev)->ds_creation_time = - drc->drc_drrb->drr_creation_time; - dsl_dataset_phys(ds->ds_prev)->ds_guid = - drc->drc_drrb->drr_toguid; - dsl_dataset_phys(ds->ds_prev)->ds_flags &= - ~DS_FLAG_INCONSISTENT; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; - if (dsl_dataset_has_resume_receive_state(ds)) { - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_FROMGUID, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OBJECT, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OFFSET, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_BYTES, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TOGUID, tx); - (void) zap_remove(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TONAME, tx); - } - drc->drc_newsnapobj = - dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_create_minors(dp->dp_spa, drc->drc_tofs); -#endif - - /* - * Release the hold from dmu_recv_begin. This must be done before - * we return to open context, so that when we free the dataset's dnode, - * we can evict its bonus buffer. 
- */ - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - drc->drc_ds = NULL; -} - -static int -add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) -{ - dsl_pool_t *dp; - dsl_dataset_t *snapds; - guid_map_entry_t *gmep; - int err; - - ASSERT(guid_map != NULL); - - err = dsl_pool_hold(name, FTAG, &dp); - if (err != 0) - return (err); - gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); - err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); - if (err == 0) { - gmep->guid = dsl_dataset_phys(snapds)->ds_guid; - gmep->gme_ds = snapds; - avl_add(guid_map, gmep); - dsl_dataset_long_hold(snapds, gmep); - } else - kmem_free(gmep, sizeof (*gmep)); - - dsl_pool_rele(dp, FTAG); - return (err); -} - -static int dmu_recv_end_modified_blocks = 3; - -static int -dmu_recv_existing_end(dmu_recv_cookie_t *drc) -{ -#ifdef _KERNEL - /* - * We will be destroying the ds; make sure its origin is unmounted if - * necessary. - */ - char name[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(drc->drc_ds, name); - zfs_destroy_unmount_origin(name); -#endif - - return (dsl_sync_task(drc->drc_tofs, - dmu_recv_end_check, dmu_recv_end_sync, drc, - dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); -} - -static int -dmu_recv_new_end(dmu_recv_cookie_t *drc) -{ - return (dsl_sync_task(drc->drc_tofs, - dmu_recv_end_check, dmu_recv_end_sync, drc, - dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); -} - -int -dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) -{ - int error; - - drc->drc_owner = owner; - - if (drc->drc_newfs) - error = dmu_recv_new_end(drc); - else - error = dmu_recv_existing_end(drc); - - if (error != 0) { - dmu_recv_cleanup_ds(drc); - } else if (drc->drc_guid_to_ds_map != NULL) { - (void) add_ds_to_guidmap(drc->drc_tofs, - drc->drc_guid_to_ds_map, - drc->drc_newsnapobj); - } - return (error); -} - -/* - * Return TRUE if this objset is currently being received into. - */ -boolean_t -dmu_objset_is_receiving(objset_t *os) -{ - return (os->os_dsl_dataset != NULL && - os->os_dsl_dataset->ds_owner == dmu_recv_tag); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ /dev/null @@ -1,712 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2015 Chunwei Chen. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ -boolean_t send_holes_without_birth_time = B_TRUE; - -#ifdef _KERNEL -SYSCTL_DECL(_vfs_zfs); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN, - &send_holes_without_birth_time, 0, "Send holes without birth time"); -#endif - -typedef struct prefetch_data { - kmutex_t pd_mtx; - kcondvar_t pd_cv; - int32_t pd_bytes_fetched; - int pd_flags; - boolean_t pd_cancel; - boolean_t pd_exited; - zbookmark_phys_t pd_resume; -} prefetch_data_t; - -typedef struct traverse_data { - spa_t *td_spa; - uint64_t td_objset; - blkptr_t *td_rootbp; - uint64_t td_min_txg; - zbookmark_phys_t *td_resume; - int td_flags; - prefetch_data_t *td_pfd; - boolean_t td_paused; - uint64_t td_hole_birth_enabled_txg; - blkptr_cb_t *td_func; - void *td_arg; - boolean_t td_realloc_possible; -} traverse_data_t; - -static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - uint64_t objset, uint64_t object); -static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, - uint64_t objset, uint64_t object); - -static int -traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - traverse_data_t *td = arg; - zbookmark_phys_t zb; - - if (BP_IS_HOLE(bp)) - return (0); - - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) - return (-1); - - SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - - (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); - - return (0); -} - -static int -traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - traverse_data_t *td = arg; - - if (lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - zbookmark_phys_t zb; - - if (BP_IS_HOLE(bp)) - return (0); - - if (claim_txg == 0 || bp->blk_birth < claim_txg) - return (0); - - SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, - ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - - (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, - td->td_arg); - } - return (0); -} - -static void -traverse_zil(traverse_data_t *td, zil_header_t *zh) -{ - uint64_t claim_txg = zh->zh_claim_txg; - - /* - * We only want to visit blocks that have been claimed but not yet - * replayed; plus blocks that are already stable in read-only mode. - */ - if (claim_txg == 0 && spa_writeable(td->td_spa)) - return; - - zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); - (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, - claim_txg); - zil_free(zilog); -} - -typedef enum resume_skip { - RESUME_SKIP_ALL, - RESUME_SKIP_NONE, - RESUME_SKIP_CHILDREN -} resume_skip_t; - -/* - * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and - * the block indicated by zb does not need to be visited at all. Returns - * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the - * resume point. This indicates that this block should be visited but not its - * children (since they must have been visited in a previous traversal). - * Otherwise returns RESUME_SKIP_NONE. 
- */ -static resume_skip_t -resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, - const zbookmark_phys_t *zb) -{ - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { - /* - * If we already visited this bp & everything below, - * don't bother doing it again. - */ - if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) - return (RESUME_SKIP_ALL); - - /* - * If we found the block we're trying to resume from, zero - * the bookmark out to indicate that we have resumed. - */ - if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { - bzero(td->td_resume, sizeof (*zb)); - if (td->td_flags & TRAVERSE_POST) - return (RESUME_SKIP_CHILDREN); - } - } - return (RESUME_SKIP_NONE); -} - -static void -traverse_prefetch_metadata(traverse_data_t *td, - const blkptr_t *bp, const zbookmark_phys_t *zb) -{ - arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - - if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) - return; - /* - * If we are in the process of resuming, don't prefetch, because - * some children will not be needed (and in fact may have already - * been freed). - */ - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) - return; - if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) - return; - if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) - return; - - (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); -} - -static boolean_t -prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) -{ - ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || - BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) - return (B_FALSE); - return (B_TRUE); -} - -static int -traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - const blkptr_t *bp, const zbookmark_phys_t *zb) -{ - zbookmark_phys_t czb; - int err = 0; - arc_buf_t *buf = NULL; - prefetch_data_t *pd = td->td_pfd; - boolean_t hard = td->td_flags & TRAVERSE_HARD; - - switch (resume_skip_check(td, dnp, zb)) { - case RESUME_SKIP_ALL: - return (0); - case RESUME_SKIP_CHILDREN: - goto post; - case RESUME_SKIP_NONE: - break; - default: - ASSERT(0); - } - - if (bp->blk_birth == 0) { - /* - * Since this block has a birth time of 0 it must be one of - * two things: a hole created before the - * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole - * which has always been a hole in an object. - * - * If a file is written sparsely, then the unwritten parts of - * the file were "always holes" -- that is, they have been - * holes since this object was allocated. However, we (and - * our callers) can not necessarily tell when an object was - * allocated. Therefore, if it's possible that this object - * was freed and then its object number reused, we need to - * visit all the holes with birth==0. - * - * If it isn't possible that the object number was reused, - * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote - * all the blocks we will visit as part of this traversal, - * then this hole must have always existed, so we can skip - * it. We visit blocks born after (exclusive) td_min_txg. - * - * Note that the meta-dnode cannot be reallocated. 
- */ - if (!send_holes_without_birth_time && - (!td->td_realloc_possible || - zb->zb_object == DMU_META_DNODE_OBJECT) && - td->td_hole_birth_enabled_txg <= td->td_min_txg) - return (0); - } else if (bp->blk_birth <= td->td_min_txg) { - return (0); - } - - if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { - uint64_t size = BP_GET_LSIZE(bp); - mutex_enter(&pd->pd_mtx); - ASSERT(pd->pd_bytes_fetched >= 0); - while (pd->pd_bytes_fetched < size && !pd->pd_exited) - cv_wait(&pd->pd_cv, &pd->pd_mtx); - pd->pd_bytes_fetched -= size; - cv_broadcast(&pd->pd_cv); - mutex_exit(&pd->pd_mtx); - } - - if (BP_IS_HOLE(bp)) { - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - if (err != 0) - goto post; - return (0); - } - - if (td->td_flags & TRAVERSE_PRE) { - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, - td->td_arg); - if (err == TRAVERSE_VISIT_NO_CHILDREN) - return (0); - if (err != 0) - goto post; - } - - if (BP_GET_LEVEL(bp) > 0) { - arc_flags_t flags = ARC_FLAG_WAIT; - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - - err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err != 0) - goto post; - cbp = buf->b_data; - - for (i = 0; i < epb; i++) { - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - traverse_prefetch_metadata(td, &cbp[i], &czb); - } - - /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i++) { - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - err = traverse_visitbp(td, dnp, &cbp[i], &czb); - if (err != 0) - break; - } - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - arc_flags_t flags = ARC_FLAG_WAIT; - int i; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - - err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err != 0) - goto post; - dnode_phys_t *child_dnp = buf->b_data; - - for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { - prefetch_dnode_metadata(td, &child_dnp[i], - zb->zb_objset, zb->zb_blkid * epb + i); - } - - /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { - err = traverse_dnode(td, &child_dnp[i], - zb->zb_objset, zb->zb_blkid * epb + i); - if (err != 0) - break; - } - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - arc_flags_t flags = ARC_FLAG_WAIT; - - err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err != 0) - goto post; - - objset_phys_t *osp = buf->b_data; - prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, - DMU_META_DNODE_OBJECT); - /* - * See the block comment above for the goal of this variable. - * If the maxblkid of the meta-dnode is 0, then we know that - * we've never had more than DNODES_PER_BLOCK objects in the - * dataset, which means we can't have reused any object ids. 
- */ - if (osp->os_meta_dnode.dn_maxblkid == 0) - td->td_realloc_possible = B_FALSE; - - if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - prefetch_dnode_metadata(td, &osp->os_groupused_dnode, - zb->zb_objset, DMU_GROUPUSED_OBJECT); - prefetch_dnode_metadata(td, &osp->os_userused_dnode, - zb->zb_objset, DMU_USERUSED_OBJECT); - } - - err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, - DMU_META_DNODE_OBJECT); - if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, &osp->os_groupused_dnode, - zb->zb_objset, DMU_GROUPUSED_OBJECT); - } - if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, &osp->os_userused_dnode, - zb->zb_objset, DMU_USERUSED_OBJECT); - } - } - - if (buf) - arc_buf_destroy(buf, &buf); - -post: - if (err == 0 && (td->td_flags & TRAVERSE_POST)) - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - - if (hard && (err == EIO || err == ECKSUM)) { - /* - * Ignore this disk error as requested by the HARD flag, - * and continue traversal. - */ - err = 0; - } - - /* - * If we are stopping here, set td_resume. - */ - if (td->td_resume != NULL && err != 0 && !td->td_paused) { - td->td_resume->zb_objset = zb->zb_objset; - td->td_resume->zb_object = zb->zb_object; - td->td_resume->zb_level = 0; - /* - * If we have stopped on an indirect block (e.g. due to - * i/o error), we have not visited anything below it. - * Set the bookmark to the first level-0 block that we need - * to visit. This way, the resuming code does not need to - * deal with resuming from indirect blocks. - * - * Note, if zb_level <= 0, dnp may be NULL, so we don't want - * to dereference it. - */ - td->td_resume->zb_blkid = zb->zb_blkid; - if (zb->zb_level > 0) { - td->td_resume->zb_blkid <<= zb->zb_level * - (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); - } - td->td_paused = B_TRUE; - } - - return (err); -} - -static void -prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, - uint64_t objset, uint64_t object) -{ - int j; - zbookmark_phys_t czb; - - for (j = 0; j < dnp->dn_nblkptr; j++) { - SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); - } - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb); - } -} - -static int -traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - uint64_t objset, uint64_t object) -{ - int j, err = 0; - zbookmark_phys_t czb; - - if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && - object < td->td_resume->zb_object) - return (0); - - if (td->td_flags & TRAVERSE_PRE) { - SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, - ZB_DNODE_BLKID); - err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, - td->td_arg); - if (err == TRAVERSE_VISIT_NO_CHILDREN) - return (0); - if (err != 0) - return (err); - } - - for (j = 0; j < dnp->dn_nblkptr; j++) { - SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); - if (err != 0) - break; - } - - if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { - SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); - } - - if (err == 0 && (td->td_flags & TRAVERSE_POST)) { - SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, - ZB_DNODE_BLKID); - err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, - td->td_arg); - if (err == 
TRAVERSE_VISIT_NO_CHILDREN) - return (0); - if (err != 0) - return (err); - } - return (err); -} - -/* ARGSUSED */ -static int -traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - prefetch_data_t *pfd = arg; - arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH; - - ASSERT(pfd->pd_bytes_fetched >= 0); - if (bp == NULL) - return (0); - if (pfd->pd_cancel) - return (SET_ERROR(EINTR)); - - if (!prefetch_needed(pfd, bp)) - return (0); - - mutex_enter(&pfd->pd_mtx); - while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max) - cv_wait(&pfd->pd_cv, &pfd->pd_mtx); - pfd->pd_bytes_fetched += BP_GET_LSIZE(bp); - cv_broadcast(&pfd->pd_cv); - mutex_exit(&pfd->pd_mtx); - - (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); - - return (0); -} - -static void -traverse_prefetch_thread(void *arg) -{ - traverse_data_t *td_main = arg; - traverse_data_t td = *td_main; - zbookmark_phys_t czb; - - td.td_func = traverse_prefetcher; - td.td_arg = td_main->td_pfd; - td.td_pfd = NULL; - td.td_resume = &td_main->td_pfd->pd_resume; - - SET_BOOKMARK(&czb, td.td_objset, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb); - - mutex_enter(&td_main->td_pfd->pd_mtx); - td_main->td_pfd->pd_exited = B_TRUE; - cv_broadcast(&td_main->td_pfd->pd_cv); - mutex_exit(&td_main->td_pfd->pd_mtx); -} - -/* - * NB: dataset must not be changing on-disk (eg, is a snapshot or we are - * in syncing context). - */ -static int -traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - uint64_t txg_start, zbookmark_phys_t *resume, int flags, - blkptr_cb_t func, void *arg) -{ - traverse_data_t td; - prefetch_data_t pd = { 0 }; - zbookmark_phys_t czb; - int err; - - ASSERT(ds == NULL || objset == ds->ds_object); - ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); - - td.td_spa = spa; - td.td_objset = objset; - td.td_rootbp = rootbp; - td.td_min_txg = txg_start; - td.td_resume = resume; - td.td_func = func; - td.td_arg = arg; - td.td_pfd = &pd; - td.td_flags = flags; - td.td_paused = B_FALSE; - td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE); - - if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { - VERIFY(spa_feature_enabled_txg(spa, - SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg)); - } else { - td.td_hole_birth_enabled_txg = UINT64_MAX; - } - - pd.pd_flags = flags; - if (resume != NULL) - pd.pd_resume = *resume; - mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); - - /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ - if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { - arc_flags_t flags = ARC_FLAG_WAIT; - objset_phys_t *osp; - arc_buf_t *buf; - - err = arc_read(NULL, td.td_spa, rootbp, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); - if (err != 0) - return (err); - - osp = buf->b_data; - traverse_zil(&td, &osp->os_zil_header); - arc_buf_destroy(buf, &buf); - } - - if (!(flags & TRAVERSE_PREFETCH_DATA) || - 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, - &td, TQ_NOQUEUE)) - pd.pd_exited = B_TRUE; - - SET_BOOKMARK(&czb, td.td_objset, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - err = traverse_visitbp(&td, NULL, rootbp, &czb); - - mutex_enter(&pd.pd_mtx); - pd.pd_cancel = B_TRUE; - cv_broadcast(&pd.pd_cv); - while (!pd.pd_exited) - cv_wait(&pd.pd_cv, &pd.pd_mtx); - mutex_exit(&pd.pd_mtx); - - mutex_destroy(&pd.pd_mtx); - cv_destroy(&pd.pd_cv); - - return (err); -} - -/* - * NB: dataset must not be changing on-disk (eg, is a snapshot or we are - * in syncing context). - */ -int -traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, - zbookmark_phys_t *resume, - int flags, blkptr_cb_t func, void *arg) -{ - return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, - &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); -} - -int -traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, - int flags, blkptr_cb_t func, void *arg) -{ - return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); -} - -int -traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, - uint64_t txg_start, zbookmark_phys_t *resume, int flags, - blkptr_cb_t func, void *arg) -{ - return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, - blkptr, txg_start, resume, flags, func, arg)); -} - -/* - * NB: pool must not be changing on-disk (eg, from zdb or sync context). - */ -int -traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - blkptr_cb_t func, void *arg) -{ - int err; - dsl_pool_t *dp = spa_get_dsl(spa); - objset_t *mos = dp->dp_meta_objset; - boolean_t hard = (flags & TRAVERSE_HARD); - - /* visit the MOS */ - err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), - txg_start, NULL, flags, func, arg); - if (err != 0) - return (err); - - /* visit each dataset */ - for (uint64_t obj = 1; err == 0; - err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { - dmu_object_info_t doi; - - err = dmu_object_info(mos, obj, &doi); - if (err != 0) { - if (hard) - continue; - break; - } - - if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { - dsl_dataset_t *ds; - uint64_t txg = txg_start; - - dsl_pool_config_enter(dp, FTAG); - err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); - dsl_pool_config_exit(dp, FTAG); - if (err != 0) { - if (hard) - continue; - break; - } - if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg) - txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - err = traverse_dataset(ds, txg, flags, func, arg); - dsl_dataset_rele(ds, FTAG); - if (err != 0) - break; - } - } - if (err == ESRCH) - err = 0; - return (err); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ /dev/null @@ -1,1345 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, - uint64_t arg1, uint64_t arg2); - - -dmu_tx_t * -dmu_tx_create_dd(dsl_dir_t *dd) -{ - dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); - tx->tx_dir = dd; - if (dd != NULL) - tx->tx_pool = dd->dd_pool; - list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), - offsetof(dmu_tx_hold_t, txh_node)); - list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), - offsetof(dmu_tx_callback_t, dcb_node)); - tx->tx_start = gethrtime(); - return (tx); -} - -dmu_tx_t * -dmu_tx_create(objset_t *os) -{ - dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); - tx->tx_objset = os; - return (tx); -} - -dmu_tx_t * -dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) -{ - dmu_tx_t *tx = dmu_tx_create_dd(NULL); - - txg_verify(dp->dp_spa, txg); - tx->tx_pool = dp; - tx->tx_txg = txg; - tx->tx_anyobj = TRUE; - - return (tx); -} - -int -dmu_tx_is_syncing(dmu_tx_t *tx) -{ - return (tx->tx_anyobj); -} - -int -dmu_tx_private_ok(dmu_tx_t *tx) -{ - return (tx->tx_anyobj); -} - -static dmu_tx_hold_t * -dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, - uint64_t arg1, uint64_t arg2) -{ - dmu_tx_hold_t *txh; - - if (dn != NULL) { - (void) zfs_refcount_add(&dn->dn_holds, tx); - if (tx->tx_txg != 0) { - mutex_enter(&dn->dn_mtx); - /* - * dn->dn_assigned_txg == tx->tx_txg doesn't pose a - * problem, but there's no way for it to happen (for - * now, at least). 
- */ - ASSERT(dn->dn_assigned_txg == 0); - dn->dn_assigned_txg = tx->tx_txg; - (void) zfs_refcount_add(&dn->dn_tx_holds, tx); - mutex_exit(&dn->dn_mtx); - } - } - - txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); - txh->txh_tx = tx; - txh->txh_dnode = dn; - zfs_refcount_create(&txh->txh_space_towrite); - zfs_refcount_create(&txh->txh_memory_tohold); - txh->txh_type = type; - txh->txh_arg1 = arg1; - txh->txh_arg2 = arg2; - list_insert_tail(&tx->tx_holds, txh); - - return (txh); -} - -static dmu_tx_hold_t * -dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, - enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) -{ - dnode_t *dn = NULL; - dmu_tx_hold_t *txh; - int err; - - if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) { - tx->tx_err = err; - return (NULL); - } - } - txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); - if (dn != NULL) - dnode_rele(dn, FTAG); - return (txh); -} - -void -dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) -{ - /* - * If we're syncing, they can manipulate any object anyhow, and - * the hold on the dnode_t can cause problems. - */ - if (!dmu_tx_is_syncing(tx)) - (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); -} - -/* - * This function reads specified data from disk. The specified data will - * be needed to perform the transaction -- i.e, it will be read after - * we do dmu_tx_assign(). There are two reasons that we read the data now - * (before dmu_tx_assign()): - * - * 1. Reading it now has potentially better performance. The transaction - * has not yet been assigned, so the TXG is not held open, and also the - * caller typically has less locks held when calling dmu_tx_hold_*() than - * after the transaction has been assigned. This reduces the lock (and txg) - * hold times, thus reducing lock contention. - * - * 2. It is easier for callers (primarily the ZPL) to handle i/o errors - * that are detected before they start making changes to the DMU state - * (i.e. now). Once the transaction has been assigned, and some DMU - * state has been changed, it can be difficult to recover from an i/o - * error (e.g. to undo the changes already made in memory at the DMU - * layer). Typically code to do so does not exist in the caller -- it - * assumes that the data has already been cached and thus i/o errors are - * not possible. - * - * It has been observed that the i/o initiated here can be a performance - * problem, and it appears to be optional, because we don't look at the - * data which is read. However, removing this read would only serve to - * move the work elsewhere (after the dmu_tx_assign()), where it may - * have a greater impact on performance (in addition to the impact on - * fault tolerance noted above). 
- */ -static int -dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) -{ - int err; - dmu_buf_impl_t *db; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, level, blkid, FTAG); - rw_exit(&dn->dn_struct_rwlock); - if (db == NULL) - return (SET_ERROR(EIO)); - err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); - dbuf_rele(db, FTAG); - return (err); -} - -/* ARGSUSED */ -static void -dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) -{ - dnode_t *dn = txh->txh_dnode; - int err = 0; - - if (len == 0) - return; - - (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); - - if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) - err = SET_ERROR(EFBIG); - - if (dn == NULL) - return; - - /* - * For i/o error checking, read the blocks that will be needed - * to perform the write: the first and last level-0 blocks (if - * they are not aligned, i.e. if they are partial-block writes), - * and all the level-1 blocks. - */ - if (dn->dn_maxblkid == 0) { - if (off < dn->dn_datablksz && - (off > 0 || len < dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } - } else { - zio_t *zio = zio_root(dn->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - - /* first level-0 block */ - uint64_t start = off >> dn->dn_datablkshift; - if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { - err = dmu_tx_check_ioerr(zio, dn, 0, start); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } - - /* last level-0 block */ - uint64_t end = (off + len - 1) >> dn->dn_datablkshift; - if (end != start && end <= dn->dn_maxblkid && - P2PHASE(off + len, dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(zio, dn, 0, end); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } - - /* level-1 blocks */ - if (dn->dn_nlevels > 1) { - int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (uint64_t i = (start >> shft) + 1; - i < end >> shft; i++) { - err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } - } - - err = zio_wait(zio); - if (err != 0) { - txh->txh_tx->tx_err = err; - } - } -} - -static void -dmu_tx_count_dnode(dmu_tx_hold_t *txh) -{ - (void) zfs_refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, - FTAG); -} - -void -dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - ASSERT3U(len, <=, DMU_MAX_ACCESS); - ASSERT(len == 0 || UINT64_MAX - off >= len - 1); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_WRITE, off, len); - if (txh != NULL) { - dmu_tx_count_write(txh, off, len); - dmu_tx_count_dnode(txh); - } -} - -void -dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - - ASSERT(tx->tx_txg == 0); - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_WRITE, 0, 0); - if (txh == NULL) - return; - - dnode_t *dn = txh->txh_dnode; - (void) zfs_refcount_add_many(&txh->txh_space_towrite, - 1ULL << dn->dn_indblkshift, FTAG); - dmu_tx_count_dnode(txh); -} - -void -dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - ASSERT3U(len, <=, DMU_MAX_ACCESS); - ASSERT(len == 0 || UINT64_MAX - off >= len - 1); - - txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); - if (txh != NULL) { - dmu_tx_count_write(txh, off, len); - dmu_tx_count_dnode(txh); - } -} - -/* - * This function marks the transaction as being a 
"net free". The end - * result is that refquotas will be disabled for this transaction, and - * this transaction will be able to use half of the pool space overhead - * (see dsl_pool_adjustedsize()). Therefore this function should only - * be called for transactions that we expect will not cause a net increase - * in the amount of space used (but it's OK if that is occasionally not true). - */ -void -dmu_tx_mark_netfree(dmu_tx_t *tx) -{ - tx->tx_netfree = B_TRUE; -} - -static void -dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) -{ - dmu_tx_t *tx; - dnode_t *dn; - int err; - - tx = txh->txh_tx; - ASSERT(tx->tx_txg == 0); - - dn = txh->txh_dnode; - dmu_tx_count_dnode(txh); - - if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) - return; - if (len == DMU_OBJECT_END) - len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; - - - /* - * For i/o error checking, we read the first and last level-0 - * blocks if they are not aligned, and all the level-1 blocks. - * - * Note: dbuf_free_range() assumes that we have not instantiated - * any level-0 dbufs that will be completely freed. Therefore we must - * exercise care to not read or count the first and last blocks - * if they are blocksize-aligned. - */ - if (dn->dn_datablkshift == 0) { - if (off != 0 || len < dn->dn_datablksz) - dmu_tx_count_write(txh, 0, dn->dn_datablksz); - } else { - /* first block will be modified if it is not aligned */ - if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) - dmu_tx_count_write(txh, off, 1); - /* last block will be modified if it is not aligned */ - if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) - dmu_tx_count_write(txh, off + len, 1); - } - - /* - * Check level-1 blocks. - */ - if (dn->dn_nlevels > 1) { - int shift = dn->dn_datablkshift + dn->dn_indblkshift - - SPA_BLKPTRSHIFT; - uint64_t start = off >> shift; - uint64_t end = (off + len) >> shift; - - ASSERT(dn->dn_indblkshift != 0); - - /* - * dnode_reallocate() can result in an object with indirect - * blocks having an odd data block size. In this case, - * just check the single block. 
- */ - if (dn->dn_datablkshift == 0) - start = end = 0; - - zio_t *zio = zio_root(tx->tx_pool->dp_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - for (uint64_t i = start; i <= end; i++) { - uint64_t ibyte = i << shift; - err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); - i = ibyte >> shift; - if (err == ESRCH || i > end) - break; - if (err != 0) { - tx->tx_err = err; - (void) zio_wait(zio); - return; - } - - (void) zfs_refcount_add_many(&txh->txh_memory_tohold, - 1 << dn->dn_indblkshift, FTAG); - - err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err != 0) { - tx->tx_err = err; - (void) zio_wait(zio); - return; - } - } - err = zio_wait(zio); - if (err != 0) { - tx->tx_err = err; - return; - } - } -} - -void -dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) -{ - dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_FREE, off, len); - if (txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); -} - -void -dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) -{ - dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); - if (txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); -} - -static void -dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) -{ - dmu_tx_t *tx = txh->txh_tx; - dnode_t *dn; - int err; - - ASSERT(tx->tx_txg == 0); - - dn = txh->txh_dnode; - - dmu_tx_count_dnode(txh); - - /* - * Modifying a almost-full microzap is around the worst case (128KB) - * - * If it is a fat zap, the worst case would be 7*16KB=112KB: - * - 3 blocks overwritten: target leaf, ptrtbl block, header block - * - 4 new blocks written if adding: - * - 2 blocks for possibly split leaves, - * - 2 grown ptrtbl blocks - */ - (void) zfs_refcount_add_many(&txh->txh_space_towrite, - MZAP_MAX_BLKSZ, FTAG); - - if (dn == NULL) - return; - - ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); - - if (dn->dn_maxblkid == 0 || name == NULL) { - /* - * This is a microzap (only one block), or we don't know - * the name. Check the first block for i/o errors. - */ - err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err != 0) { - tx->tx_err = err; - } - } else { - /* - * Access the name so that we'll check for i/o errors to - * the leaf blocks, etc. We ignore ENOENT, as this name - * may not yet exist. 
- */ - err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); - if (err == EIO || err == ECKSUM || err == ENXIO) { - tx->tx_err = err; - } - } -} - -void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_ZAP, add, (uintptr_t)name); - if (txh != NULL) - dmu_tx_hold_zap_impl(txh, name); -} - -void -dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - ASSERT(dn != NULL); - - txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); - if (txh != NULL) - dmu_tx_hold_zap_impl(txh, name); -} - -void -dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - - ASSERT(tx->tx_txg == 0); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_BONUS, 0, 0); - if (txh) - dmu_tx_count_dnode(txh); -} - -void -dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) -{ - dmu_tx_hold_t *txh; - - ASSERT0(tx->tx_txg); - - txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); - if (txh) - dmu_tx_count_dnode(txh); -} - -void -dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) -{ - dmu_tx_hold_t *txh; - ASSERT(tx->tx_txg == 0); - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - DMU_NEW_OBJECT, THT_SPACE, space, 0); - - (void) zfs_refcount_add_many(&txh->txh_space_towrite, space, FTAG); -} - -#ifdef ZFS_DEBUG -void -dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) -{ - boolean_t match_object = B_FALSE; - boolean_t match_offset = B_FALSE; - - DB_DNODE_ENTER(db); - dnode_t *dn = DB_DNODE(db); - ASSERT(tx->tx_txg != 0); - ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); - ASSERT3U(dn->dn_object, ==, db->db.db_object); - - if (tx->tx_anyobj) { - DB_DNODE_EXIT(db); - return; - } - - /* XXX No checking on the meta dnode for now */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) { - DB_DNODE_EXIT(db); - return; - } - - for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; - txh = list_next(&tx->tx_holds, txh)) { - ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); - if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) - match_object = TRUE; - if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { - int datablkshift = dn->dn_datablkshift ? - dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - int shift = datablkshift + epbs * db->db_level; - uint64_t beginblk = shift >= 64 ? 0 : - (txh->txh_arg1 >> shift); - uint64_t endblk = shift >= 64 ? 0 : - ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); - uint64_t blkid = db->db_blkid; - - /* XXX txh_arg2 better not be zero... */ - - dprintf("found txh type %x beginblk=%llx endblk=%llx\n", - txh->txh_type, beginblk, endblk); - - switch (txh->txh_type) { - case THT_WRITE: - if (blkid >= beginblk && blkid <= endblk) - match_offset = TRUE; - /* - * We will let this hold work for the bonus - * or spill buffer so that we don't need to - * hold it when creating a new object. - */ - if (blkid == DMU_BONUS_BLKID || - blkid == DMU_SPILL_BLKID) - match_offset = TRUE; - /* - * They might have to increase nlevels, - * thus dirtying the new TLIBs. Or the - * might have to change the block size, - * thus dirying the new lvl=0 blk=0. - */ - if (blkid == 0) - match_offset = TRUE; - break; - case THT_FREE: - /* - * We will dirty all the level 1 blocks in - * the free range and perhaps the first and - * last level 0 block. 
- */ - if (blkid >= beginblk && (blkid <= endblk || - txh->txh_arg2 == DMU_OBJECT_END)) - match_offset = TRUE; - break; - case THT_SPILL: - if (blkid == DMU_SPILL_BLKID) - match_offset = TRUE; - break; - case THT_BONUS: - if (blkid == DMU_BONUS_BLKID) - match_offset = TRUE; - break; - case THT_ZAP: - match_offset = TRUE; - break; - case THT_NEWOBJECT: - match_object = TRUE; - break; - default: - ASSERT(!"bad txh_type"); - } - } - if (match_object && match_offset) { - DB_DNODE_EXIT(db); - return; - } - } - DB_DNODE_EXIT(db); - panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", - (u_longlong_t)db->db.db_object, db->db_level, - (u_longlong_t)db->db_blkid); -} -#endif - -/* - * If we can't do 10 iops, something is wrong. Let us go ahead - * and hit zfs_dirty_data_max. - */ -hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); -int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ - -/* - * We delay transactions when we've determined that the backend storage - * isn't able to accommodate the rate of incoming writes. - * - * If there is already a transaction waiting, we delay relative to when - * that transaction finishes waiting. This way the calculated min_time - * is independent of the number of threads concurrently executing - * transactions. - * - * If we are the only waiter, wait relative to when the transaction - * started, rather than the current time. This credits the transaction for - * "time already served", e.g. reading indirect blocks. - * - * The minimum time for a transaction to take is calculated as: - * min_time = scale * (dirty - min) / (max - dirty) - * min_time is then capped at zfs_delay_max_ns. - * - * The delay has two degrees of freedom that can be adjusted via tunables. - * The percentage of dirty data at which we start to delay is defined by - * zfs_delay_min_dirty_percent. This should typically be at or above - * zfs_vdev_async_write_active_max_dirty_percent so that we only start to - * delay after writing at full speed has failed to keep up with the incoming - * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly - * speaking, this variable determines the amount of delay at the midpoint of - * the curve. - * - * delay - * 10ms +-------------------------------------------------------------*+ - * | *| - * 9ms + *+ - * | *| - * 8ms + *+ - * | * | - * 7ms + * + - * | * | - * 6ms + * + - * | * | - * 5ms + * + - * | * | - * 4ms + * + - * | * | - * 3ms + * + - * | * | - * 2ms + (midpoint) * + - * | | ** | - * 1ms + v *** + - * | zfs_delay_scale ----------> ******** | - * 0 +-------------------------------------*********----------------+ - * 0% <- zfs_dirty_data_max -> 100% - * - * Note that since the delay is added to the outstanding time remaining on the - * most recent transaction, the delay is effectively the inverse of IOPS. - * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve - * was chosen such that small changes in the amount of accumulated dirty data - * in the first 3/4 of the curve yield relatively small differences in the - * amount of delay. 
- * - * The effects can be easier to understand when the amount of delay is - * represented on a log scale: - * - * delay - * 100ms +-------------------------------------------------------------++ - * + + - * | | - * + *+ - * 10ms + *+ - * + ** + - * | (midpoint) ** | - * + | ** + - * 1ms + v **** + - * + zfs_delay_scale ----------> ***** + - * | **** | - * + **** + - * 100us + ** + - * + * + - * | * | - * + * + - * 10us + * + - * + + - * | | - * + + - * +--------------------------------------------------------------+ - * 0% <- zfs_dirty_data_max -> 100% - * - * Note here that only as the amount of dirty data approaches its limit does - * the delay start to increase rapidly. The goal of a properly tuned system - * should be to keep the amount of dirty data out of that range by first - * ensuring that the appropriate limits are set for the I/O scheduler to reach - * optimal throughput on the backend storage, and then by changing the value - * of zfs_delay_scale to increase the steepness of the curve. - */ -static void -dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) -{ - dsl_pool_t *dp = tx->tx_pool; - uint64_t delay_min_bytes = - zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; - - if (dirty <= delay_min_bytes) - return; - - /* - * The caller has already waited until we are under the max. - * We make them pass us the amount of dirty data so we don't - * have to handle the case of it being >= the max, which could - * cause a divide-by-zero if it's == the max. - */ - ASSERT3U(dirty, <, zfs_dirty_data_max); - - now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - if (now > tx->tx_start + min_tx_time) - return; - - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - - DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, - uint64_t, min_tx_time); - - mutex_enter(&dp->dp_lock); - wakeup = MAX(tx->tx_start + min_tx_time, - dp->dp_last_wakeup + min_tx_time); - dp->dp_last_wakeup = wakeup; - mutex_exit(&dp->dp_lock); - -#ifdef _KERNEL -#ifdef illumos - mutex_enter(&curthread->t_delay_lock); - while (cv_timedwait_hires(&curthread->t_delay_cv, - &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, - CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) - continue; - mutex_exit(&curthread->t_delay_lock); -#else - pause_sbt("dmu_tx_delay", nstosbt(wakeup), - nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE); -#endif -#else - hrtime_t delta = wakeup - gethrtime(); - struct timespec ts; - ts.tv_sec = delta / NANOSEC; - ts.tv_nsec = delta % NANOSEC; - (void) nanosleep(&ts, NULL); -#endif -} - -/* - * This routine attempts to assign the transaction to a transaction group. - * To do so, we must determine if there is sufficient free space on disk. - * - * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() - * on it), then it is assumed that there is sufficient free space, - * unless there's insufficient slop space in the pool (see the comment - * above spa_slop_shift in spa_misc.c). - * - * If it is not a "netfree" transaction, then if the data already on disk - * is over the allowed usage (e.g. quota), this will fail with EDQUOT or - * ENOSPC. 
Otherwise, if the current rough estimate of pending changes, - * plus the rough estimate of this transaction's changes, may exceed the - * allowed usage, then this will fail with ERESTART, which will cause the - * caller to wait for the pending changes to be written to disk (by waiting - * for the next TXG to open), and then check the space usage again. - * - * The rough estimate of pending changes is comprised of the sum of: - * - * - this transaction's holds' txh_space_towrite - * - * - dd_tempreserved[], which is the sum of in-flight transactions' - * holds' txh_space_towrite (i.e. those transactions that have called - * dmu_tx_assign() but not yet called dmu_tx_commit()). - * - * - dd_space_towrite[], which is the amount of dirtied dbufs. - * - * Note that all of these values are inflated by spa_get_worst_case_asize(), - * which means that we may get ERESTART well before we are actually in danger - * of running out of space, but this also mitigates any small inaccuracies - * in the rough estimate (e.g. txh_space_towrite doesn't take into account - * indirect blocks, and dd_space_towrite[] doesn't take into account changes - * to the MOS). - * - * Note that due to this algorithm, it is possible to exceed the allowed - * usage by one transaction. Also, as we approach the allowed usage, - * we will allow a very limited amount of changes into each TXG, thus - * decreasing performance. - */ -static int -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) -{ - spa_t *spa = tx->tx_pool->dp_spa; - - ASSERT0(tx->tx_txg); - - if (tx->tx_err) - return (tx->tx_err); - - if (spa_suspended(spa)) { - /* - * If the user has indicated a blocking failure mode - * then return ERESTART which will block in dmu_tx_wait(). - * Otherwise, return EIO so that an error can get - * propagated back to the VOP calls. - * - * Note that we always honor the txg_how flag regardless - * of the failuremode setting. - */ - if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && - !(txg_how & TXG_WAIT)) - return (SET_ERROR(EIO)); - - return (SET_ERROR(ERESTART)); - } - - if (!tx->tx_dirty_delayed && - dsl_pool_need_dirty_delay(tx->tx_pool)) { - tx->tx_wait_dirty = B_TRUE; - return (SET_ERROR(ERESTART)); - } - - tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); - tx->tx_needassign_txh = NULL; - - /* - * NB: No error returns are allowed after txg_hold_open, but - * before processing the dnode holds, due to the - * dmu_tx_unassign() logic. 
- */ - - uint64_t towrite = 0; - uint64_t tohold = 0; - for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; - txh = list_next(&tx->tx_holds, txh)) { - dnode_t *dn = txh->txh_dnode; - if (dn != NULL) { - mutex_enter(&dn->dn_mtx); - if (dn->dn_assigned_txg == tx->tx_txg - 1) { - mutex_exit(&dn->dn_mtx); - tx->tx_needassign_txh = txh; - return (SET_ERROR(ERESTART)); - } - if (dn->dn_assigned_txg == 0) - dn->dn_assigned_txg = tx->tx_txg; - ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); - (void) zfs_refcount_add(&dn->dn_tx_holds, tx); - mutex_exit(&dn->dn_mtx); - } - towrite += zfs_refcount_count(&txh->txh_space_towrite); - tohold += zfs_refcount_count(&txh->txh_memory_tohold); - } - - /* needed allocation: worst-case estimate of write space */ - uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); - /* calculate memory footprint estimate */ - uint64_t memory = towrite + tohold; - - if (tx->tx_dir != NULL && asize != 0) { - int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, - asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); - if (err != 0) - return (err); - } - - return (0); -} - -static void -dmu_tx_unassign(dmu_tx_t *tx) -{ - if (tx->tx_txg == 0) - return; - - txg_rele_to_quiesce(&tx->tx_txgh); - - /* - * Walk the transaction's hold list, removing the hold on the - * associated dnode, and notifying waiters if the refcount drops to 0. - */ - for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); - txh != tx->tx_needassign_txh; - txh = list_next(&tx->tx_holds, txh)) { - dnode_t *dn = txh->txh_dnode; - - if (dn == NULL) - continue; - mutex_enter(&dn->dn_mtx); - ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); - - if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { - dn->dn_assigned_txg = 0; - cv_broadcast(&dn->dn_notxholds); - } - mutex_exit(&dn->dn_mtx); - } - - txg_rele_to_sync(&tx->tx_txgh); - - tx->tx_lasttried_txg = tx->tx_txg; - tx->tx_txg = 0; -} - -/* - * Assign tx to a transaction group; txg_how is a bitmask: - * - * If TXG_WAIT is set and the currently open txg is full, this function - * will wait until there's a new txg. This should be used when no locks - * are being held. With this bit set, this function will only fail if - * we're truly out of space (or over quota). - * - * If TXG_WAIT is *not* set and we can't assign into the currently open - * txg without blocking, this function will return immediately with - * ERESTART. This should be used whenever locks are being held. On an - * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), - * and try again. - * - * If TXG_NOTHROTTLE is set, this indicates that this tx should not be - * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for - * details on the throttle). This is used by the VFS operations, after - * they have already called dmu_tx_wait() (though most likely on a - * different tx). - */ -int -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) -{ - int err; - - ASSERT(tx->tx_txg == 0); - ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); - ASSERT(!dsl_pool_sync_context(tx->tx_pool)); - - /* If we might wait, we must not hold the config lock. 
*/ - IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); - - if ((txg_how & TXG_NOTHROTTLE)) - tx->tx_dirty_delayed = B_TRUE; - - while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { - dmu_tx_unassign(tx); - - if (err != ERESTART || !(txg_how & TXG_WAIT)) - return (err); - - dmu_tx_wait(tx); - } - - txg_rele_to_quiesce(&tx->tx_txgh); - - return (0); -} - -void -dmu_tx_wait(dmu_tx_t *tx) -{ - spa_t *spa = tx->tx_pool->dp_spa; - dsl_pool_t *dp = tx->tx_pool; - - ASSERT(tx->tx_txg == 0); - ASSERT(!dsl_pool_config_held(tx->tx_pool)); - - if (tx->tx_wait_dirty) { - /* - * dmu_tx_try_assign() has determined that we need to wait - * because we've consumed much or all of the dirty buffer - * space. - */ - mutex_enter(&dp->dp_lock); - while (dp->dp_dirty_total >= zfs_dirty_data_max) - cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); - uint64_t dirty = dp->dp_dirty_total; - mutex_exit(&dp->dp_lock); - - dmu_tx_delay(tx, dirty); - - tx->tx_wait_dirty = B_FALSE; - - /* - * Note: setting tx_dirty_delayed only has effect if the - * caller used TX_WAIT. Otherwise they are going to - * destroy this tx and try again. The common case, - * zfs_write(), uses TX_WAIT. - */ - tx->tx_dirty_delayed = B_TRUE; - } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { - /* - * If the pool is suspended we need to wait until it - * is resumed. Note that it's possible that the pool - * has become active after this thread has tried to - * obtain a tx. If that's the case then tx_lasttried_txg - * would not have been set. - */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); - } else if (tx->tx_needassign_txh) { - /* - * A dnode is assigned to the quiescing txg. Wait for its - * transaction to complete. - */ - dnode_t *dn = tx->tx_needassign_txh->txh_dnode; - - mutex_enter(&dn->dn_mtx); - while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) - cv_wait(&dn->dn_notxholds, &dn->dn_mtx); - mutex_exit(&dn->dn_mtx); - tx->tx_needassign_txh = NULL; - } else { - /* - * If we have a lot of dirty data just wait until we sync - * out a TXG at which point we'll hopefully have synced - * a portion of the changes. - */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); - } -} - -static void -dmu_tx_destroy(dmu_tx_t *tx) -{ - dmu_tx_hold_t *txh; - - while ((txh = list_head(&tx->tx_holds)) != NULL) { - dnode_t *dn = txh->txh_dnode; - - list_remove(&tx->tx_holds, txh); - zfs_refcount_destroy_many(&txh->txh_space_towrite, - zfs_refcount_count(&txh->txh_space_towrite)); - zfs_refcount_destroy_many(&txh->txh_memory_tohold, - zfs_refcount_count(&txh->txh_memory_tohold)); - kmem_free(txh, sizeof (dmu_tx_hold_t)); - if (dn != NULL) - dnode_rele(dn, tx); - } - - list_destroy(&tx->tx_callbacks); - list_destroy(&tx->tx_holds); - kmem_free(tx, sizeof (dmu_tx_t)); -} - -void -dmu_tx_commit(dmu_tx_t *tx) -{ - ASSERT(tx->tx_txg != 0); - - /* - * Go through the transaction's hold list and remove holds on - * associated dnodes, notifying waiters if no holds remain. 
- */ - for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; - txh = list_next(&tx->tx_holds, txh)) { - dnode_t *dn = txh->txh_dnode; - - if (dn == NULL) - continue; - - mutex_enter(&dn->dn_mtx); - ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); - - if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { - dn->dn_assigned_txg = 0; - cv_broadcast(&dn->dn_notxholds); - } - mutex_exit(&dn->dn_mtx); - } - - if (tx->tx_tempreserve_cookie) - dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); - - if (!list_is_empty(&tx->tx_callbacks)) - txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); - - if (tx->tx_anyobj == FALSE) - txg_rele_to_sync(&tx->tx_txgh); - - dmu_tx_destroy(tx); -} - -void -dmu_tx_abort(dmu_tx_t *tx) -{ - ASSERT(tx->tx_txg == 0); - - /* - * Call any registered callbacks with an error code. - */ - if (!list_is_empty(&tx->tx_callbacks)) - dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); - - dmu_tx_destroy(tx); -} - -uint64_t -dmu_tx_get_txg(dmu_tx_t *tx) -{ - ASSERT(tx->tx_txg != 0); - return (tx->tx_txg); -} - -dsl_pool_t * -dmu_tx_pool(dmu_tx_t *tx) -{ - ASSERT(tx->tx_pool != NULL); - return (tx->tx_pool); -} - -void -dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) -{ - dmu_tx_callback_t *dcb; - - dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); - - dcb->dcb_func = func; - dcb->dcb_data = data; - - list_insert_tail(&tx->tx_callbacks, dcb); -} - -/* - * Call all the commit callbacks on a list, with a given error code. - */ -void -dmu_tx_do_callbacks(list_t *cb_list, int error) -{ - dmu_tx_callback_t *dcb; - - while ((dcb = list_head(cb_list)) != NULL) { - list_remove(cb_list, dcb); - dcb->dcb_func(dcb->dcb_data, error); - kmem_free(dcb, sizeof (dmu_tx_callback_t)); - } -} - -/* - * Interface to hold a bunch of attributes. - * used for creating new files. - * attrsize is the total size of all attributes - * to be added during object creation - * - * For updating/adding a single attribute dmu_tx_hold_sa() should be used. - */ - -/* - * hold necessary attribute name for attribute registration. - * should be a very rare case where this is needed. If it does - * happen it would only happen on the first write to the file system. 
- */ -static void -dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) -{ - if (!sa->sa_need_attr_registration) - return; - - for (int i = 0; i != sa->sa_num_attrs; i++) { - if (!sa->sa_attr_table[i].sa_registered) { - if (sa->sa_reg_attr_obj) - dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, - B_TRUE, sa->sa_attr_table[i].sa_name); - else - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, - B_TRUE, sa->sa_attr_table[i].sa_name); - } - } -} - -void -dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, - THT_SPILL, 0, 0); - if (txh != NULL) - (void) zfs_refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); -} - -void -dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) -{ - sa_os_t *sa = tx->tx_objset->os_sa; - - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - - if (tx->tx_objset->os_sa->sa_master_obj == 0) - return; - - if (tx->tx_objset->os_sa->sa_layout_attr_obj) { - dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); - } else { - dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); - dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); - } - - dmu_tx_sa_registration_hold(sa, tx); - - if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) - return; - - (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, - THT_SPILL, 0, 0); -} - -/* - * Hold SA attribute - * - * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) - * - * variable_size is the total size of all variable sized attributes - * passed to this function. It is not the total size of all - * variable size attributes that *may* exist on this object. - */ -void -dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) -{ - uint64_t object; - sa_os_t *sa = tx->tx_objset->os_sa; - - ASSERT(hdl != NULL); - - object = sa_handle_object(hdl); - - dmu_tx_hold_bonus(tx, object); - - if (tx->tx_objset->os_sa->sa_master_obj == 0) - return; - - if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || - tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { - dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); - dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); - } - - dmu_tx_sa_registration_hold(sa, tx); - - if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) - dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); - - if (sa->sa_force_spill || may_grow || hdl->sa_spill) { - ASSERT(tx->tx_txg == 0); - dmu_tx_hold_spill(tx, object); - } else { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn->dn_have_spill) { - ASSERT(tx->tx_txg == 0); - dmu_tx_hold_spill(tx, object); - } - DB_DNODE_EXIT(db); - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ /dev/null @@ -1,374 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * This tunable disables predictive prefetch. Note that it leaves "prescient" - * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, - * prescient prefetch never issues i/os that end up not being needed, - * so it can't hurt performance. - */ -boolean_t zfs_prefetch_disable = B_FALSE; - -/* max # of streams per zfetch */ -uint32_t zfetch_max_streams = 8; -/* min time before stream reclaim */ -uint32_t zfetch_min_sec_reap = 2; -/* max bytes to prefetch per stream (default 8MB) */ -uint32_t zfetch_max_distance = 8 * 1024 * 1024; -/* max bytes to prefetch indirects for per stream (default 64MB) */ -uint32_t zfetch_max_idistance = 64 * 1024 * 1024; -/* max number of bytes in an array_read in which we allow prefetching (1MB) */ -uint64_t zfetch_array_rd_sz = 1024 * 1024; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW, - &zfs_prefetch_disable, 0, "Disable prefetch"); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS ZFETCH"); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN, - &zfetch_max_streams, 0, "Max # of streams per zfetch"); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN, - &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, - &zfetch_max_distance, 0, "Max bytes to prefetch per stream"); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, - &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream"); -SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN, - &zfetch_array_rd_sz, 0, - "Number of bytes in a array_read at which we stop prefetching"); - -typedef struct zfetch_stats { - kstat_named_t zfetchstat_hits; - kstat_named_t zfetchstat_misses; - kstat_named_t zfetchstat_max_streams; -} zfetch_stats_t; - -static zfetch_stats_t zfetch_stats = { - { "hits", KSTAT_DATA_UINT64 }, - { "misses", KSTAT_DATA_UINT64 }, - { "max_streams", KSTAT_DATA_UINT64 }, -}; - -#define ZFETCHSTAT_BUMP(stat) \ - atomic_inc_64(&zfetch_stats.stat.value.ui64); - -kstat_t *zfetch_ksp; - -void -zfetch_init(void) -{ - zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", - KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (zfetch_ksp != NULL) { - zfetch_ksp->ks_data = &zfetch_stats; - kstat_install(zfetch_ksp); - } -} - -void -zfetch_fini(void) -{ - if (zfetch_ksp != NULL) { - kstat_delete(zfetch_ksp); - zfetch_ksp = NULL; - } -} - -/* - * This takes a pointer to a zfetch structure and a dnode. It performs the - * necessary setup for the zfetch structure, grokking data from the - * associated dnode. 
- */ -void -dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) -{ - if (zf == NULL) - return; - - zf->zf_dnode = dno; - - list_create(&zf->zf_stream, sizeof (zstream_t), - offsetof(zstream_t, zs_node)); - - rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); -} - -static void -dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) -{ - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zs_lock); - kmem_free(zs, sizeof (*zs)); -} - -/* - * Clean-up state associated with a zfetch structure (e.g. destroy the - * streams). This doesn't free the zfetch_t itself, that's left to the caller. - */ -void -dmu_zfetch_fini(zfetch_t *zf) -{ - zstream_t *zs; - - ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); - - rw_enter(&zf->zf_rwlock, RW_WRITER); - while ((zs = list_head(&zf->zf_stream)) != NULL) - dmu_zfetch_stream_remove(zf, zs); - rw_exit(&zf->zf_rwlock); - list_destroy(&zf->zf_stream); - rw_destroy(&zf->zf_rwlock); - - zf->zf_dnode = NULL; -} - -/* - * If there aren't too many streams already, create a new stream. - * The "blkid" argument is the next block that we expect this stream to access. - * While we're here, clean up old streams (which haven't been - * accessed for at least zfetch_min_sec_reap seconds). - */ -static void -dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) -{ - zstream_t *zs_next; - int numstreams = 0; - - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - - /* - * Clean up old streams. - */ - for (zstream_t *zs = list_head(&zf->zf_stream); - zs != NULL; zs = zs_next) { - zs_next = list_next(&zf->zf_stream, zs); - if (((gethrtime() - zs->zs_atime) / NANOSEC) > - zfetch_min_sec_reap) - dmu_zfetch_stream_remove(zf, zs); - else - numstreams++; - } - - /* - * The maximum number of streams is normally zfetch_max_streams, - * but for small files we lower it such that it's at least possible - * for all the streams to be non-overlapping. - * - * If we are already at the maximum number of streams for this file, - * even after removing old streams, then don't create this stream. - */ - uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, - zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / - zfetch_max_distance)); - if (numstreams >= max_streams) { - ZFETCHSTAT_BUMP(zfetchstat_max_streams); - return; - } - - zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); - zs->zs_blkid = blkid; - zs->zs_pf_blkid = blkid; - zs->zs_ipf_blkid = blkid; - zs->zs_atime = gethrtime(); - mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); - - list_insert_head(&zf->zf_stream, zs); -} - -/* - * This is the predictive prefetch entry point. It associates dnode access - * specified with blkid and nblks arguments with prefetch stream, predicts - * further accesses based on that stats and initiates speculative prefetch. - * fetch_data argument specifies whether actual data blocks should be fetched: - * FALSE -- prefetch only indirect blocks for predicted data blocks; - * TRUE -- prefetch predicted data blocks plus following indirect blocks. 
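The stream cap computed in dmu_zfetch_stream_create() above is easy to check by hand: the limit is normally zfetch_max_streams, but it is lowered so that the streams could in principle sit side by side in the file at zfetch_max_distance bytes each. A stand-alone sketch of that clamp (the function name and the sample file sizes are mine; the formula is the MAX/MIN expression above):

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

static const uint32_t zfetch_max_streams = 8;          /* tunable default */
static const uint32_t zfetch_max_distance = 8 << 20;   /* 8MB per stream */

/*
 * Mirror of the clamp: at least 1, at most zfetch_max_streams, and no
 * more streams than fit side by side in the file at max_distance each.
 */
static uint32_t
max_streams_for_file(uint64_t maxblkid, uint32_t datablksz)
{
	return (MAX(1, MIN(zfetch_max_streams,
	    maxblkid * datablksz / zfetch_max_distance)));
}

int
main(void)
{
	/* A 2MB file of 128K blocks only supports a single stream... */
	printf("%u\n", max_streams_for_file(16, 128 * 1024));    /* 1 */
	/* ...while a 1GB file gets the full default of 8. */
	printf("%u\n", max_streams_for_file(8192, 128 * 1024));  /* 8 */
	return (0);
}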
- */ -void -dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) -{ - zstream_t *zs; - int64_t pf_start, ipf_start, ipf_istart, ipf_iend; - int64_t pf_ahead_blks, max_blks; - int epbs, max_dist_blks, pf_nblks, ipf_nblks; - uint64_t end_of_access_blkid = blkid + nblks; - spa_t *spa = zf->zf_dnode->dn_objset->os_spa; - - if (zfs_prefetch_disable) - return; - - /* - * If we haven't yet loaded the indirect vdevs' mappings, we - * can only read from blocks that we carefully ensure are on - * concrete vdevs (or previously-loaded indirect vdevs). So we - * can't allow the predictive prefetcher to attempt reads of other - * blocks (e.g. of the MOS's dnode obejct). - */ - if (!spa_indirect_vdevs_loaded(spa)) - return; - - /* - * As a fast path for small (single-block) files, ignore access - * to the first block. - */ - if (blkid == 0) - return; - - rw_enter(&zf->zf_rwlock, RW_READER); - - /* - * Find matching prefetch stream. Depending on whether the accesses - * are block-aligned, first block of the new access may either follow - * the last block of the previous access, or be equal to it. - */ - for (zs = list_head(&zf->zf_stream); zs != NULL; - zs = list_next(&zf->zf_stream, zs)) { - if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) { - mutex_enter(&zs->zs_lock); - /* - * zs_blkid could have changed before we - * acquired zs_lock; re-check them here. - */ - if (blkid == zs->zs_blkid) { - break; - } else if (blkid + 1 == zs->zs_blkid) { - blkid++; - nblks--; - if (nblks == 0) { - /* Already prefetched this before. */ - mutex_exit(&zs->zs_lock); - rw_exit(&zf->zf_rwlock); - return; - } - break; - } - mutex_exit(&zs->zs_lock); - } - } - - if (zs == NULL) { - /* - * This access is not part of any existing stream. Create - * a new stream for it. - */ - ZFETCHSTAT_BUMP(zfetchstat_misses); - if (rw_tryupgrade(&zf->zf_rwlock)) - dmu_zfetch_stream_create(zf, end_of_access_blkid); - rw_exit(&zf->zf_rwlock); - return; - } - - /* - * This access was to a block that we issued a prefetch for on - * behalf of this stream. Issue further prefetches for this stream. - * - * Normally, we start prefetching where we stopped - * prefetching last (zs_pf_blkid). But when we get our first - * hit on this stream, zs_pf_blkid == zs_blkid, we don't - * want to prefetch the block we just accessed. In this case, - * start just after the block we just accessed. - */ - pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid); - - /* - * Double our amount of prefetched data, but don't let the - * prefetch get further ahead than zfetch_max_distance. - */ - if (fetch_data) { - max_dist_blks = - zfetch_max_distance >> zf->zf_dnode->dn_datablkshift; - /* - * Previously, we were (zs_pf_blkid - blkid) ahead. We - * want to now be double that, so read that amount again, - * plus the amount we are catching up by (i.e. the amount - * read just now). - */ - pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks; - max_blks = max_dist_blks - (pf_start - end_of_access_blkid); - pf_nblks = MIN(pf_ahead_blks, max_blks); - } else { - pf_nblks = 0; - } - - zs->zs_pf_blkid = pf_start + pf_nblks; - - /* - * Do the same for indirects, starting from where we stopped last, - * or where we will stop reading data blocks (and the indirects - * that point to them). - */ - ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid); - max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift; - /* - * We want to double our distance ahead of the data prefetch - * (or reader, if we are not prefetching data). 
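The data-prefetch half of the distance logic above boils down to a few block-count calculations: start at whichever of zs_pf_blkid or the end of the current access is further along, issue roughly as much again as the stream was already ahead, and never run more than zfetch_max_distance worth of blocks past the reader. A stand-alone rework of just that arithmetic (prefetch_calc() and the sample numbers are mine; the expressions mirror pf_start, pf_ahead_blks, max_blks and pf_nblks above):

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/*
 * A stream last prefetched up to block pf_blkid and the application just
 * read nblks blocks starting at blkid.  Return how many data blocks to
 * prefetch now so the lead roughly doubles, capped at max_dist_blks, and
 * report the new end of the prefetched region.
 */
static int64_t
prefetch_calc(uint64_t blkid, uint64_t nblks, uint64_t pf_blkid,
    int64_t max_dist_blks, uint64_t *new_pf_blkid)
{
	uint64_t end = blkid + nblks;
	/* Start after whichever is further: the last prefetch or this read. */
	int64_t pf_start = MAX(pf_blkid, end);
	/* We were (pf_blkid - blkid) ahead; read that again plus nblks. */
	int64_t pf_ahead_blks = pf_blkid - blkid + nblks;
	int64_t max_blks = max_dist_blks - (pf_start - end);
	int64_t pf_nblks = MIN(pf_ahead_blks, max_blks);

	*new_pf_blkid = pf_start + pf_nblks;
	return (pf_nblks);
}

int
main(void)
{
	uint64_t new_pf;
	/* pf_blkid 14, read covers blocks 10 and 11: prefetch 6 more
	 * blocks and move the prefetch front from 14 to 20. */
	int64_t n = prefetch_calc(10, 2, 14, 64, &new_pf);

	printf("prefetch %jd blocks, new front %ju\n",
	    (intmax_t)n, (uintmax_t)new_pf);
	return (0);
}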
Previously, we - * were (zs_ipf_blkid - blkid) ahead. To double that, we read - * that amount again, plus the amount we are catching up by - * (i.e. the amount read now + the amount of data prefetched now). - */ - pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks; - max_blks = max_dist_blks - (ipf_start - end_of_access_blkid); - ipf_nblks = MIN(pf_ahead_blks, max_blks); - zs->zs_ipf_blkid = ipf_start + ipf_nblks; - - epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; - ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; - ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs; - - zs->zs_atime = gethrtime(); - zs->zs_blkid = end_of_access_blkid; - mutex_exit(&zs->zs_lock); - rw_exit(&zf->zf_rwlock); - - /* - * dbuf_prefetch() is asynchronous (even when it needs to read - * indirect blocks), but we still prefer to drop our locks before - * calling it to reduce the time we hold them. - */ - - for (int i = 0; i < pf_nblks; i++) { - dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); - } - for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) { - dbuf_prefetch(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); - } - ZFETCHSTAT_BUMP(zfetchstat_hits); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ /dev/null @@ -1,2418 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 RackTop Systems. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -dnode_stats_t dnode_stats = { - { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 }, - { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_hits", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 }, - { "dnode_hold_free_txg", KSTAT_DATA_UINT64 }, - { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 }, - { "dnode_allocate", KSTAT_DATA_UINT64 }, - { "dnode_reallocate", KSTAT_DATA_UINT64 }, - { "dnode_buf_evict", KSTAT_DATA_UINT64 }, - { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 }, - { "dnode_alloc_race", KSTAT_DATA_UINT64 }, - { "dnode_alloc_next_block", KSTAT_DATA_UINT64 }, - { "dnode_move_invalid", KSTAT_DATA_UINT64 }, - { "dnode_move_recheck1", KSTAT_DATA_UINT64 }, - { "dnode_move_recheck2", KSTAT_DATA_UINT64 }, - { "dnode_move_special", KSTAT_DATA_UINT64 }, - { "dnode_move_handle", KSTAT_DATA_UINT64 }, - { "dnode_move_rwlock", KSTAT_DATA_UINT64 }, - { "dnode_move_active", KSTAT_DATA_UINT64 }, -}; - -static kstat_t *dnode_ksp; -static kmem_cache_t *dnode_cache; - -static dnode_phys_t dnode_phys_zero; - -int zfs_default_bs = SPA_MINBLOCKSHIFT; -int zfs_default_ibs = DN_MAX_INDBLKSHIFT; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, - &zfs_default_bs, 0, "Default dnode block shift"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, - &zfs_default_ibs, 0, "Default dnode indirect block shift"); - -#ifdef illumos -#ifdef _KERNEL -static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); -#endif /* _KERNEL */ -#endif - -static int -dbuf_compare(const void *x1, const void *x2) -{ - const dmu_buf_impl_t *d1 = x1; - const dmu_buf_impl_t *d2 = x2; - - int cmp = AVL_CMP(d1->db_level, d2->db_level); - if (likely(cmp)) - return (cmp); - - cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); - if (likely(cmp)) - return (cmp); - - if (d1->db_state == DB_SEARCH) { - ASSERT3S(d2->db_state, !=, DB_SEARCH); - return (-1); - } else if (d2->db_state == DB_SEARCH) { - ASSERT3S(d1->db_state, !=, DB_SEARCH); - return (1); - } - - return (AVL_PCMP(d1, d2)); -} - -/* ARGSUSED */ -static int -dnode_cons(void *arg, void *unused, int kmflag) -{ - dnode_t *dn = arg; - int i; - - rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); - mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); - cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); - - /* - * Every dbuf has a reference, and dropping a tracked reference is - * O(number of references), so don't track dn_holds. 
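dbuf_compare() above orders dbufs by (level, blkid) and treats the DB_SEARCH state as a sentinel, so a stack-allocated search key with a given level and blkid sorts immediately before any real dbuf with the same key. A reduced stand-alone comparator in the same shape (the struct, the CMP() helper and the field names are illustrative; only the ordering rules come from the function above):

#include <stdint.h>
#include <stdio.h>

/* Three-way compare helper, in the spirit of the AVL_CMP macro above. */
#define CMP(a, b) (((a) > (b)) - ((a) < (b)))

struct node {
	uint8_t   level;
	uint64_t  blkid;
	int       is_search_key;   /* stand-in for db_state == DB_SEARCH */
	uintptr_t addr;            /* address tie-breaker, like AVL_PCMP */
};

static int
node_compare(const struct node *a, const struct node *b)
{
	int cmp = CMP(a->level, b->level);
	if (cmp != 0)
		return (cmp);

	cmp = CMP(a->blkid, b->blkid);
	if (cmp != 0)
		return (cmp);

	/* A search key sorts just before real nodes with the same key. */
	if (a->is_search_key)
		return (-1);
	if (b->is_search_key)
		return (1);

	return (CMP(a->addr, b->addr));
}

int
main(void)
{
	struct node real = { 1, 42, 0, 0x1000 };
	struct node key  = { 1, 42, 1, 0 };

	printf("%d\n", node_compare(&key, &real));   /* -1: key sorts first */
	return (0);
}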
- */ - zfs_refcount_create_untracked(&dn->dn_holds); - zfs_refcount_create(&dn->dn_tx_holds); - list_link_init(&dn->dn_link); - - bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); - bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); - bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); - bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); - bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); - bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); - bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); - - for (i = 0; i < TXG_SIZE; i++) { - multilist_link_init(&dn->dn_dirty_link[i]); - dn->dn_free_ranges[i] = NULL; - list_create(&dn->dn_dirty_records[i], - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - } - - dn->dn_allocated_txg = 0; - dn->dn_free_txg = 0; - dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; - dn->dn_bonus = NULL; - dn->dn_have_spill = B_FALSE; - dn->dn_zio = NULL; - dn->dn_oldused = 0; - dn->dn_oldflags = 0; - dn->dn_olduid = 0; - dn->dn_oldgid = 0; - dn->dn_newuid = 0; - dn->dn_newgid = 0; - dn->dn_id_flags = 0; - - dn->dn_dbufs_count = 0; - avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_link)); - - dn->dn_moved = 0; - POINTER_INVALIDATE(&dn->dn_objset); - return (0); -} - -/* ARGSUSED */ -static void -dnode_dest(void *arg, void *unused) -{ - int i; - dnode_t *dn = arg; - - rw_destroy(&dn->dn_struct_rwlock); - mutex_destroy(&dn->dn_mtx); - mutex_destroy(&dn->dn_dbufs_mtx); - cv_destroy(&dn->dn_notxholds); - zfs_refcount_destroy(&dn->dn_holds); - zfs_refcount_destroy(&dn->dn_tx_holds); - ASSERT(!list_link_active(&dn->dn_link)); - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); - ASSERT3P(dn->dn_free_ranges[i], ==, NULL); - list_destroy(&dn->dn_dirty_records[i]); - ASSERT0(dn->dn_next_nblkptr[i]); - ASSERT0(dn->dn_next_nlevels[i]); - ASSERT0(dn->dn_next_indblkshift[i]); - ASSERT0(dn->dn_next_bonustype[i]); - ASSERT0(dn->dn_rm_spillblk[i]); - ASSERT0(dn->dn_next_bonuslen[i]); - ASSERT0(dn->dn_next_blksz[i]); - } - - ASSERT0(dn->dn_allocated_txg); - ASSERT0(dn->dn_free_txg); - ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); - ASSERT0(dn->dn_dirtyctx); - ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); - ASSERT3P(dn->dn_bonus, ==, NULL); - ASSERT(!dn->dn_have_spill); - ASSERT3P(dn->dn_zio, ==, NULL); - ASSERT0(dn->dn_oldused); - ASSERT0(dn->dn_oldflags); - ASSERT0(dn->dn_olduid); - ASSERT0(dn->dn_oldgid); - ASSERT0(dn->dn_newuid); - ASSERT0(dn->dn_newgid); - ASSERT0(dn->dn_id_flags); - - ASSERT0(dn->dn_dbufs_count); - avl_destroy(&dn->dn_dbufs); -} - -void -dnode_init(void) -{ - ASSERT(dnode_cache == NULL); - dnode_cache = kmem_cache_create("dnode_t", - sizeof (dnode_t), - 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); -#ifdef _KERNEL - kmem_cache_set_move(dnode_cache, dnode_move); - - dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", - KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (dnode_ksp != NULL) { - dnode_ksp->ks_data = &dnode_stats; - kstat_install(dnode_ksp); - } -#endif /* _KERNEL */ -} - -void -dnode_fini(void) -{ - if (dnode_ksp != NULL) { - kstat_delete(dnode_ksp); - dnode_ksp = NULL; - } - - kmem_cache_destroy(dnode_cache); - dnode_cache = NULL; -} - - -#ifdef ZFS_DEBUG -void -dnode_verify(dnode_t *dn) -{ - int drop_struct_lock = FALSE; - - 
ASSERT(dn->dn_phys); - ASSERT(dn->dn_objset); - ASSERT(dn->dn_handle->dnh_dnode == dn); - - ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); - - if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) - return; - - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { - int i; - int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); - ASSERT3U(dn->dn_indblkshift, >=, 0); - ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); - if (dn->dn_datablkshift) { - ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); - ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); - ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz); - } - ASSERT3U(dn->dn_nlevels, <=, 30); - ASSERT(DMU_OT_IS_VALID(dn->dn_type)); - ASSERT3U(dn->dn_nblkptr, >=, 1); - ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen); - ASSERT3U(dn->dn_datablksz, ==, - dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); - ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); - ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + - dn->dn_bonuslen, <=, max_bonuslen); - for (i = 0; i < TXG_SIZE; i++) { - ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); - } - } - if (dn->dn_phys->dn_type != DMU_OT_NONE) - ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); - ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); - if (dn->dn_dbuf != NULL) { - ASSERT3P(dn->dn_phys, ==, - (dnode_phys_t *)dn->dn_dbuf->db.db_data + - (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); - } - if (drop_struct_lock) - rw_exit(&dn->dn_struct_rwlock); -} -#endif - -void -dnode_byteswap(dnode_phys_t *dnp) -{ - uint64_t *buf64 = (void*)&dnp->dn_blkptr; - int i; - - if (dnp->dn_type == DMU_OT_NONE) { - bzero(dnp, sizeof (dnode_phys_t)); - return; - } - - dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); - dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); - dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots); - dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); - dnp->dn_used = BSWAP_64(dnp->dn_used); - - /* - * dn_nblkptr is only one byte, so it's OK to read it in either - * byte order. We can't read dn_bouslen. - */ - ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); - ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); - for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) - buf64[i] = BSWAP_64(buf64[i]); - - /* - * OK to check dn_bonuslen for zero, because it won't matter if - * we have the wrong byte order. This is necessary because the - * dnode dnode is smaller than a regular dnode. - */ - if (dnp->dn_bonuslen != 0) { - /* - * Note that the bonus length calculated here may be - * longer than the actual bonus buffer. This is because - * we always put the bonus buffer after the last block - * pointer (instead of packing it against the end of the - * dnode buffer). 
- */ - int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); - int slots = dnp->dn_extra_slots + 1; - size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off; - ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(dnp->dn_bonustype); - dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); - } - - /* Swap SPILL block if we have one */ - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) - byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); - -} - -void -dnode_buf_byteswap(void *vbuf, size_t size) -{ - int i = 0; - - ASSERT3U(sizeof (dnode_phys_t), ==, (1<dn_type != DMU_OT_NONE) - i += dnp->dn_extra_slots * DNODE_MIN_SIZE; - } -} - -void -dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) -{ - ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); - - dnode_setdirty(dn, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); - dn->dn_bonuslen = newsize; - if (newsize == 0) - dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; - else - dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; - rw_exit(&dn->dn_struct_rwlock); -} - -void -dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) -{ - ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); - dnode_setdirty(dn, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dn->dn_bonustype = newtype; - dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; - rw_exit(&dn->dn_struct_rwlock); -} - -void -dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) -{ - ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - dnode_setdirty(dn, tx); - dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; - dn->dn_have_spill = B_FALSE; -} - -static void -dnode_setdblksz(dnode_t *dn, int size) -{ - ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(size, >=, SPA_MINBLOCKSIZE); - ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, - 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); - dn->dn_datablksz = size; - dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; - dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; -} - -static dnode_t * -dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, - uint64_t object, dnode_handle_t *dnh) -{ - dnode_t *dn; - - dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); -#ifdef _KERNEL - ASSERT(!POINTER_IS_VALID(dn->dn_objset)); -#endif /* _KERNEL */ - dn->dn_moved = 0; - - /* - * Defer setting dn_objset until the dnode is ready to be a candidate - * for the dnode_move() callback. 
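dnode_setdblksz() above keeps the data block size in three forms: bytes, 512-byte sectors, and, when the size is a power of two, a shift. The same arithmetic in a small stand-alone program (is_p2() and highbit64() are local re-implementations written from how they are used above; the 512-byte sector unit corresponds to SPA_MINBLOCKSHIFT == 9, which is assumed here):

#include <stdint.h>
#include <stdio.h>

static int
is_p2(uint64_t x)
{
	return (x != 0 && (x & (x - 1)) == 0);
}

/* 1-based index of the highest bit set, 0 for an input of 0. */
static int
highbit64(uint64_t x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t size = 128 * 1024;              /* 128K data block */

	uint64_t datablksz = size;
	uint64_t datablkszsec = size >> 9;       /* 512-byte sectors */
	int datablkshift = is_p2(size) ? highbit64(size - 1) : 0;

	/* 131072 bytes = 256 sectors, shift 17 (1 << 17 == 131072). */
	printf("%ju bytes, %ju sectors, shift %d\n",
	    (uintmax_t)datablksz, (uintmax_t)datablkszsec, datablkshift);
	return (0);
}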
- */ - dn->dn_object = object; - dn->dn_dbuf = db; - dn->dn_handle = dnh; - dn->dn_phys = dnp; - - if (dnp->dn_datablkszsec) { - dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); - } else { - dn->dn_datablksz = 0; - dn->dn_datablkszsec = 0; - dn->dn_datablkshift = 0; - } - dn->dn_indblkshift = dnp->dn_indblkshift; - dn->dn_nlevels = dnp->dn_nlevels; - dn->dn_type = dnp->dn_type; - dn->dn_nblkptr = dnp->dn_nblkptr; - dn->dn_checksum = dnp->dn_checksum; - dn->dn_compress = dnp->dn_compress; - dn->dn_bonustype = dnp->dn_bonustype; - dn->dn_bonuslen = dnp->dn_bonuslen; - dn->dn_num_slots = dnp->dn_extra_slots + 1; - dn->dn_maxblkid = dnp->dn_maxblkid; - dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); - dn->dn_id_flags = 0; - - dmu_zfetch_init(&dn->dn_zfetch, dn); - - ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); - ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); - ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode)); - - mutex_enter(&os->os_lock); - - /* - * Exclude special dnodes from os_dnodes so an empty os_dnodes - * signifies that the special dnodes have no references from - * their children (the entries in os_dnodes). This allows - * dnode_destroy() to easily determine if the last child has - * been removed and then complete eviction of the objset. - */ - if (!DMU_OBJECT_IS_SPECIAL(object)) - list_insert_head(&os->os_dnodes, dn); - membar_producer(); - - /* - * Everything else must be valid before assigning dn_objset - * makes the dnode eligible for dnode_move(). - */ - dn->dn_objset = os; - - dnh->dnh_dnode = dn; - mutex_exit(&os->os_lock); - - arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); - - return (dn); -} - -/* - * Caller must be holding the dnode handle, which is released upon return. - */ -static void -dnode_destroy(dnode_t *dn) -{ - objset_t *os = dn->dn_objset; - boolean_t complete_os_eviction = B_FALSE; - - ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); - - mutex_enter(&os->os_lock); - POINTER_INVALIDATE(&dn->dn_objset); - if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - list_remove(&os->os_dnodes, dn); - complete_os_eviction = - list_is_empty(&os->os_dnodes) && - list_link_active(&os->os_evicting_node); - } - mutex_exit(&os->os_lock); - - /* the dnode can no longer move, so we can release the handle */ - if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock)) - zrl_remove(&dn->dn_handle->dnh_zrlock); - - dn->dn_allocated_txg = 0; - dn->dn_free_txg = 0; - dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; - - dn->dn_dirtyctx = 0; - if (dn->dn_dirtyctx_firstset != NULL) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } - if (dn->dn_bonus != NULL) { - mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_destroy(dn->dn_bonus); - dn->dn_bonus = NULL; - } - dn->dn_zio = NULL; - - dn->dn_have_spill = B_FALSE; - dn->dn_oldused = 0; - dn->dn_oldflags = 0; - dn->dn_olduid = 0; - dn->dn_oldgid = 0; - dn->dn_newuid = 0; - dn->dn_newgid = 0; - dn->dn_id_flags = 0; - - dmu_zfetch_fini(&dn->dn_zfetch); - kmem_cache_free(dnode_cache, dn); - arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE); - - if (complete_os_eviction) - dmu_objset_evict_done(os); -} - -void -dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) -{ - int i; - - ASSERT3U(dn_slots, >, 0); - ASSERT3U(dn_slots << DNODE_SHIFT, <=, - spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))); - ASSERT3U(blocksize, <=, - spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); - if (blocksize == 0) - blocksize = 1 << 
zfs_default_bs; - else - blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); - - if (ibs == 0) - ibs = zfs_default_ibs; - - ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); - - dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64 - " blocksize=%d ibs=%d dn_slots=%d\n", - dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); - DNODE_STAT_BUMP(dnode_allocate); - - ASSERT(dn->dn_type == DMU_OT_NONE); - ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); - ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); - ASSERT(ot != DMU_OT_NONE); - ASSERT(DMU_OT_IS_VALID(ot)); - ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || - (bonustype == DMU_OT_SA && bonuslen == 0) || - (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT(DMU_OT_IS_VALID(bonustype)); - ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); - ASSERT(dn->dn_type == DMU_OT_NONE); - ASSERT0(dn->dn_maxblkid); - ASSERT0(dn->dn_allocated_txg); - ASSERT0(dn->dn_dirty_txg); - ASSERT0(dn->dn_assigned_txg); - ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); - ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1); - ASSERT(avl_is_empty(&dn->dn_dbufs)); - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT0(dn->dn_next_nblkptr[i]); - ASSERT0(dn->dn_next_nlevels[i]); - ASSERT0(dn->dn_next_indblkshift[i]); - ASSERT0(dn->dn_next_bonuslen[i]); - ASSERT0(dn->dn_next_bonustype[i]); - ASSERT0(dn->dn_rm_spillblk[i]); - ASSERT0(dn->dn_next_blksz[i]); - ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); - ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); - ASSERT3P(dn->dn_free_ranges[i], ==, NULL); - } - - dn->dn_type = ot; - dnode_setdblksz(dn, blocksize); - dn->dn_indblkshift = ibs; - dn->dn_nlevels = 1; - dn->dn_num_slots = dn_slots; - if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ - dn->dn_nblkptr = 1; - else { - dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR, - 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> - SPA_BLKPTRSHIFT)); - } - - dn->dn_bonustype = bonustype; - dn->dn_bonuslen = bonuslen; - dn->dn_checksum = ZIO_CHECKSUM_INHERIT; - dn->dn_compress = ZIO_COMPRESS_INHERIT; - dn->dn_dirtyctx = 0; - - dn->dn_free_txg = 0; - if (dn->dn_dirtyctx_firstset) { - kmem_free(dn->dn_dirtyctx_firstset, 1); - dn->dn_dirtyctx_firstset = NULL; - } - - dn->dn_allocated_txg = tx->tx_txg; - dn->dn_id_flags = 0; - - dnode_setdirty(dn, tx); - dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; - dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; - dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; - dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; -} - -void -dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) -{ - int nblkptr; - - ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); - ASSERT3U(blocksize, <=, - spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); - ASSERT0(blocksize % SPA_MINBLOCKSIZE); - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - ASSERT(tx->tx_txg != 0); - ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || - (bonustype != DMU_OT_NONE && bonuslen != 0) || - (bonustype == DMU_OT_SA && bonuslen == 0)); - ASSERT(DMU_OT_IS_VALID(bonustype)); - ASSERT3U(bonuslen, <=, - DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); - ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT)); - - dnode_free_interior_slots(dn); - DNODE_STAT_BUMP(dnode_reallocate); - - /* clean up any unreferenced dbufs */ - dnode_evict_dbufs(dn); - - dn->dn_id_flags = 0; 
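The dn_nblkptr choice in dnode_allocate() above trades unused bonus space for extra block pointers, capped at DN_MAX_NBLKPTR. A worked stand-alone version follows; the concrete constants (320 bytes of maximum bonus in a legacy 512-byte dnode, 128-byte block pointers, a cap of 3) are typical values assumed for the example, not taken from this diff:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Assumed legacy-dnode constants (see lead-in). */
#define MAX_BONUSLEN   320   /* max bonus bytes in a 512-byte dnode */
#define BLKPTR_SHIFT   7     /* block pointers are 128 bytes */
#define MAX_NBLKPTR    3

/* Unused bonus space is handed back as extra block pointers. */
static int
nblkptr_for_bonus(int bonuslen)
{
	return (MIN(MAX_NBLKPTR,
	    1 + ((MAX_BONUSLEN - bonuslen) >> BLKPTR_SHIFT)));
}

int
main(void)
{
	printf("%d\n", nblkptr_for_bonus(0));     /* 3: no bonus used    */
	printf("%d\n", nblkptr_for_bonus(200));   /* 1: little room left */
	printf("%d\n", nblkptr_for_bonus(320));   /* 1: bonus fills it   */
	return (0);
}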
- - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dnode_setdirty(dn, tx); - if (dn->dn_datablksz != blocksize) { - /* change blocksize */ - ASSERT(dn->dn_maxblkid == 0 && - (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || - dnode_block_freed(dn, 0))); - dnode_setdblksz(dn, blocksize); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; - } - if (dn->dn_bonuslen != bonuslen) - dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; - - if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ - nblkptr = 1; - else - nblkptr = MIN(DN_MAX_NBLKPTR, - 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> - SPA_BLKPTRSHIFT)); - if (dn->dn_bonustype != bonustype) - dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; - if (dn->dn_nblkptr != nblkptr) - dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; - if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - dbuf_rm_spill(dn, tx); - dnode_rm_spill(dn, tx); - } - rw_exit(&dn->dn_struct_rwlock); - - /* change type */ - dn->dn_type = ot; - - /* change bonus size and type */ - mutex_enter(&dn->dn_mtx); - dn->dn_bonustype = bonustype; - dn->dn_bonuslen = bonuslen; - dn->dn_num_slots = dn_slots; - dn->dn_nblkptr = nblkptr; - dn->dn_checksum = ZIO_CHECKSUM_INHERIT; - dn->dn_compress = ZIO_COMPRESS_INHERIT; - ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - - /* fix up the bonus db_size */ - if (dn->dn_bonus) { - dn->dn_bonus->db.db_size = - DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - - (dn->dn_nblkptr - 1) * sizeof (blkptr_t); - ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); - } - - dn->dn_allocated_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); -} - -#ifdef _KERNEL -static void -dnode_move_impl(dnode_t *odn, dnode_t *ndn) -{ - int i; - - ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); - ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); - ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); - ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); - - /* Copy fields. 
*/ - ndn->dn_objset = odn->dn_objset; - ndn->dn_object = odn->dn_object; - ndn->dn_dbuf = odn->dn_dbuf; - ndn->dn_handle = odn->dn_handle; - ndn->dn_phys = odn->dn_phys; - ndn->dn_type = odn->dn_type; - ndn->dn_bonuslen = odn->dn_bonuslen; - ndn->dn_bonustype = odn->dn_bonustype; - ndn->dn_nblkptr = odn->dn_nblkptr; - ndn->dn_checksum = odn->dn_checksum; - ndn->dn_compress = odn->dn_compress; - ndn->dn_nlevels = odn->dn_nlevels; - ndn->dn_indblkshift = odn->dn_indblkshift; - ndn->dn_datablkshift = odn->dn_datablkshift; - ndn->dn_datablkszsec = odn->dn_datablkszsec; - ndn->dn_datablksz = odn->dn_datablksz; - ndn->dn_maxblkid = odn->dn_maxblkid; - ndn->dn_num_slots = odn->dn_num_slots; - bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], - sizeof (odn->dn_next_type)); - bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], - sizeof (odn->dn_next_nblkptr)); - bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], - sizeof (odn->dn_next_nlevels)); - bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], - sizeof (odn->dn_next_indblkshift)); - bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], - sizeof (odn->dn_next_bonustype)); - bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], - sizeof (odn->dn_rm_spillblk)); - bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], - sizeof (odn->dn_next_bonuslen)); - bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], - sizeof (odn->dn_next_blksz)); - for (i = 0; i < TXG_SIZE; i++) { - list_move_tail(&ndn->dn_dirty_records[i], - &odn->dn_dirty_records[i]); - } - bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], - sizeof (odn->dn_free_ranges)); - ndn->dn_allocated_txg = odn->dn_allocated_txg; - ndn->dn_free_txg = odn->dn_free_txg; - ndn->dn_assigned_txg = odn->dn_assigned_txg; - ndn->dn_dirty_txg = odn->dn_dirty_txg; - ndn->dn_dirtyctx = odn->dn_dirtyctx; - ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; - ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0); - zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); - ASSERT(avl_is_empty(&ndn->dn_dbufs)); - avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); - ndn->dn_dbufs_count = odn->dn_dbufs_count; - ndn->dn_bonus = odn->dn_bonus; - ndn->dn_have_spill = odn->dn_have_spill; - ndn->dn_zio = odn->dn_zio; - ndn->dn_oldused = odn->dn_oldused; - ndn->dn_oldflags = odn->dn_oldflags; - ndn->dn_olduid = odn->dn_olduid; - ndn->dn_oldgid = odn->dn_oldgid; - ndn->dn_newuid = odn->dn_newuid; - ndn->dn_newgid = odn->dn_newgid; - ndn->dn_id_flags = odn->dn_id_flags; - dmu_zfetch_init(&ndn->dn_zfetch, NULL); - list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); - ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; - - /* - * Update back pointers. Updating the handle fixes the back pointer of - * every descendant dbuf as well as the bonus dbuf. - */ - ASSERT(ndn->dn_handle->dnh_dnode == odn); - ndn->dn_handle->dnh_dnode = ndn; - if (ndn->dn_zfetch.zf_dnode == odn) { - ndn->dn_zfetch.zf_dnode = ndn; - } - - /* - * Invalidate the original dnode by clearing all of its back pointers. - */ - odn->dn_dbuf = NULL; - odn->dn_handle = NULL; - avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_link)); - odn->dn_dbufs_count = 0; - odn->dn_bonus = NULL; - odn->dn_zfetch.zf_dnode = NULL; - - /* - * Set the low bit of the objset pointer to ensure that dnode_move() - * recognizes the dnode as invalid in any subsequent callback. - */ - POINTER_INVALIDATE(&odn->dn_objset); - - /* - * Satisfy the destructor. 
- */ - for (i = 0; i < TXG_SIZE; i++) { - list_create(&odn->dn_dirty_records[i], - sizeof (dbuf_dirty_record_t), - offsetof(dbuf_dirty_record_t, dr_dirty_node)); - odn->dn_free_ranges[i] = NULL; - odn->dn_next_nlevels[i] = 0; - odn->dn_next_indblkshift[i] = 0; - odn->dn_next_bonustype[i] = 0; - odn->dn_rm_spillblk[i] = 0; - odn->dn_next_bonuslen[i] = 0; - odn->dn_next_blksz[i] = 0; - } - odn->dn_allocated_txg = 0; - odn->dn_free_txg = 0; - odn->dn_assigned_txg = 0; - odn->dn_dirty_txg = 0; - odn->dn_dirtyctx = 0; - odn->dn_dirtyctx_firstset = NULL; - odn->dn_have_spill = B_FALSE; - odn->dn_zio = NULL; - odn->dn_oldused = 0; - odn->dn_oldflags = 0; - odn->dn_olduid = 0; - odn->dn_oldgid = 0; - odn->dn_newuid = 0; - odn->dn_newgid = 0; - odn->dn_id_flags = 0; - - /* - * Mark the dnode. - */ - ndn->dn_moved = 1; - odn->dn_moved = (uint8_t)-1; -} - -#ifdef illumos -/*ARGSUSED*/ -static kmem_cbrc_t -dnode_move(void *buf, void *newbuf, size_t size, void *arg) -{ - dnode_t *odn = buf, *ndn = newbuf; - objset_t *os; - int64_t refcount; - uint32_t dbufs; - - /* - * The dnode is on the objset's list of known dnodes if the objset - * pointer is valid. We set the low bit of the objset pointer when - * freeing the dnode to invalidate it, and the memory patterns written - * by kmem (baddcafe and deadbeef) set at least one of the two low bits. - * A newly created dnode sets the objset pointer last of all to indicate - * that the dnode is known and in a valid state to be moved by this - * function. - */ - os = odn->dn_objset; - if (!POINTER_IS_VALID(os)) { - DNODE_STAT_BUMP(dnode_move_invalid); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * Ensure that the objset does not go away during the move. - */ - rw_enter(&os_lock, RW_WRITER); - if (os != odn->dn_objset) { - rw_exit(&os_lock); - DNODE_STAT_BUMP(dnode_move_recheck1); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * If the dnode is still valid, then so is the objset. We know that no - * valid objset can be freed while we hold os_lock, so we can safely - * ensure that the objset remains in use. - */ - mutex_enter(&os->os_lock); - - /* - * Recheck the objset pointer in case the dnode was removed just before - * acquiring the lock. - */ - if (os != odn->dn_objset) { - mutex_exit(&os->os_lock); - rw_exit(&os_lock); - DNODE_STAT_BUMP(dnode_move_recheck2); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * At this point we know that as long as we hold os->os_lock, the dnode - * cannot be freed and fields within the dnode can be safely accessed. - * The objset listing this dnode cannot go away as long as this dnode is - * on its list. - */ - rw_exit(&os_lock); - if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { - mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_special); - return (KMEM_CBRC_NO); - } - ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ - - /* - * Lock the dnode handle to prevent the dnode from obtaining any new - * holds. This also prevents the descendant dbufs and the bonus dbuf - * from accessing the dnode, so that we can discount their holds. The - * handle is safe to access because we know that while the dnode cannot - * go away, neither can its handle. Once we hold dnh_zrlock, we can - * safely move any dnode referenced only by dbufs. - */ - if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { - mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_handle); - return (KMEM_CBRC_LATER); - } - - /* - * Ensure a consistent view of the dnode's holds and the dnode's dbufs. 
- * We need to guarantee that there is a hold for every dbuf in order to - * determine whether the dnode is actively referenced. Falsely matching - * a dbuf to an active hold would lead to an unsafe move. It's possible - * that a thread already having an active dnode hold is about to add a - * dbuf, and we can't compare hold and dbuf counts while the add is in - * progress. - */ - if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { - zrl_exit(&odn->dn_handle->dnh_zrlock); - mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_rwlock); - return (KMEM_CBRC_LATER); - } - - /* - * A dbuf may be removed (evicted) without an active dnode hold. In that - * case, the dbuf count is decremented under the handle lock before the - * dbuf's hold is released. This order ensures that if we count the hold - * after the dbuf is removed but before its hold is released, we will - * treat the unmatched hold as active and exit safely. If we count the - * hold before the dbuf is removed, the hold is discounted, and the - * removal is blocked until the move completes. - */ - refcount = zfs_refcount_count(&odn->dn_holds); - ASSERT(refcount >= 0); - dbufs = DN_DBUFS_COUNT(odn); - - /* We can't have more dbufs than dnode holds. */ - ASSERT3U(dbufs, <=, refcount); - DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, - uint32_t, dbufs); - - if (refcount > dbufs) { - rw_exit(&odn->dn_struct_rwlock); - zrl_exit(&odn->dn_handle->dnh_zrlock); - mutex_exit(&os->os_lock); - DNODE_STAT_BUMP(dnode_move_active); - return (KMEM_CBRC_LATER); - } - - rw_exit(&odn->dn_struct_rwlock); - - /* - * At this point we know that anyone with a hold on the dnode is not - * actively referencing it. The dnode is known and in a valid state to - * move. We're holding the locks needed to execute the critical section. - */ - dnode_move_impl(odn, ndn); - - list_link_replace(&odn->dn_link, &ndn->dn_link); - /* If the dnode was safe to move, the refcount cannot have changed. 
*/ - ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds)); - ASSERT(dbufs == DN_DBUFS_COUNT(ndn)); - zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ - mutex_exit(&os->os_lock); - - return (KMEM_CBRC_YES); -} -#endif /* illumos */ -#endif /* _KERNEL */ - -static void -dnode_slots_hold(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - zrl_add(&dnh->dnh_zrlock); - } -} - -static void -dnode_slots_rele(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - if (zrl_is_locked(&dnh->dnh_zrlock)) - zrl_exit(&dnh->dnh_zrlock); - else - zrl_remove(&dnh->dnh_zrlock); - } -} - -static int -dnode_slots_tryenter(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - if (!zrl_tryenter(&dnh->dnh_zrlock)) { - for (int j = idx; j < i; j++) { - dnh = &children->dnc_children[j]; - zrl_exit(&dnh->dnh_zrlock); - } - - return (0); - } - } - - return (1); -} - -static void -dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - dnh->dnh_dnode = ptr; - } -} - -static boolean_t -dnode_check_slots_free(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - /* - * If all dnode slots are either already free or - * evictable return B_TRUE. - */ - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - dnode_t *dn = dnh->dnh_dnode; - - if (dn == DN_SLOT_FREE) { - continue; - } else if (DN_SLOT_IS_PTR(dn)) { - mutex_enter(&dn->dn_mtx); - boolean_t can_free = (dn->dn_type == DMU_OT_NONE && - zfs_refcount_is_zero(&dn->dn_holds) && - !DNODE_IS_DIRTY(dn)); - mutex_exit(&dn->dn_mtx); - - if (!can_free) - return (B_FALSE); - else - continue; - } else { - return (B_FALSE); - } - } - - return (B_TRUE); -} - -static void -dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) -{ - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - for (int i = idx; i < idx + slots; i++) { - dnode_handle_t *dnh = &children->dnc_children[i]; - - ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); - - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); - dnode_destroy(dnh->dnh_dnode); - dnh->dnh_dnode = DN_SLOT_FREE; - } - } -} - -void -dnode_free_interior_slots(dnode_t *dn) -{ - dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db); - int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT; - int idx = (dn->dn_object & (epb - 1)) + 1; - int slots = dn->dn_num_slots - 1; - - if (slots == 0) - return; - - ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); - - while (!dnode_slots_tryenter(children, idx, slots)) - DNODE_STAT_BUMP(dnode_free_interior_lock_retry); - - dnode_set_slots(children, idx, slots, DN_SLOT_FREE); - dnode_slots_rele(children, idx, slots); -} - -void -dnode_special_close(dnode_handle_t *dnh) -{ - dnode_t *dn = dnh->dnh_dnode; - - /* - * Wait for final references to the dnode to clear. 
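dnode_slots_tryenter() above is an all-or-nothing lock attempt: it try-locks each slot in order and, on the first failure, backs out everything it already took so the caller can drop back and retry. The same idiom with POSIX mutexes (a generic sketch of the pattern, not the kernel zrlock API; slot_lock and the slot count are made up):

#include <pthread.h>
#include <stdio.h>

#define NSLOTS 8

static pthread_mutex_t slot_lock[NSLOTS];

/*
 * Try to lock slots [idx, idx + slots); on any failure release what we
 * already hold and report failure so the caller can retry.
 */
static int
slots_tryenter(int idx, int slots)
{
	for (int i = idx; i < idx + slots; i++) {
		if (pthread_mutex_trylock(&slot_lock[i]) != 0) {
			for (int j = idx; j < i; j++)
				pthread_mutex_unlock(&slot_lock[j]);
			return (0);
		}
	}
	return (1);
}

static void
slots_exit(int idx, int slots)
{
	for (int i = idx; i < idx + slots; i++)
		pthread_mutex_unlock(&slot_lock[i]);
}

int
main(void)
{
	for (int i = 0; i < NSLOTS; i++)
		pthread_mutex_init(&slot_lock[i], NULL);

	if (slots_tryenter(2, 3)) {
		/* ... operate on slots 2..4 under the locks ... */
		slots_exit(2, 3);
	}
	puts("done");
	return (0);
}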
This can - * only happen if the arc is asynchronously evicting state that - * has a hold on this dnode while we are trying to evict this - * dnode. - */ - while (zfs_refcount_count(&dn->dn_holds) > 0) - delay(1); - ASSERT(dn->dn_dbuf == NULL || - dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); - zrl_add(&dnh->dnh_zrlock); - dnode_destroy(dn); /* implicit zrl_remove() */ - zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = NULL; -} - -void -dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, - dnode_handle_t *dnh) -{ - dnode_t *dn; - - zrl_init(&dnh->dnh_zrlock); - zrl_tryenter(&dnh->dnh_zrlock); - - dn = dnode_create(os, dnp, NULL, object, dnh); - DNODE_VERIFY(dn); - - zrl_exit(&dnh->dnh_zrlock); -} - -static void -dnode_buf_evict_async(void *dbu) -{ - dnode_children_t *dnc = dbu; - - DNODE_STAT_BUMP(dnode_buf_evict); - - for (int i = 0; i < dnc->dnc_count; i++) { - dnode_handle_t *dnh = &dnc->dnc_children[i]; - dnode_t *dn; - - /* - * The dnode handle lock guards against the dnode moving to - * another valid address, so there is no need here to guard - * against changes to or from NULL. - */ - if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = DN_SLOT_UNINIT; - continue; - } - - zrl_add(&dnh->dnh_zrlock); - dn = dnh->dnh_dnode; - /* - * If there are holds on this dnode, then there should - * be holds on the dnode's containing dbuf as well; thus - * it wouldn't be eligible for eviction and this function - * would not have been called. - */ - ASSERT(zfs_refcount_is_zero(&dn->dn_holds)); - ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); - - dnode_destroy(dn); /* implicit zrl_remove() for first slot */ - zrl_destroy(&dnh->dnh_zrlock); - dnh->dnh_dnode = DN_SLOT_UNINIT; - } - kmem_free(dnc, sizeof (dnode_children_t) + - dnc->dnc_count * sizeof (dnode_handle_t)); -} - -/* - * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used - * to ensure the hole at the specified object offset is large enough to - * hold the dnode being created. The slots parameter is also used to ensure - * a dnode does not span multiple dnode blocks. In both of these cases, if - * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases - * are only possible when using DNODE_MUST_BE_FREE. - * - * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0. - * dnode_hold_impl() will check if the requested dnode is already consumed - * as an extra dnode slot by an large dnode, in which case it returns - * ENOENT. - * - * errors: - * EINVAL - invalid object number or flags. - * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE) - * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE) - * - Refers to a freeing dnode (DNODE_MUST_BE_FREE) - * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED) - * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED) - * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED) - * EIO - i/o error error when reading the meta dnode dbuf. - * succeeds even for free dnodes. 
- */ -int -dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, - void *tag, dnode_t **dnp) -{ - int epb, idx, err, i; - int drop_struct_lock = FALSE; - int type; - uint64_t blk; - dnode_t *mdn, *dn; - dmu_buf_impl_t *db; - dnode_children_t *dnc; - dnode_phys_t *dn_block; - dnode_phys_t *dn_block_begin; - dnode_handle_t *dnh; - - ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); - ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); - - /* - * If you are holding the spa config lock as writer, you shouldn't - * be asking the DMU to do *anything* unless it's the root pool - * which may require us to read from the root filesystem while - * holding some (not all) of the locks as writer. - */ - ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || - (spa_is_root(os->os_spa) && - spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); - - ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE)); - - if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { - dn = (object == DMU_USERUSED_OBJECT) ? - DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); - if (dn == NULL) - return (SET_ERROR(ENOENT)); - type = dn->dn_type; - if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) - return (SET_ERROR(ENOENT)); - if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) - return (SET_ERROR(EEXIST)); - DNODE_VERIFY(dn); - (void) zfs_refcount_add(&dn->dn_holds, tag); - *dnp = dn; - return (0); - } - - if (object == 0 || object >= DN_MAX_OBJECT) - return (SET_ERROR(EINVAL)); - - mdn = DMU_META_DNODE(os); - ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); - - DNODE_VERIFY(mdn); - - if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { - rw_enter(&mdn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - - blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); - - db = dbuf_hold(mdn, blk, FTAG); - if (drop_struct_lock) - rw_exit(&mdn->dn_struct_rwlock); - if (db == NULL) { - DNODE_STAT_BUMP(dnode_hold_dbuf_hold); - return (SET_ERROR(EIO)); - } - err = dbuf_read(db, NULL, DB_RF_CANFAIL); - if (err) { - DNODE_STAT_BUMP(dnode_hold_dbuf_read); - dbuf_rele(db, FTAG); - return (err); - } - - ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; - - idx = object & (epb - 1); - dn_block = (dnode_phys_t *)db->db.db_data; - - ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); - dnc = dmu_buf_get_user(&db->db); - dnh = NULL; - if (dnc == NULL) { - dnode_children_t *winner; - int skip = 0; - - dnc = kmem_zalloc(sizeof (dnode_children_t) + - epb * sizeof (dnode_handle_t), KM_SLEEP); - dnc->dnc_count = epb; - dnh = &dnc->dnc_children[0]; - - /* Initialize dnode slot status from dnode_phys_t */ - for (int i = 0; i < epb; i++) { - zrl_init(&dnh[i].dnh_zrlock); - - if (skip) { - skip--; - continue; - } - - if (dn_block[i].dn_type != DMU_OT_NONE) { - int interior = dn_block[i].dn_extra_slots; - - dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED); - dnode_set_slots(dnc, i + 1, interior, - DN_SLOT_INTERIOR); - skip = interior; - } else { - dnh[i].dnh_dnode = DN_SLOT_FREE; - skip = 0; - } - } - - dmu_buf_init_user(&dnc->dnc_dbu, NULL, - dnode_buf_evict_async, NULL); - winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu); - if (winner != NULL) { - - for (int i = 0; i < epb; i++) - zrl_destroy(&dnh[i].dnh_zrlock); - - kmem_free(dnc, sizeof (dnode_children_t) + - epb * sizeof (dnode_handle_t)); - dnc = winner; - } - } - - ASSERT(dnc->dnc_count == epb); - dn = DN_SLOT_UNINIT; - - if (flag & DNODE_MUST_BE_ALLOCATED) { - slots = 1; - - while (dn == DN_SLOT_UNINIT) { - 
dnode_slots_hold(dnc, idx, slots); - dnh = &dnc->dnc_children[idx]; - - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - dn = dnh->dnh_dnode; - break; - } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) { - DNODE_STAT_BUMP(dnode_hold_alloc_interior); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(EEXIST)); - } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) { - DNODE_STAT_BUMP(dnode_hold_alloc_misses); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOENT)); - } - - dnode_slots_rele(dnc, idx, slots); - if (!dnode_slots_tryenter(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); - continue; - } - - /* - * Someone else won the race and called dnode_create() - * after we checked DN_SLOT_IS_PTR() above but before - * we acquired the lock. - */ - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses); - dn = dnh->dnh_dnode; - } else { - dn = dnode_create(os, dn_block + idx, db, - object, dnh); - } - } - - mutex_enter(&dn->dn_mtx); - if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { - DNODE_STAT_BUMP(dnode_hold_alloc_type_none); - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOENT)); - } - - DNODE_STAT_BUMP(dnode_hold_alloc_hits); - } else if (flag & DNODE_MUST_BE_FREE) { - - if (idx + slots - 1 >= DNODES_PER_BLOCK) { - DNODE_STAT_BUMP(dnode_hold_free_overflow); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOSPC)); - } - - while (dn == DN_SLOT_UNINIT) { - dnode_slots_hold(dnc, idx, slots); - - if (!dnode_check_slots_free(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_free_misses); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOSPC)); - } - - dnode_slots_rele(dnc, idx, slots); - if (!dnode_slots_tryenter(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_free_lock_retry); - continue; - } - - if (!dnode_check_slots_free(dnc, idx, slots)) { - DNODE_STAT_BUMP(dnode_hold_free_lock_misses); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(ENOSPC)); - } - - /* - * Allocated but otherwise free dnodes which would - * be in the interior of a multi-slot dnodes need - * to be freed. Single slot dnodes can be safely - * re-purposed as a performance optimization. - */ - if (slots > 1) - dnode_reclaim_slots(dnc, idx + 1, slots - 1); - - dnh = &dnc->dnc_children[idx]; - if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { - dn = dnh->dnh_dnode; - } else { - dn = dnode_create(os, dn_block + idx, db, - object, dnh); - } - } - - mutex_enter(&dn->dn_mtx); - if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) { - DNODE_STAT_BUMP(dnode_hold_free_refcount); - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR(EEXIST)); - } - - dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); - DNODE_STAT_BUMP(dnode_hold_free_hits); - } else { - dbuf_rele(db, FTAG); - return (SET_ERROR(EINVAL)); - } - - if (dn->dn_free_txg) { - DNODE_STAT_BUMP(dnode_hold_free_txg); - type = dn->dn_type; - mutex_exit(&dn->dn_mtx); - dnode_slots_rele(dnc, idx, slots); - dbuf_rele(db, FTAG); - return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ? - ENOENT : EEXIST)); - } - - if (zfs_refcount_add(&dn->dn_holds, tag) == 1) - dbuf_add_ref(db, dnh); - - mutex_exit(&dn->dn_mtx); - - /* Now we can rely on the hold to prevent the dnode from moving. 
*/ - dnode_slots_rele(dnc, idx, slots); - - DNODE_VERIFY(dn); - ASSERT3P(dn->dn_dbuf, ==, db); - ASSERT3U(dn->dn_object, ==, object); - dbuf_rele(db, FTAG); - - *dnp = dn; - return (0); -} - -/* - * Return held dnode if the object is allocated, NULL if not. - */ -int -dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) -{ - return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, - dnp)); -} - -/* - * Can only add a reference if there is already at least one - * reference on the dnode. Returns FALSE if unable to add a - * new reference. - */ -boolean_t -dnode_add_ref(dnode_t *dn, void *tag) -{ - mutex_enter(&dn->dn_mtx); - if (zfs_refcount_is_zero(&dn->dn_holds)) { - mutex_exit(&dn->dn_mtx); - return (FALSE); - } - VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); - mutex_exit(&dn->dn_mtx); - return (TRUE); -} - -void -dnode_rele(dnode_t *dn, void *tag) -{ - mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, tag, B_FALSE); -} - -void -dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) -{ - uint64_t refs; - /* Get while the hold prevents the dnode from moving. */ - dmu_buf_impl_t *db = dn->dn_dbuf; - dnode_handle_t *dnh = dn->dn_handle; - - refs = zfs_refcount_remove(&dn->dn_holds, tag); - mutex_exit(&dn->dn_mtx); - - /* - * It's unsafe to release the last hold on a dnode by dnode_rele() or - * indirectly by dbuf_rele() while relying on the dnode handle to - * prevent the dnode from moving, since releasing the last hold could - * result in the dnode's parent dbuf evicting its dnode handles. For - * that reason anyone calling dnode_rele() or dbuf_rele() without some - * other direct or indirect hold on the dnode must first drop the dnode - * handle. - */ - ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); - - /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ - if (refs == 0 && db != NULL) { - /* - * Another thread could add a hold to the dnode handle in - * dnode_hold_impl() while holding the parent dbuf. Since the - * hold on the parent dbuf prevents the handle from being - * destroyed, the hold on the handle is OK. We can't yet assert - * that the handle has zero references, but that will be - * asserted anyway when the handle gets destroyed. - */ - mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, dnh, evicting); - } -} - -void -dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) -{ - objset_t *os = dn->dn_objset; - uint64_t txg = tx->tx_txg; - - if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - dsl_dataset_dirty(os->os_dsl_dataset, tx); - return; - } - - DNODE_VERIFY(dn); - -#ifdef ZFS_DEBUG - mutex_enter(&dn->dn_mtx); - ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); - ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); - mutex_exit(&dn->dn_mtx); -#endif - - /* - * Determine old uid/gid when necessary - */ - dmu_objset_userquota_get_ids(dn, B_TRUE, tx); - - multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK]; - multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); - - /* - * If we are already marked dirty, we're done. 
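dnode_add_ref() above deliberately refuses to take the very first hold: a new reference may only piggyback on an existing one, because a zero count means the dnode may already be on its way to destruction. A small stand-alone version of that rule with a counter under a mutex (the struct and function names are mine; only the "no zero-to-one transition" rule comes from the code above):

#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t lock;
	long holds;
};

/* Add a hold only if the object is already held; mirrors dnode_add_ref(). */
static int
obj_add_ref(struct obj *o)
{
	int ok;

	pthread_mutex_lock(&o->lock);
	ok = (o->holds > 0);
	if (ok)
		o->holds++;
	pthread_mutex_unlock(&o->lock);
	return (ok);
}

int
main(void)
{
	struct obj o = { PTHREAD_MUTEX_INITIALIZER, 0 };

	printf("%d\n", obj_add_ref(&o));  /* 0: nobody holds it yet */
	o.holds = 1;                      /* e.g. the hold taken by a lookup */
	printf("%d\n", obj_add_ref(&o));  /* 1: piggybacks on that hold */
	return (0);
}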
- */ - if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { - multilist_sublist_unlock(mls); - return; - } - - ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) || - !avl_is_empty(&dn->dn_dbufs)); - ASSERT(dn->dn_datablksz != 0); - ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); - ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); - ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]); - - dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", - dn->dn_object, txg); - - multilist_sublist_insert_head(mls, dn); - - multilist_sublist_unlock(mls); - - /* - * The dnode maintains a hold on its containing dbuf as - * long as there are holds on it. Each instantiated child - * dbuf maintains a hold on the dnode. When the last child - * drops its hold, the dnode will drop its hold on the - * containing dbuf. We add a "dirty hold" here so that the - * dnode will hang around after we finish processing its - * children. - */ - VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); - - (void) dbuf_dirty(dn->dn_dbuf, tx); - - dsl_dataset_dirty(os->os_dsl_dataset, tx); -} - -void -dnode_free(dnode_t *dn, dmu_tx_t *tx) -{ - mutex_enter(&dn->dn_mtx); - if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { - mutex_exit(&dn->dn_mtx); - return; - } - dn->dn_free_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - - dnode_setdirty(dn, tx); -} - -/* - * Try to change the block size for the indicated dnode. This can only - * succeed if there are no blocks allocated or dirty beyond first block - */ -int -dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - int err; - - ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); - if (size == 0) - size = SPA_MINBLOCKSIZE; - else - size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); - - if (ibs == dn->dn_indblkshift) - ibs = 0; - - if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) - return (0); - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - - /* Check for any allocated blocks beyond the first */ - if (dn->dn_maxblkid != 0) - goto fail; - - mutex_enter(&dn->dn_dbufs_mtx); - for (db = avl_first(&dn->dn_dbufs); db != NULL; - db = AVL_NEXT(&dn->dn_dbufs, db)) { - if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && - db->db_blkid != DMU_SPILL_BLKID) { - mutex_exit(&dn->dn_dbufs_mtx); - goto fail; - } - } - mutex_exit(&dn->dn_dbufs_mtx); - - if (ibs && dn->dn_nlevels != 1) - goto fail; - - /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) - dbuf_new_size(db, size, tx); - else if (err != ENOENT) - goto fail; - - dnode_setdblksz(dn, size); - dnode_setdirty(dn, tx); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; - if (ibs) { - dn->dn_indblkshift = ibs; - dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; - } - /* rele after we have fixed the blocksize in the dnode */ - if (db) - dbuf_rele(db, FTAG); - - rw_exit(&dn->dn_struct_rwlock); - return (0); - -fail: - rw_exit(&dn->dn_struct_rwlock); - return (SET_ERROR(ENOTSUP)); -} - -/* read-holding callers must not rely on the lock being continuously held */ -void -dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) -{ - uint64_t txgoff = tx->tx_txg & TXG_MASK; - int epbs, new_nlevels; - uint64_t sz; - - ASSERT(blkid != DMU_BONUS_BLKID); - - ASSERT(have_read ? - RW_READ_HELD(&dn->dn_struct_rwlock) : - RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - /* - * if we have a read-lock, check to see if we need to do any work - * before upgrading to a write-lock. 
- */ - if (have_read) { - if (blkid <= dn->dn_maxblkid) - return; - - if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { - rw_exit(&dn->dn_struct_rwlock); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - } - } - - if (blkid <= dn->dn_maxblkid) - goto out; - - dn->dn_maxblkid = blkid; - - /* - * Compute the number of levels necessary to support the new maxblkid. - */ - new_nlevels = 1; - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (sz = dn->dn_nblkptr; - sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) - new_nlevels++; - - if (new_nlevels > dn->dn_nlevels) { - int old_nlevels = dn->dn_nlevels; - dmu_buf_impl_t *db; - list_t *list; - dbuf_dirty_record_t *new, *dr, *dr_next; - - dn->dn_nlevels = new_nlevels; - - ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); - dn->dn_next_nlevels[txgoff] = new_nlevels; - - /* dirty the left indirects */ - db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); - ASSERT(db != NULL); - new = dbuf_dirty(db, tx); - dbuf_rele(db, FTAG); - - /* transfer the dirty records to the new indirect */ - mutex_enter(&dn->dn_mtx); - mutex_enter(&new->dt.di.dr_mtx); - list = &dn->dn_dirty_records[txgoff]; - for (dr = list_head(list); dr; dr = dr_next) { - dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); - if (dr->dr_dbuf->db_level != new_nlevels-1 && - dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); - list_remove(&dn->dn_dirty_records[txgoff], dr); - list_insert_tail(&new->dt.di.dr_children, dr); - dr->dr_parent = new; - } - } - mutex_exit(&new->dt.di.dr_mtx); - mutex_exit(&dn->dn_mtx); - } - -out: - if (have_read) - rw_downgrade(&dn->dn_struct_rwlock); -} - -static void -dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); - if (db != NULL) { - dmu_buf_will_dirty(&db->db, tx); - dbuf_rele(db, FTAG); - } -} - -/* - * Dirty all the in-core level-1 dbufs in the range specified by start_blkid - * and end_blkid. - */ -static void -dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, - dmu_tx_t *tx) -{ - dmu_buf_impl_t db_search; - dmu_buf_impl_t *db; - avl_index_t where; - - mutex_enter(&dn->dn_dbufs_mtx); - - db_search.db_level = 1; - db_search.db_blkid = start_blkid + 1; - db_search.db_state = DB_SEARCH; - for (;;) { - - db = avl_find(&dn->dn_dbufs, &db_search, &where); - if (db == NULL) - db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); - - if (db == NULL || db->db_level != 1 || - db->db_blkid >= end_blkid) { - break; - } - - /* - * Setup the next blkid we want to search for. - */ - db_search.db_blkid = db->db_blkid + 1; - ASSERT3U(db->db_blkid, >=, start_blkid); - - /* - * If the dbuf transitions to DB_EVICTING while we're trying - * to dirty it, then we will be unable to discover it in - * the dbuf hash table. This will result in a call to - * dbuf_create() which needs to acquire the dn_dbufs_mtx - * lock. To avoid a deadlock, we drop the lock before - * dirtying the level-1 dbuf. - */ - mutex_exit(&dn->dn_dbufs_mtx); - dnode_dirty_l1(dn, db->db_blkid, tx); - mutex_enter(&dn->dn_dbufs_mtx); - } - -#ifdef ZFS_DEBUG - /* - * Walk all the in-core level-1 dbufs and verify they have been dirtied. 
- */ - db_search.db_level = 1; - db_search.db_blkid = start_blkid + 1; - db_search.db_state = DB_SEARCH; - db = avl_find(&dn->dn_dbufs, &db_search, &where); - if (db == NULL) - db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); - for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { - if (db->db_level != 1 || db->db_blkid >= end_blkid) - break; - ASSERT(db->db_dirtycnt > 0); - } -#endif - mutex_exit(&dn->dn_dbufs_mtx); -} - -void -dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - uint64_t blkoff, blkid, nblks; - int blksz, blkshift, head, tail; - int trunc = FALSE; - int epbs; - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - blksz = dn->dn_datablksz; - blkshift = dn->dn_datablkshift; - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - if (len == DMU_OBJECT_END) { - len = UINT64_MAX - off; - trunc = TRUE; - } - - /* - * First, block align the region to free: - */ - if (ISP2(blksz)) { - head = P2NPHASE(off, blksz); - blkoff = P2PHASE(off, blksz); - if ((off >> blkshift) > dn->dn_maxblkid) - goto out; - } else { - ASSERT(dn->dn_maxblkid == 0); - if (off == 0 && len >= blksz) { - /* - * Freeing the whole block; fast-track this request. - */ - blkid = 0; - nblks = 1; - if (dn->dn_nlevels > 1) - dnode_dirty_l1(dn, 0, tx); - goto done; - } else if (off >= blksz) { - /* Freeing past end-of-data */ - goto out; - } else { - /* Freeing part of the block. */ - head = blksz - off; - ASSERT3U(head, >, 0); - } - blkoff = off; - } - /* zero out any partial block data at the start of the range */ - if (head) { - ASSERT3U(blkoff + head, ==, blksz); - if (len < head) - head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), - TRUE, FALSE, FTAG, &db) == 0) { - caddr_t data; - - /* don't dirty if it isn't on disk and isn't dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - data = db->db.db_data; - bzero(data + blkoff, head); - } - dbuf_rele(db, FTAG); - } - off += head; - len -= head; - } - - /* If the range was less than one block, we're done */ - if (len == 0) - goto out; - - /* If the remaining range is past end of file, we're done */ - if ((off >> blkshift) > dn->dn_maxblkid) - goto out; - - ASSERT(ISP2(blksz)); - if (trunc) - tail = 0; - else - tail = P2PHASE(len, blksz); - - ASSERT0(P2PHASE(off, blksz)); - /* zero out any partial block data at the end of the range */ - if (tail) { - if (len < tail) - tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), - TRUE, FALSE, FTAG, &db) == 0) { - /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); - dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - bzero(db->db.db_data, tail); - } - dbuf_rele(db, FTAG); - } - len -= tail; - } - - /* If the range did not include a full block, we are done */ - if (len == 0) - goto out; - - ASSERT(IS_P2ALIGNED(off, blksz)); - ASSERT(trunc || IS_P2ALIGNED(len, blksz)); - blkid = off >> blkshift; - nblks = len >> blkshift; - if (trunc) - nblks += 1; - - /* - * Dirty all the indirect blocks in this range. Note that only - * the first and last indirect blocks can actually be written - * (if they were partially freed) -- they must be dirtied, even if - * they do not exist on disk yet. 
The interior blocks will - * be freed by free_children(), so they will not actually be written. - * Even though these interior blocks will not be written, we - * dirty them for two reasons: - * - * - It ensures that the indirect blocks remain in memory until - * syncing context. (They have already been prefetched by - * dmu_tx_hold_free(), so we don't have to worry about reading - * them serially here.) - * - * - The dirty space accounting will put pressure on the txg sync - * mechanism to begin syncing, and to delay transactions if there - * is a large amount of freeing. Even though these indirect - * blocks will not be written, we could need to write the same - * amount of space if we copy the freed BPs into deadlists. - */ - if (dn->dn_nlevels > 1) { - uint64_t first, last; - - first = blkid >> epbs; - dnode_dirty_l1(dn, first, tx); - if (trunc) - last = dn->dn_maxblkid >> epbs; - else - last = (blkid + nblks - 1) >> epbs; - if (last != first) - dnode_dirty_l1(dn, last, tx); - - dnode_dirty_l1range(dn, first, last, tx); - - int shift = dn->dn_datablkshift + dn->dn_indblkshift - - SPA_BLKPTRSHIFT; - for (uint64_t i = first + 1; i < last; i++) { - /* - * Set i to the blockid of the next non-hole - * level-1 indirect block at or after i. Note - * that dnode_next_offset() operates in terms of - * level-0-equivalent bytes. - */ - uint64_t ibyte = i << shift; - int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, - &ibyte, 2, 1, 0); - i = ibyte >> shift; - if (i >= last) - break; - - /* - * Normally we should not see an error, either - * from dnode_next_offset() or dbuf_hold_level() - * (except for ESRCH from dnode_next_offset). - * If there is an i/o error, then when we read - * this block in syncing context, it will use - * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according - * to the "failmode" property. dnode_next_offset() - * doesn't have a flag to indicate MUSTSUCCEED. - */ - if (err != 0) - break; - - dnode_dirty_l1(dn, i, tx); - } - } - -done: - /* - * Add this range to the dnode range list. - * We will finish up this free operation in the syncing phase. - */ - mutex_enter(&dn->dn_mtx); - int txgoff = tx->tx_txg & TXG_MASK; - if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); - } - range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); - range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); - dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", - blkid, nblks, tx->tx_txg); - mutex_exit(&dn->dn_mtx); - - dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); - dnode_setdirty(dn, tx); -out: - - rw_exit(&dn->dn_struct_rwlock); -} - -static boolean_t -dnode_spill_freed(dnode_t *dn) -{ - int i; - - mutex_enter(&dn->dn_mtx); - for (i = 0; i < TXG_SIZE; i++) { - if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) - break; - } - mutex_exit(&dn->dn_mtx); - return (i < TXG_SIZE); -} - -/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ -uint64_t -dnode_block_freed(dnode_t *dn, uint64_t blkid) -{ - void *dp = spa_get_dsl(dn->dn_objset->os_spa); - int i; - - if (blkid == DMU_BONUS_BLKID) - return (FALSE); - - /* - * If we're in the process of opening the pool, dp will not be - * set yet, but there shouldn't be anything dirty. 
- */ - if (dp == NULL) - return (FALSE); - - if (dn->dn_free_txg) - return (TRUE); - - if (blkid == DMU_SPILL_BLKID) - return (dnode_spill_freed(dn)); - - mutex_enter(&dn->dn_mtx); - for (i = 0; i < TXG_SIZE; i++) { - if (dn->dn_free_ranges[i] != NULL && - range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) - break; - } - mutex_exit(&dn->dn_mtx); - return (i < TXG_SIZE); -} - -/* call from syncing context when we actually write/free space for this dnode */ -void -dnode_diduse_space(dnode_t *dn, int64_t delta) -{ - uint64_t space; - dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", - dn, dn->dn_phys, - (u_longlong_t)dn->dn_phys->dn_used, - (longlong_t)delta); - - mutex_enter(&dn->dn_mtx); - space = DN_USED_BYTES(dn->dn_phys); - if (delta > 0) { - ASSERT3U(space + delta, >=, space); /* no overflow */ - } else { - ASSERT3U(space, >=, -delta); /* no underflow */ - } - space += delta; - if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { - ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); - ASSERT0(P2PHASE(space, 1<dn_phys->dn_used = space >> DEV_BSHIFT; - } else { - dn->dn_phys->dn_used = space; - dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; - } - mutex_exit(&dn->dn_mtx); -} - -/* - * Scans a block at the indicated "level" looking for a hole or data, - * depending on 'flags'. - * - * If level > 0, then we are scanning an indirect block looking at its - * pointers. If level == 0, then we are looking at a block of dnodes. - * - * If we don't find what we are looking for in the block, we return ESRCH. - * Otherwise, return with *offset pointing to the beginning (if searching - * forwards) or end (if searching backwards) of the range covered by the - * block pointer we matched on (or dnode). - * - * The basic search algorithm used below by dnode_next_offset() is to - * use this function to search up the block tree (widen the search) until - * we find something (i.e., we don't return ESRCH) and then search back - * down the tree (narrow the search) until we reach our original search - * level. - */ -static int -dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - int lvl, uint64_t blkfill, uint64_t txg) -{ - dmu_buf_impl_t *db = NULL; - void *data = NULL; - uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - uint64_t epb = 1ULL << epbs; - uint64_t minfill, maxfill; - boolean_t hole; - int i, inc, error, span; - - dprintf("probing object %llu offset %llx level %d of %u\n", - dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); - - hole = ((flags & DNODE_FIND_HOLE) != 0); - inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; - ASSERT(txg == 0 || !hole); - - if (lvl == dn->dn_phys->dn_nlevels) { - error = 0; - epb = dn->dn_phys->dn_nblkptr; - data = dn->dn_phys->dn_blkptr; - } else { - uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); - if (error) { - if (error != ENOENT) - return (error); - if (hole) - return (0); - /* - * This can only happen when we are searching up - * the block tree for data. We don't really need to - * adjust the offset, as we will just end up looking - * at the pointer to this block in its parent, and its - * going to be unallocated, so we will skip over it. 
- */ - return (SET_ERROR(ESRCH)); - } - error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); - if (error) { - dbuf_rele(db, FTAG); - return (error); - } - data = db->db.db_data; - } - - - if (db != NULL && txg != 0 && (db->db_blkptr == NULL || - db->db_blkptr->blk_birth <= txg || - BP_IS_HOLE(db->db_blkptr))) { - /* - * This can only happen when we are searching up the tree - * and these conditions mean that we need to keep climbing. - */ - error = SET_ERROR(ESRCH); - } else if (lvl == 0) { - dnode_phys_t *dnp = data; - - ASSERT(dn->dn_type == DMU_OT_DNODE); - ASSERT(!(flags & DNODE_FIND_BACKWARDS)); - - for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); - i < blkfill; i += dnp[i].dn_extra_slots + 1) { - if ((dnp[i].dn_type == DMU_OT_NONE) == hole) - break; - } - - if (i == blkfill) - error = SET_ERROR(ESRCH); - - *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + - (i << DNODE_SHIFT); - } else { - blkptr_t *bp = data; - uint64_t start = *offset; - span = (lvl - 1) * epbs + dn->dn_datablkshift; - minfill = 0; - maxfill = blkfill << ((lvl - 1) * epbs); - - if (hole) - maxfill--; - else - minfill++; - - *offset = *offset >> span; - for (i = BF64_GET(*offset, 0, epbs); - i >= 0 && i < epb; i += inc) { - if (BP_GET_FILL(&bp[i]) >= minfill && - BP_GET_FILL(&bp[i]) <= maxfill && - (hole || bp[i].blk_birth > txg)) - break; - if (inc > 0 || *offset > 0) - *offset += inc; - } - *offset = *offset << span; - if (inc < 0) { - /* traversing backwards; position offset at the end */ - ASSERT3U(*offset, <=, start); - *offset = MIN(*offset + (1ULL << span) - 1, start); - } else if (*offset < start) { - *offset = start; - } - if (i < 0 || i >= epb) - error = SET_ERROR(ESRCH); - } - - if (db) - dbuf_rele(db, FTAG); - - return (error); -} - -/* - * Find the next hole, data, or sparse region at or after *offset. - * The value 'blkfill' tells us how many items we expect to find - * in an L0 data block; this value is 1 for normal objects, - * DNODES_PER_BLOCK for the meta dnode, and some fraction of - * DNODES_PER_BLOCK when searching for sparse regions thereof. - * - * Examples: - * - * dnode_next_offset(dn, flags, offset, 1, 1, 0); - * Finds the next/previous hole/data in a file. - * Used in dmu_offset_next(). - * - * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); - * Finds the next free/allocated dnode an objset's meta-dnode. - * Only finds objects that have new contents since txg (ie. - * bonus buffer changes and content removal are ignored). - * Used in dmu_object_next(). - * - * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); - * Finds the next L2 meta-dnode bp that's at most 1/4 full. - * Used in dmu_object_alloc(). 
- */ -int -dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, - int minlvl, uint64_t blkfill, uint64_t txg) -{ - uint64_t initial_offset = *offset; - int lvl, maxlvl; - int error = 0; - - if (!(flags & DNODE_FIND_HAVELOCK)) - rw_enter(&dn->dn_struct_rwlock, RW_READER); - - if (dn->dn_phys->dn_nlevels == 0) { - error = SET_ERROR(ESRCH); - goto out; - } - - if (dn->dn_datablkshift == 0) { - if (*offset < dn->dn_datablksz) { - if (flags & DNODE_FIND_HOLE) - *offset = dn->dn_datablksz; - } else { - error = SET_ERROR(ESRCH); - } - goto out; - } - - maxlvl = dn->dn_phys->dn_nlevels; - - for (lvl = minlvl; lvl <= maxlvl; lvl++) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); - if (error != ESRCH) - break; - } - - while (error == 0 && --lvl >= minlvl) { - error = dnode_next_offset_level(dn, - flags, offset, lvl, blkfill, txg); - } - - /* - * There's always a "virtual hole" at the end of the object, even - * if all BP's which physically exist are non-holes. - */ - if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && - minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { - error = 0; - } - - if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? - initial_offset < *offset : initial_offset > *offset)) - error = SET_ERROR(ESRCH); -out: - if (!(flags & DNODE_FIND_HAVELOCK)) - rw_exit(&dn->dn_struct_rwlock); - - return (error); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ /dev/null @@ -1,779 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static void -dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - int txgoff = tx->tx_txg & TXG_MASK; - int nblkptr = dn->dn_phys->dn_nblkptr; - int old_toplvl = dn->dn_phys->dn_nlevels - 1; - int new_level = dn->dn_next_nlevels[txgoff]; - int i; - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - - /* this dnode can't be paged out because it's dirty */ - ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); - - db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); - ASSERT(db != NULL); - - dn->dn_phys->dn_nlevels = new_level; - dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, - dn->dn_object, dn->dn_phys->dn_nlevels); - - /* transfer dnode's block pointers to new indirect block */ - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); - ASSERT(db->db.db_data); - ASSERT(arc_released(db->db_buf)); - ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, - sizeof (blkptr_t) * nblkptr); - arc_buf_freeze(db->db_buf); - - /* set dbuf's parent pointers to new indirect buf */ - for (i = 0; i < nblkptr; i++) { - dmu_buf_impl_t *child = - dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); - - if (child == NULL) - continue; -#ifdef DEBUG - DB_DNODE_ENTER(child); - ASSERT3P(DB_DNODE(child), ==, dn); - DB_DNODE_EXIT(child); -#endif /* DEBUG */ - if (child->db_parent && child->db_parent != dn->dn_dbuf) { - ASSERT(child->db_parent->db_level == db->db_level); - ASSERT(child->db_blkptr != - &dn->dn_phys->dn_blkptr[child->db_blkid]); - mutex_exit(&child->db_mtx); - continue; - } - ASSERT(child->db_parent == NULL || - child->db_parent == dn->dn_dbuf); - - child->db_parent = db; - dbuf_add_ref(db, child); - if (db->db.db_data) - child->db_blkptr = (blkptr_t *)db->db.db_data + i; - else - child->db_blkptr = NULL; - dprintf_dbuf_bp(child, child->db_blkptr, - "changed db_blkptr to new indirect %s", ""); - - mutex_exit(&child->db_mtx); - } - - bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); - - dbuf_rele(db, FTAG); - - rw_exit(&dn->dn_struct_rwlock); -} - -static void -free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - uint64_t bytesfreed = 0; - - dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); - - for (int i = 0; i < num; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - - bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); - ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); - - /* - * Save some useful information on the holes being - * punched, including logical size, type, and indirection - * level. Retaining birth time enables detection of when - * holes are punched for reducing the number of free - * records transmitted during a zfs send. 
- */ - - uint64_t lsize = BP_GET_LSIZE(bp); - dmu_object_type_t type = BP_GET_TYPE(bp); - uint64_t lvl = BP_GET_LEVEL(bp); - - bzero(bp, sizeof (blkptr_t)); - - if (spa_feature_is_active(dn->dn_objset->os_spa, - SPA_FEATURE_HOLE_BIRTH)) { - BP_SET_LSIZE(bp, lsize); - BP_SET_TYPE(bp, type); - BP_SET_LEVEL(bp, lvl); - BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0); - } - } - dnode_diduse_space(dn, -bytesfreed); -} - -#ifdef ZFS_DEBUG -static void -free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) -{ - int off, num; - int i, err, epbs; - uint64_t txg = tx->tx_txg; - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - off = start - (db->db_blkid * 1<=, 0); - ASSERT3U(num, >=, 0); - ASSERT3U(db->db_level, >, 0); - ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); - ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); - ASSERT(db->db_blkptr != NULL); - - for (i = off; i < off+num; i++) { - uint64_t *buf; - dmu_buf_impl_t *child; - dbuf_dirty_record_t *dr; - int j; - - ASSERT(db->db_level == 1); - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); - rw_exit(&dn->dn_struct_rwlock); - if (err == ENOENT) - continue; - ASSERT(err == 0); - ASSERT(child->db_level == 0); - dr = child->db_last_dirty; - while (dr && dr->dr_txg > txg) - dr = dr->dr_next; - ASSERT(dr == NULL || dr->dr_txg == txg); - - /* data_old better be zeroed */ - if (dr) { - buf = dr->dt.dl.dr_data->b_data; - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } - } - } - - /* - * db_data better be zeroed unless it's dirty in a - * future txg. - */ - mutex_enter(&child->db_mtx); - buf = child->db.db_data; - if (buf != NULL && child->db_state != DB_FILL && - child->db_last_dirty == NULL) { - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } - } - } - mutex_exit(&child->db_mtx); - - dbuf_rele(child, FTAG); - } - DB_DNODE_EXIT(db); -} -#endif - -/* - * We don't usually free the indirect blocks here. If in one txg we have a - * free_range and a write to the same indirect block, it's important that we - * preserve the hole's birth times. Therefore, we don't free any any indirect - * blocks in free_children(). If an indirect block happens to turn into all - * holes, it will be freed by dbuf_write_children_ready, which happens at a - * point in the syncing process where we know for certain the contents of the - * indirect block. - * - * However, if we're freeing a dnode, its space accounting must go to zero - * before we actually try to free the dnode, or we will trip an assertion. In - * addition, we know the case described above cannot occur, because the dnode is - * being freed. Therefore, we free the indirect blocks immediately in that - * case. - */ -static void -free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, - boolean_t free_indirects, dmu_tx_t *tx) -{ - dnode_t *dn; - blkptr_t *bp; - dmu_buf_impl_t *subdb; - uint64_t start, end, dbstart, dbend; - unsigned int epbs, shift, i; - - /* - * There is a small possibility that this block will not be cached: - * 1 - if level > 1 and there are no children with level <= 1 - * 2 - if this block was evicted since we read it from - * dmu_tx_hold_free(). 
- */ - if (db->db_state != DB_CACHED) - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - - /* - * If we modify this indirect block, and we are not freeing the - * dnode (!free_indirects), then this indirect block needs to get - * written to disk by dbuf_write(). If it is dirty, we know it will - * be written (otherwise, we would have incorrect on-disk state - * because the space would be freed but still referenced by the BP - * in this indirect block). Therefore we VERIFY that it is - * dirty. - * - * Our VERIFY covers some cases that do not actually have to be - * dirty, but the open-context code happens to dirty. E.g. if the - * blocks we are freeing are all holes, because in that case, we - * are only freeing part of this indirect block, so it is an - * ancestor of the first or last block to be freed. The first and - * last L1 indirect blocks are always dirtied by dnode_free_range(). - */ - VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); - - dbuf_release_bp(db); - bp = db->db.db_data; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(epbs, <, 31); - shift = (db->db_level - 1) * epbs; - dbstart = db->db_blkid << epbs; - start = blkid >> shift; - if (dbstart < start) { - bp += start - dbstart; - } else { - start = dbstart; - } - dbend = ((db->db_blkid + 1) << epbs) - 1; - end = (blkid + nblks - 1) >> shift; - if (dbend <= end) - end = dbend; - - ASSERT3U(start, <=, end); - - if (db->db_level == 1) { - FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); - } else { - for (uint64_t id = start; id <= end; id++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, - id, TRUE, FALSE, FTAG, &subdb)); - rw_exit(&dn->dn_struct_rwlock); - ASSERT3P(bp, ==, subdb->db_blkptr); - - free_children(subdb, blkid, nblks, free_indirects, tx); - dbuf_rele(subdb, FTAG); - } - } - - if (free_indirects) { - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) - ASSERT(BP_IS_HOLE(bp)); - bzero(db->db.db_data, db->db.db_size); - free_blocks(dn, db->db_blkptr, 1, tx); - } - - DB_DNODE_EXIT(db); - arc_buf_freeze(db->db_buf); -} - -/* - * Traverse the indicated range of the provided file - * and "free" all the blocks contained there. 
- */ -static void -dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, - boolean_t free_indirects, dmu_tx_t *tx) -{ - blkptr_t *bp = dn->dn_phys->dn_blkptr; - int dnlevel = dn->dn_phys->dn_nlevels; - boolean_t trunc = B_FALSE; - - if (blkid > dn->dn_phys->dn_maxblkid) - return; - - ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); - if (blkid + nblks > dn->dn_phys->dn_maxblkid) { - nblks = dn->dn_phys->dn_maxblkid - blkid + 1; - trunc = B_TRUE; - } - - /* There are no indirect blocks in the object */ - if (dnlevel == 1) { - if (blkid >= dn->dn_phys->dn_nblkptr) { - /* this range was never made persistent */ - return; - } - ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); - free_blocks(dn, bp + blkid, nblks, tx); - } else { - int shift = (dnlevel - 1) * - (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); - int start = blkid >> shift; - int end = (blkid + nblks - 1) >> shift; - dmu_buf_impl_t *db; - - ASSERT(start < dn->dn_phys->dn_nblkptr); - bp += start; - for (int i = start; i <= end; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, - TRUE, FALSE, FTAG, &db)); - rw_exit(&dn->dn_struct_rwlock); - - free_children(db, blkid, nblks, free_indirects, tx); - dbuf_rele(db, FTAG); - } - } - - if (trunc) { - dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; - - uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); - ASSERT(off < dn->dn_phys->dn_maxblkid || - dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); - } -} - -typedef struct dnode_sync_free_range_arg { - dnode_t *dsfra_dnode; - dmu_tx_t *dsfra_tx; - boolean_t dsfra_free_indirects; -} dnode_sync_free_range_arg_t; - -static void -dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) -{ - dnode_sync_free_range_arg_t *dsfra = arg; - dnode_t *dn = dsfra->dsfra_dnode; - - mutex_exit(&dn->dn_mtx); - dnode_sync_free_range_impl(dn, blkid, nblks, - dsfra->dsfra_free_indirects, dsfra->dsfra_tx); - mutex_enter(&dn->dn_mtx); -} - -/* - * Try to kick all the dnode's dbufs out of the cache... - */ -void -dnode_evict_dbufs(dnode_t *dn) -{ - dmu_buf_impl_t db_marker; - dmu_buf_impl_t *db, *db_next; - - mutex_enter(&dn->dn_dbufs_mtx); - for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { - -#ifdef DEBUG - DB_DNODE_ENTER(db); - ASSERT3P(DB_DNODE(db), ==, dn); - DB_DNODE_EXIT(db); -#endif /* DEBUG */ - - mutex_enter(&db->db_mtx); - if (db->db_state != DB_EVICTING && - zfs_refcount_is_zero(&db->db_holds)) { - db_marker.db_level = db->db_level; - db_marker.db_blkid = db->db_blkid; - db_marker.db_state = DB_SEARCH; - avl_insert_here(&dn->dn_dbufs, &db_marker, db, - AVL_BEFORE); - - /* - * We need to use the "marker" dbuf rather than - * simply getting the next dbuf, because - * dbuf_destroy() may actually remove multiple dbufs. - * It can call itself recursively on the parent dbuf, - * which may also be removed from dn_dbufs. 
The code - * flow would look like: - * - * dbuf_destroy(): - * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE): - * if (!cacheable || pending_evict) - * dbuf_destroy() - */ - dbuf_destroy(db); - - db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); - avl_remove(&dn->dn_dbufs, &db_marker); - } else { - db->db_pending_evict = TRUE; - mutex_exit(&db->db_mtx); - db_next = AVL_NEXT(&dn->dn_dbufs, db); - } - } - mutex_exit(&dn->dn_dbufs_mtx); - - dnode_evict_bonus(dn); -} - -void -dnode_evict_bonus(dnode_t *dn) -{ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus != NULL) { - if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) { - mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_destroy(dn->dn_bonus); - dn->dn_bonus = NULL; - } else { - dn->dn_bonus->db_pending_evict = TRUE; - } - } - rw_exit(&dn->dn_struct_rwlock); -} - -static void -dnode_undirty_dbufs(list_t *list) -{ - dbuf_dirty_record_t *dr; - - while (dr = list_head(list)) { - dmu_buf_impl_t *db = dr->dr_dbuf; - uint64_t txg = dr->dr_txg; - - if (db->db_level != 0) - dnode_undirty_dbufs(&dr->dt.di.dr_children); - - mutex_enter(&db->db_mtx); - /* XXX - use dbuf_undirty()? */ - list_remove(list, dr); - ASSERT(db->db_last_dirty == dr); - db->db_last_dirty = NULL; - db->db_dirtycnt -= 1; - if (db->db_level == 0) { - ASSERT(db->db_blkid == DMU_BONUS_BLKID || - dr->dt.dl.dr_data == db->db_buf); - dbuf_unoverride(dr); - } else { - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); - } -} - -static void -dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) -{ - int txgoff = tx->tx_txg & TXG_MASK; - - ASSERT(dmu_tx_is_syncing(tx)); - - /* - * Our contents should have been freed in dnode_sync() by the - * free range record inserted by the caller of dnode_free(). - */ - ASSERT0(DN_USED_BYTES(dn->dn_phys)); - ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); - - dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); - dnode_evict_dbufs(dn); - - /* - * XXX - It would be nice to assert this, but we may still - * have residual holds from async evictions from the arc... - * - * zfs_obj_to_path() also depends on this being - * commented out. - * - * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1); - */ - - /* Undirty next bits */ - dn->dn_next_nlevels[txgoff] = 0; - dn->dn_next_indblkshift[txgoff] = 0; - dn->dn_next_blksz[txgoff] = 0; - - /* ASSERT(blkptrs are zero); */ - ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(dn->dn_type != DMU_OT_NONE); - - ASSERT(dn->dn_free_txg > 0); - if (dn->dn_allocated_txg != dn->dn_free_txg) - dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); - bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); - dnode_free_interior_slots(dn); - - mutex_enter(&dn->dn_mtx); - dn->dn_type = DMU_OT_NONE; - dn->dn_maxblkid = 0; - dn->dn_allocated_txg = 0; - dn->dn_free_txg = 0; - dn->dn_have_spill = B_FALSE; - dn->dn_num_slots = 1; - mutex_exit(&dn->dn_mtx); - - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - - dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); - /* - * Now that we've released our hold, the dnode may - * be evicted, so we musn't access it. - */ -} - -/* - * Write out the dnode's dirty buffers. 
- */ -void -dnode_sync(dnode_t *dn, dmu_tx_t *tx) -{ - dnode_phys_t *dnp = dn->dn_phys; - int txgoff = tx->tx_txg & TXG_MASK; - list_t *list = &dn->dn_dirty_records[txgoff]; - static const dnode_phys_t zerodn = { 0 }; - boolean_t kill_spill = B_FALSE; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); - ASSERT(dnp->dn_type != DMU_OT_NONE || - bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); - DNODE_VERIFY(dn); - - ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); - - if (dmu_objset_userused_enabled(dn->dn_objset) && - !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - mutex_enter(&dn->dn_mtx); - dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); - dn->dn_oldflags = dn->dn_phys->dn_flags; - dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; - mutex_exit(&dn->dn_mtx); - dmu_objset_userquota_get_ids(dn, B_FALSE, tx); - } else { - /* Once we account for it, we should always account for it. */ - ASSERT(!(dn->dn_phys->dn_flags & - DNODE_FLAG_USERUSED_ACCOUNTED)); - } - - mutex_enter(&dn->dn_mtx); - if (dn->dn_allocated_txg == tx->tx_txg) { - /* The dnode is newly allocated or reallocated */ - if (dnp->dn_type == DMU_OT_NONE) { - /* this is a first alloc, not a realloc */ - dnp->dn_nlevels = 1; - dnp->dn_nblkptr = dn->dn_nblkptr; - } - - dnp->dn_type = dn->dn_type; - dnp->dn_bonustype = dn->dn_bonustype; - dnp->dn_bonuslen = dn->dn_bonuslen; - } - - dnp->dn_extra_slots = dn->dn_num_slots - 1; - - ASSERT(dnp->dn_nlevels > 1 || - BP_IS_HOLE(&dnp->dn_blkptr[0]) || - BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || - BP_GET_LSIZE(&dnp->dn_blkptr[0]) == - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); - ASSERT(dnp->dn_nlevels < 2 || - BP_IS_HOLE(&dnp->dn_blkptr[0]) || - BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); - - if (dn->dn_next_type[txgoff] != 0) { - dnp->dn_type = dn->dn_type; - dn->dn_next_type[txgoff] = 0; - } - - if (dn->dn_next_blksz[txgoff] != 0) { - ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], - SPA_MINBLOCKSIZE) == 0); - ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || - dn->dn_maxblkid == 0 || list_head(list) != NULL || - dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == - dnp->dn_datablkszsec || - !range_tree_is_empty(dn->dn_free_ranges[txgoff])); - dnp->dn_datablkszsec = - dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; - dn->dn_next_blksz[txgoff] = 0; - } - - if (dn->dn_next_bonuslen[txgoff] != 0) { - if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) - dnp->dn_bonuslen = 0; - else - dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; - ASSERT(dnp->dn_bonuslen <= - DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1)); - dn->dn_next_bonuslen[txgoff] = 0; - } - - if (dn->dn_next_bonustype[txgoff] != 0) { - ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); - dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; - dn->dn_next_bonustype[txgoff] = 0; - } - - boolean_t freeing_dnode = dn->dn_free_txg > 0 && - dn->dn_free_txg <= tx->tx_txg; - - /* - * Remove the spill block if we have been explicitly asked to - * remove it, or if the object is being removed. - */ - if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) { - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) - kill_spill = B_TRUE; - dn->dn_rm_spillblk[txgoff] = 0; - } - - if (dn->dn_next_indblkshift[txgoff] != 0) { - ASSERT(dnp->dn_nlevels == 1); - dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; - dn->dn_next_indblkshift[txgoff] = 0; - } - - /* - * Just take the live (open-context) values for checksum and compress. 
- * Strictly speaking it's a future leak, but nothing bad happens if we - * start using the new checksum or compress algorithm a little early. - */ - dnp->dn_checksum = dn->dn_checksum; - dnp->dn_compress = dn->dn_compress; - - mutex_exit(&dn->dn_mtx); - - if (kill_spill) { - free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx); - mutex_enter(&dn->dn_mtx); - dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; - mutex_exit(&dn->dn_mtx); - } - - /* process all the "freed" ranges in the file */ - if (dn->dn_free_ranges[txgoff] != NULL) { - dnode_sync_free_range_arg_t dsfra; - dsfra.dsfra_dnode = dn; - dsfra.dsfra_tx = tx; - dsfra.dsfra_free_indirects = freeing_dnode; - if (freeing_dnode) { - ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], - 0, dn->dn_maxblkid + 1)); - } - mutex_enter(&dn->dn_mtx); - range_tree_vacate(dn->dn_free_ranges[txgoff], - dnode_sync_free_range, &dsfra); - range_tree_destroy(dn->dn_free_ranges[txgoff]); - dn->dn_free_ranges[txgoff] = NULL; - mutex_exit(&dn->dn_mtx); - } - - if (freeing_dnode) { - dn->dn_objset->os_freed_dnodes++; - dnode_sync_free(dn, tx); - return; - } - - if (dn->dn_num_slots > DNODE_MIN_SLOTS) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - mutex_enter(&ds->ds_lock); - ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] = - B_TRUE; - mutex_exit(&ds->ds_lock); - } - - if (dn->dn_next_nlevels[txgoff]) { - dnode_increase_indirection(dn, tx); - dn->dn_next_nlevels[txgoff] = 0; - } - - if (dn->dn_next_nblkptr[txgoff]) { - /* this should only happen on a realloc */ - ASSERT(dn->dn_allocated_txg == tx->tx_txg); - if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { - /* zero the new blkptrs we are gaining */ - bzero(dnp->dn_blkptr + dnp->dn_nblkptr, - sizeof (blkptr_t) * - (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); -#ifdef ZFS_DEBUG - } else { - int i; - ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); - /* the blkptrs we are losing better be unallocated */ - for (i = dn->dn_next_nblkptr[txgoff]; - i < dnp->dn_nblkptr; i++) - ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); -#endif - } - mutex_enter(&dn->dn_mtx); - dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; - dn->dn_next_nblkptr[txgoff] = 0; - mutex_exit(&dn->dn_mtx); - } - - dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); - - if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - ASSERT3P(list_head(list), ==, NULL); - dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); - } - - /* - * Although we have dropped our reference to the dnode, it - * can't be evicted until its written, and we haven't yet - * initiated the IO for the dnode's dbuf. - */ -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c +++ /dev/null @@ -1,566 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int -dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, - dsl_dataset_t **dsp, void *tag, char **shortnamep) -{ - char buf[ZFS_MAX_DATASET_NAME_LEN]; - char *hashp; - - if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - hashp = strchr(fullname, '#'); - if (hashp == NULL) - return (SET_ERROR(EINVAL)); - - *shortnamep = hashp + 1; - if (zfs_component_namecheck(*shortnamep, NULL, NULL)) - return (SET_ERROR(EINVAL)); - (void) strlcpy(buf, fullname, hashp - fullname + 1); - return (dsl_dataset_hold(dp, buf, tag, dsp)); -} - -/* - * Returns ESRCH if bookmark is not found. - */ -static int -dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, - zfs_bookmark_phys_t *bmark_phys) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; - matchtype_t mt = 0; - int err; - - if (bmark_zapobj == 0) - return (SET_ERROR(ESRCH)); - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), - sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, - NULL, 0, NULL); - - return (err == ENOENT ? ESRCH : err); -} - -/* - * If later_ds is non-NULL, this will return EXDEV if the the specified bookmark - * does not represents an earlier point in later_ds's timeline. - * - * Returns ENOENT if the dataset containing the bookmark does not exist. - * Returns ESRCH if the dataset exists but the bookmark was not found in it. - */ -int -dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, - dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp) -{ - char *shortname; - dsl_dataset_t *ds; - int error; - - error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname); - if (error != 0) - return (error); - - error = dsl_dataset_bmark_lookup(ds, shortname, bmp); - if (error == 0 && later_ds != NULL) { - if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) - error = SET_ERROR(EXDEV); - } - dsl_dataset_rele(ds, FTAG); - return (error); -} - -typedef struct dsl_bookmark_create_arg { - nvlist_t *dbca_bmarks; - nvlist_t *dbca_errors; -} dsl_bookmark_create_arg_t; - -static int -dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, - dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *bmark_fs; - char *shortname; - int error; - zfs_bookmark_phys_t bmark_phys; - - if (!snapds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - error = dsl_bookmark_hold_ds(dp, bookmark_name, - &bmark_fs, FTAG, &shortname); - if (error != 0) - return (error); - - if (!dsl_dataset_is_before(bmark_fs, snapds, 0)) { - dsl_dataset_rele(bmark_fs, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_dataset_bmark_lookup(bmark_fs, shortname, - &bmark_phys); - dsl_dataset_rele(bmark_fs, FTAG); - if (error == 0) - return (SET_ERROR(EEXIST)); - if (error == ESRCH) - return (0); - return (error); -} - -static int -dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_create_arg_t *dbca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - int rv = 0; - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) - return (SET_ERROR(ENOTSUP)); - - for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - dsl_dataset_t *snapds; - int error; - - /* note: validity of 
nvlist checked by ioctl layer */ - error = dsl_dataset_hold(dp, fnvpair_value_string(pair), - FTAG, &snapds); - if (error == 0) { - error = dsl_bookmark_create_check_impl(snapds, - nvpair_name(pair), tx); - dsl_dataset_rele(snapds, FTAG); - } - if (error != 0) { - fnvlist_add_int32(dbca->dbca_errors, - nvpair_name(pair), error); - rv = error; - } - } - - return (rv); -} - -static void -dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_create_arg_t *dbca = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - - ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); - - for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - dsl_dataset_t *snapds, *bmark_fs; - zfs_bookmark_phys_t bmark_phys; - char *shortname; - - VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair), - FTAG, &snapds)); - VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), - &bmark_fs, FTAG, &shortname)); - if (bmark_fs->ds_bookmarks == 0) { - bmark_fs->ds_bookmarks = - zap_create_norm(mos, U8_TEXTPREP_TOUPPER, - DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); - spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); - - dsl_dataset_zapify(bmark_fs, tx); - VERIFY0(zap_add(mos, bmark_fs->ds_object, - DS_FIELD_BOOKMARK_NAMES, - sizeof (bmark_fs->ds_bookmarks), 1, - &bmark_fs->ds_bookmarks, tx)); - } - - bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid; - bmark_phys.zbm_creation_txg = - dsl_dataset_phys(snapds)->ds_creation_txg; - bmark_phys.zbm_creation_time = - dsl_dataset_phys(snapds)->ds_creation_time; - - VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks, - shortname, sizeof (uint64_t), - sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), - &bmark_phys, tx)); - - spa_history_log_internal_ds(bmark_fs, "bookmark", tx, - "name=%s creation_txg=%llu target_snap=%llu", - shortname, - (longlong_t)bmark_phys.zbm_creation_txg, - (longlong_t)snapds->ds_object); - - dsl_dataset_rele(bmark_fs, FTAG); - dsl_dataset_rele(snapds, FTAG); - } -} - -/* - * The bookmarks must all be in the same pool. 
- */ -int -dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) -{ - nvpair_t *pair; - dsl_bookmark_create_arg_t dbca; - - pair = nvlist_next_nvpair(bmarks, NULL); - if (pair == NULL) - return (0); - - dbca.dbca_bmarks = bmarks; - dbca.dbca_errors = errors; - - return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check, - dsl_bookmark_create_sync, &dbca, - fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); -} - -int -dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) -{ - int err = 0; - zap_cursor_t zc; - zap_attribute_t attr; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - uint64_t bmark_zapobj = ds->ds_bookmarks; - if (bmark_zapobj == 0) - return (0); - - for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj); - zap_cursor_retrieve(&zc, &attr) == 0; - zap_cursor_advance(&zc)) { - char *bmark_name = attr.za_name; - zfs_bookmark_phys_t bmark_phys; - - err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys); - ASSERT3U(err, !=, ENOENT); - if (err != 0) - break; - - nvlist_t *out_props = fnvlist_alloc(); - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_GUID))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_GUID, bmark_phys.zbm_guid); - } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATETXG))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg); - } - if (nvlist_exists(props, - zfs_prop_to_name(ZFS_PROP_CREATION))) { - dsl_prop_nvlist_add_uint64(out_props, - ZFS_PROP_CREATION, bmark_phys.zbm_creation_time); - } - - fnvlist_add_nvlist(outnvl, bmark_name, out_props); - fnvlist_free(out_props); - } - zap_cursor_fini(&zc); - return (err); -} - -/* - * Retrieve the bookmarks that exist in the specified dataset, and the - * requested properties of each bookmark. - * - * The "props" nvlist specifies which properties are requested. - * See lzc_get_bookmarks() for the list of valid properties. 
- */ -int -dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - - err = dsl_pool_hold(dsname, FTAG, &dp); - if (err != 0) - return (err); - err = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - err = dsl_get_bookmarks_impl(ds, props, outnvl); - - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (err); -} - -typedef struct dsl_bookmark_destroy_arg { - nvlist_t *dbda_bmarks; - nvlist_t *dbda_success; - nvlist_t *dbda_errors; -} dsl_bookmark_destroy_arg_t; - -static int -dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t bmark_zapobj = ds->ds_bookmarks; - matchtype_t mt = 0; - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); -} - -static int -dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_destroy_arg_t *dbda = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - int rv = 0; - - ASSERT(nvlist_empty(dbda->dbda_success)); - ASSERT(nvlist_empty(dbda->dbda_errors)); - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) - return (0); - - for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) { - const char *fullname = nvpair_name(pair); - dsl_dataset_t *ds; - zfs_bookmark_phys_t bm; - int error; - char *shortname; - - error = dsl_bookmark_hold_ds(dp, fullname, &ds, - FTAG, &shortname); - if (error == ENOENT) { - /* ignore it; the bookmark is "already destroyed" */ - continue; - } - if (error == 0) { - error = dsl_dataset_bmark_lookup(ds, shortname, &bm); - dsl_dataset_rele(ds, FTAG); - if (error == ESRCH) { - /* - * ignore it; the bookmark is - * "already destroyed" - */ - continue; - } - } - if (error == 0) { - if (dmu_tx_is_syncing(tx)) { - fnvlist_add_boolean(dbda->dbda_success, - fullname); - } - } else { - fnvlist_add_int32(dbda->dbda_errors, fullname, error); - rv = error; - } - } - return (rv); -} - -static void -dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_destroy_arg_t *dbda = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - - for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL); - pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) { - dsl_dataset_t *ds; - char *shortname; - uint64_t zap_cnt; - - VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), - &ds, FTAG, &shortname)); - VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx)); - - /* - * If all of this dataset's bookmarks have been destroyed, - * free the zap object and decrement the feature's use count. - */ - VERIFY0(zap_count(mos, ds->ds_bookmarks, - &zap_cnt)); - if (zap_cnt == 0) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); - ds->ds_bookmarks = 0; - spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); - VERIFY0(zap_remove(mos, ds->ds_object, - DS_FIELD_BOOKMARK_NAMES, tx)); - } - - spa_history_log_internal_ds(ds, "remove bookmark", tx, - "name=%s", shortname); - - dsl_dataset_rele(ds, FTAG); - } -} - -/* - * The bookmarks must all be in the same pool. 
- */ -int -dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) -{ - int rv; - dsl_bookmark_destroy_arg_t dbda; - nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); - if (pair == NULL) - return (0); - - dbda.dbda_bmarks = bmarks; - dbda.dbda_errors = errors; - dbda.dbda_success = fnvlist_alloc(); - - rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check, - dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks), - ZFS_SPACE_CHECK_RESERVED); - fnvlist_free(dbda.dbda_success); - return (rv); -} - -typedef struct dsl_bookmark_rename_arg { - const char *dbra_fsname; - const char *dbra_oldname; - const char *dbra_newname; -} dsl_bookmark_rename_arg_t; - -static int -dsl_bookmark_rename_check(void *arg, dmu_tx_t *tx) -{ - dsl_bookmark_rename_arg_t *dbra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - zfs_bookmark_phys_t bmark_phys; - int error; - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) - return (SET_ERROR(ENOTSUP)); - - /* Check validity and the full length of the new bookmark name. */ - if (zfs_component_namecheck(dbra->dbra_newname, NULL, NULL)) - return (SET_ERROR(EINVAL)); - if (strlen(dbra->dbra_fsname) + strlen(dbra->dbra_newname) + 1 >= - ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - - error = dsl_dataset_hold(dp, dbra->dbra_fsname, FTAG, &ds); - if (error != 0) - return (error); - if (ds->ds_is_snapshot) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - error = dsl_dataset_bmark_lookup(ds, dbra->dbra_oldname, &bmark_phys); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - error = dsl_dataset_bmark_lookup(ds, dbra->dbra_newname, &bmark_phys); - dsl_dataset_rele(ds, FTAG); - if (error == 0) - return (SET_ERROR(EEXIST)); - if (error != ESRCH) - return (error); - return (0); -} - -static void -dsl_bookmark_rename_sync(void *arg, dmu_tx_t *tx) -{ - zfs_bookmark_phys_t bmark_phys; - dsl_bookmark_rename_arg_t *dbra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos; - dsl_dataset_t *ds; - uint64_t bmark_zapobj; - uint64_t int_size, num_ints; - matchtype_t mt = 0; - int error; - - ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); - VERIFY0(dsl_dataset_hold(dp, dbra->dbra_fsname, FTAG, &ds)); - - mos = ds->ds_dir->dd_pool->dp_meta_objset; - bmark_zapobj = ds->ds_bookmarks; - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - VERIFY0(zap_length(mos, bmark_zapobj, dbra->dbra_oldname, - &int_size, &num_ints)); - ASSERT3U(int_size, ==, sizeof (uint64_t)); - VERIFY0(zap_lookup_norm(mos, bmark_zapobj, dbra->dbra_oldname, int_size, - num_ints, &bmark_phys, mt, NULL, 0, NULL)); - VERIFY0(zap_remove_norm(mos, bmark_zapobj, dbra->dbra_oldname, mt, tx)); - - VERIFY0(zap_add(mos, bmark_zapobj, dbra->dbra_newname, int_size, - num_ints, &bmark_phys, tx)); - - spa_history_log_internal_ds(ds, "rename bookmark", tx, - "#%s -> #%s creation_txg=%llu", - dbra->dbra_oldname, dbra->dbra_newname, - (longlong_t)bmark_phys.zbm_creation_txg); - - dsl_dataset_rele(ds, FTAG); -} - -/* - * The bookmarks must all be in the same pool. 
- */ -int -dsl_bookmark_rename(const char *fsname, const char *oldbmark, - const char *newbmark) -{ - dsl_bookmark_rename_arg_t dbra; - - dbra.dbra_fsname = fsname; - dbra.dbra_oldname = oldbmark; - dbra.dbra_newname = newbmark; - - return (dsl_sync_task(fsname, dsl_bookmark_rename_check, - dsl_bookmark_rename_sync, &dbra, 1, ZFS_SPACE_CHECK_NORMAL)); -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ /dev/null @@ -1,4252 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2011 Martin Matuska - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 RackTop Systems. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); - -/* - * The SPA supports block sizes up to 16MB. However, very large blocks - * can have an impact on i/o latency (e.g. tying up a spinning disk for - * ~300ms), and also potentially on the memory allocator. Therefore, - * we do not allow the recordsize to be set larger than zfs_max_recordsize - * (default 1MB). Larger blocks can be created by changing this tunable, - * and pools with larger blocks can always be imported and used, regardless - * of this setting. - */ -int zfs_max_recordsize = 1 * 1024 * 1024; -SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN, - &zfs_max_recordsize, 0, - "Maximum block size. Expect dragons when tuning this."); - -#define SWITCH64(x, y) \ - { \ - uint64_t __tmp = (x); \ - (x) = (y); \ - (y) = __tmp; \ - } - -#define DS_REF_MAX (1ULL << 62) - -extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); - -static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, - uint64_t obj, dmu_tx_t *tx); -static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, - dmu_tx_t *tx); - -extern int spa_asize_inflation; - -static zil_header_t zero_zil; - -/* - * Figure out how much of this delta should be propogated to the dsl_dir - * layer. 
If there's a refreservation, that space has already been - * partially accounted for in our ancestors. - */ -static int64_t -parent_delta(dsl_dataset_t *ds, int64_t delta) -{ - dsl_dataset_phys_t *ds_phys; - uint64_t old_bytes, new_bytes; - - if (ds->ds_reserved == 0) - return (delta); - - ds_phys = dsl_dataset_phys(ds); - old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved); - new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved); - - ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); - return (new_bytes - old_bytes); -} - -void -dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) -{ - int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); - int64_t delta; - - dprintf_bp(bp, "ds=%p", ds); - - ASSERT(dmu_tx_is_syncing(tx)); - /* It could have been compressed away to nothing */ - if (BP_IS_HOLE(bp)) - return; - ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); - ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); - if (ds == NULL) { - dsl_pool_mos_diduse_space(tx->tx_pool, - used, compressed, uncompressed); - return; - } - - ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_lock); - delta = parent_delta(ds, used); - dsl_dataset_phys(ds)->ds_referenced_bytes += used; - dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; - dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; - dsl_dataset_phys(ds)->ds_unique_bytes += used; - - if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { - ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] = - B_TRUE; - } - - spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); - if (f != SPA_FEATURE_NONE) - ds->ds_feature_activation_needed[f] = B_TRUE; - - mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, - compressed, uncompressed, tx); - dsl_dir_transfer_space(ds->ds_dir, used - delta, - DD_USED_REFRSRV, DD_USED_HEAD, NULL); -} - -/* - * Called when the specified segment has been remapped, and is thus no - * longer referenced in the head dataset. The vdev must be indirect. - * - * If the segment is referenced by a snapshot, put it on the remap deadlist. - * Otherwise, add this segment to the obsolete spacemap. 
- */ -void -dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, - uint64_t size, uint64_t birth, dmu_tx_t *tx) -{ - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(birth <= tx->tx_txg); - ASSERT(!ds->ds_is_snapshot); - - if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { - spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); - } else { - blkptr_t fakebp; - dva_t *dva = &fakebp.blk_dva[0]; - - ASSERT(ds != NULL); - - mutex_enter(&ds->ds_remap_deadlist_lock); - if (!dsl_dataset_remap_deadlist_exists(ds)) { - dsl_dataset_create_remap_deadlist(ds, tx); - } - mutex_exit(&ds->ds_remap_deadlist_lock); - - BP_ZERO(&fakebp); - fakebp.blk_birth = birth; - DVA_SET_VDEV(dva, vdev); - DVA_SET_OFFSET(dva, offset); - DVA_SET_ASIZE(dva, size); - - dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx); - } -} - -int -dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, - boolean_t async) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - int used = bp_get_dsize_sync(spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); - - if (BP_IS_HOLE(bp)) - return (0); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(bp->blk_birth <= tx->tx_txg); - - if (ds == NULL) { - dsl_free(tx->tx_pool, tx->tx_txg, bp); - dsl_pool_mos_diduse_space(tx->tx_pool, - -used, -compressed, -uncompressed); - return (used); - } - ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); - - ASSERT(!ds->ds_is_snapshot); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - - if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { - int64_t delta; - - dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); - dsl_free(tx->tx_pool, tx->tx_txg, bp); - - mutex_enter(&ds->ds_lock); - ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || - !DS_UNIQUE_IS_ACCURATE(ds)); - delta = parent_delta(ds, -used); - dsl_dataset_phys(ds)->ds_unique_bytes -= used; - mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - delta, -compressed, -uncompressed, tx); - dsl_dir_transfer_space(ds->ds_dir, -used - delta, - DD_USED_REFRSRV, DD_USED_HEAD, NULL); - } else { - dprintf_bp(bp, "putting on dead list: %s", ""); - if (async) { - /* - * We are here as part of zio's write done callback, - * which means we're a zio interrupt thread. We can't - * call dsl_deadlist_insert() now because it may block - * waiting for I/O. Instead, put bp on the deferred - * queue and let dsl_pool_sync() finish the job. 
- */ - bplist_append(&ds->ds_pending_deadlist, bp); - } else { - dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); - } - ASSERT3U(ds->ds_prev->ds_object, ==, - dsl_dataset_phys(ds)->ds_prev_snap_obj); - ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); - /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ - if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > - dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - mutex_enter(&ds->ds_prev->ds_lock); - dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; - mutex_exit(&ds->ds_prev->ds_lock); - } - if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { - dsl_dir_transfer_space(ds->ds_dir, used, - DD_USED_HEAD, DD_USED_SNAP, tx); - } - } - mutex_enter(&ds->ds_lock); - ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used); - dsl_dataset_phys(ds)->ds_referenced_bytes -= used; - ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed); - dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed; - ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed); - dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed; - mutex_exit(&ds->ds_lock); - - return (used); -} - -/* - * We have to release the fsid syncronously or we risk that a subsequent - * mount of the same dataset will fail to unique_insert the fsid. This - * failure would manifest itself as the fsid of this dataset changing - * between mounts which makes NFS clients quite unhappy. - */ -static void -dsl_dataset_evict_sync(void *dbu) -{ - dsl_dataset_t *ds = dbu; - - ASSERT(ds->ds_owner == NULL); - - unique_remove(ds->ds_fsid_guid); -} - -static void -dsl_dataset_evict_async(void *dbu) -{ - dsl_dataset_t *ds = dbu; - - ASSERT(ds->ds_owner == NULL); - - ds->ds_dbuf = NULL; - - if (ds->ds_objset != NULL) - dmu_objset_evict(ds->ds_objset); - - if (ds->ds_prev) { - dsl_dataset_rele(ds->ds_prev, ds); - ds->ds_prev = NULL; - } - - bplist_destroy(&ds->ds_pending_deadlist); - if (dsl_deadlist_is_open(&ds->ds_deadlist)) - dsl_deadlist_close(&ds->ds_deadlist); - if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) - dsl_deadlist_close(&ds->ds_remap_deadlist); - if (ds->ds_dir) - dsl_dir_async_rele(ds->ds_dir, ds); - - ASSERT(!list_link_active(&ds->ds_synced_link)); - - list_destroy(&ds->ds_prop_cbs); - if (mutex_owned(&ds->ds_lock)) - mutex_exit(&ds->ds_lock); - mutex_destroy(&ds->ds_lock); - if (mutex_owned(&ds->ds_opening_lock)) - mutex_exit(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_sendstream_lock); - mutex_destroy(&ds->ds_remap_deadlist_lock); - zfs_refcount_destroy(&ds->ds_longholds); - rrw_destroy(&ds->ds_bp_rwlock); - - kmem_free(ds, sizeof (dsl_dataset_t)); -} - -int -dsl_dataset_get_snapname(dsl_dataset_t *ds) -{ - dsl_dataset_phys_t *headphys; - int err; - dmu_buf_t *headdbuf; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - - if (ds->ds_snapname[0]) - return (0); - if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) - return (0); - - err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, - FTAG, &headdbuf); - if (err != 0) - return (err); - headphys = headdbuf->db_data; - err = zap_value_search(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); - dmu_buf_rele(headdbuf, FTAG); - return (err); -} - -int -dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) -{ - objset_t *mos = 
ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; - matchtype_t mt = 0; - int err; - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - err = zap_lookup_norm(mos, snapobj, name, 8, 1, - value, mt, NULL, 0, NULL); - if (err == ENOTSUP && (mt & MT_NORMALIZE)) - err = zap_lookup(mos, snapobj, name, 8, 1, value); - return (err); -} - -int -dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, - boolean_t adj_cnt) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; - matchtype_t mt = 0; - int err; - - dsl_dir_snap_cmtime_update(ds->ds_dir); - - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) - mt = MT_NORMALIZE; - - err = zap_remove_norm(mos, snapobj, name, mt, tx); - if (err == ENOTSUP && (mt & MT_NORMALIZE)) - err = zap_remove(mos, snapobj, name, tx); - - if (err == 0 && adj_cnt) - dsl_fs_ss_count_adjust(ds->ds_dir, -1, - DD_FIELD_SNAPSHOT_COUNT, tx); - - return (err); -} - -boolean_t -dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) -{ - dmu_buf_t *dbuf = ds->ds_dbuf; - boolean_t result = B_FALSE; - - if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset, - ds->ds_object, DMU_BONUS_BLKID, tag)) { - - if (ds == dmu_buf_get_user(dbuf)) - result = B_TRUE; - else - dmu_buf_rele(dbuf, tag); - } - - return (result); -} - -int -dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **dsp) -{ - objset_t *mos = dp->dp_meta_objset; - dmu_buf_t *dbuf; - dsl_dataset_t *ds; - int err; - dmu_object_info_t doi; - - ASSERT(dsl_pool_config_held(dp)); - - err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); - if (err != 0) - return (err); - - /* Make sure dsobj has the correct object type. 
*/ - dmu_object_info_from_db(dbuf, &doi); - if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) { - dmu_buf_rele(dbuf, tag); - return (SET_ERROR(EINVAL)); - } - - ds = dmu_buf_get_user(dbuf); - if (ds == NULL) { - dsl_dataset_t *winner = NULL; - - ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); - ds->ds_dbuf = dbuf; - ds->ds_object = dsobj; - ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; - - err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, - NULL, ds, &ds->ds_dir); - if (err != 0) { - kmem_free(ds, sizeof (dsl_dataset_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } - - mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_remap_deadlist_lock, - NULL, MUTEX_DEFAULT, NULL); - rrw_init(&ds->ds_bp_rwlock, B_FALSE); - zfs_refcount_create(&ds->ds_longholds); - - bplist_create(&ds->ds_pending_deadlist); - - list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), - offsetof(dmu_sendarg_t, dsa_link)); - - list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t), - offsetof(dsl_prop_cb_record_t, cbr_ds_node)); - - if (doi.doi_type == DMU_OTN_ZAP_METADATA) { - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (!(spa_feature_table[f].fi_flags & - ZFEATURE_FLAG_PER_DATASET)) - continue; - err = zap_contains(mos, dsobj, - spa_feature_table[f].fi_guid); - if (err == 0) { - ds->ds_feature_inuse[f] = B_TRUE; - } else { - ASSERT3U(err, ==, ENOENT); - err = 0; - } - } - } - - if (!ds->ds_is_snapshot) { - ds->ds_snapname[0] = '\0'; - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, - ds, &ds->ds_prev); - } - if (doi.doi_type == DMU_OTN_ZAP_METADATA) { - int zaperr = zap_lookup(mos, ds->ds_object, - DS_FIELD_BOOKMARK_NAMES, - sizeof (ds->ds_bookmarks), 1, - &ds->ds_bookmarks); - if (zaperr != ENOENT) - VERIFY0(zaperr); - } - } else { - if (zfs_flags & ZFS_DEBUG_SNAPNAMES) - err = dsl_dataset_get_snapname(ds); - if (err == 0 && - dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { - err = zap_count( - ds->ds_dir->dd_pool->dp_meta_objset, - dsl_dataset_phys(ds)->ds_userrefs_obj, - &ds->ds_userrefs); - } - } - - if (err == 0 && !ds->ds_is_snapshot) { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), - &ds->ds_reserved); - if (err == 0) { - err = dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_REFQUOTA), - &ds->ds_quota); - } - } else { - ds->ds_reserved = ds->ds_quota = 0; - } - - dsl_deadlist_open(&ds->ds_deadlist, - mos, dsl_dataset_phys(ds)->ds_deadlist_obj); - uint64_t remap_deadlist_obj = - dsl_dataset_get_remap_deadlist_object(ds); - if (remap_deadlist_obj != 0) { - dsl_deadlist_open(&ds->ds_remap_deadlist, mos, - remap_deadlist_obj); - } - - dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, - dsl_dataset_evict_async, &ds->ds_dbuf); - if (err == 0) - winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); - - if (err != 0 || winner != NULL) { - bplist_destroy(&ds->ds_pending_deadlist); - dsl_deadlist_close(&ds->ds_deadlist); - if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) - dsl_deadlist_close(&ds->ds_remap_deadlist); - if (ds->ds_prev) - dsl_dataset_rele(ds->ds_prev, ds); - dsl_dir_rele(ds->ds_dir, ds); - list_destroy(&ds->ds_prop_cbs); - list_destroy(&ds->ds_sendstreams); - mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_sendstream_lock); - 
mutex_destroy(&ds->ds_remap_deadlist_lock); - zfs_refcount_destroy(&ds->ds_longholds); - rrw_destroy(&ds->ds_bp_rwlock); - kmem_free(ds, sizeof (dsl_dataset_t)); - if (err != 0) { - dmu_buf_rele(dbuf, tag); - return (err); - } - ds = winner; - } else { - ds->ds_fsid_guid = - unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid); - if (ds->ds_fsid_guid != - dsl_dataset_phys(ds)->ds_fsid_guid) { - zfs_dbgmsg("ds_fsid_guid changed from " - "%llx to %llx for pool %s dataset id %llu", - (long long) - dsl_dataset_phys(ds)->ds_fsid_guid, - (long long)ds->ds_fsid_guid, - spa_name(dp->dp_spa), - dsobj); - } - } - } - ASSERT3P(ds->ds_dbuf, ==, dbuf); - ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); - ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || - spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || - dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); - *dsp = ds; - return (0); -} - -int -dsl_dataset_hold(dsl_pool_t *dp, const char *name, - void *tag, dsl_dataset_t **dsp) -{ - dsl_dir_t *dd; - const char *snapname; - uint64_t obj; - int err = 0; - dsl_dataset_t *ds; - - err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); - if (err != 0) - return (err); - - ASSERT(dsl_pool_config_held(dp)); - obj = dsl_dir_phys(dd)->dd_head_dataset_obj; - if (obj != 0) - err = dsl_dataset_hold_obj(dp, obj, tag, &ds); - else - err = SET_ERROR(ENOENT); - - /* we may be looking for a snapshot */ - if (err == 0 && snapname != NULL) { - dsl_dataset_t *snap_ds; - - if (*snapname++ != '@') { - dsl_dataset_rele(ds, tag); - dsl_dir_rele(dd, FTAG); - return (SET_ERROR(ENOENT)); - } - - dprintf("looking for snapshot '%s'\n", snapname); - err = dsl_dataset_snap_lookup(ds, snapname, &obj); - if (err == 0) - err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); - dsl_dataset_rele(ds, tag); - - if (err == 0) { - mutex_enter(&snap_ds->ds_lock); - if (snap_ds->ds_snapname[0] == 0) - (void) strlcpy(snap_ds->ds_snapname, snapname, - sizeof (snap_ds->ds_snapname)); - mutex_exit(&snap_ds->ds_lock); - ds = snap_ds; - } - } - if (err == 0) - *dsp = ds; - dsl_dir_rele(dd, FTAG); - return (err); -} - -int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, - void *tag, dsl_dataset_t **dsp) -{ - int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); - if (err != 0) - return (err); - if (!dsl_dataset_tryown(*dsp, tag)) { - dsl_dataset_rele(*dsp, tag); - *dsp = NULL; - return (SET_ERROR(EBUSY)); - } - return (0); -} - -int -dsl_dataset_own(dsl_pool_t *dp, const char *name, - void *tag, dsl_dataset_t **dsp) -{ - int err = dsl_dataset_hold(dp, name, tag, dsp); - if (err != 0) - return (err); - if (!dsl_dataset_tryown(*dsp, tag)) { - dsl_dataset_rele(*dsp, tag); - return (SET_ERROR(EBUSY)); - } - return (0); -} - -/* - * See the comment above dsl_pool_hold() for details. In summary, a long - * hold is used to prevent destruction of a dataset while the pool hold - * is dropped, allowing other concurrent operations (e.g. spa_sync()). - * - * The dataset and pool must be held when this function is called. After it - * is called, the pool hold may be released while the dataset is still held - * and accessed. - */ -void -dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) -{ - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - (void) zfs_refcount_add(&ds->ds_longholds, tag); -} - -void -dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) -{ - (void) zfs_refcount_remove(&ds->ds_longholds, tag); -} - -/* Return B_TRUE if there are any long holds on this dataset. 
*/ -boolean_t -dsl_dataset_long_held(dsl_dataset_t *ds) -{ - return (!zfs_refcount_is_zero(&ds->ds_longholds)); -} - -void -dsl_dataset_name(dsl_dataset_t *ds, char *name) -{ - if (ds == NULL) { - (void) strcpy(name, "mos"); - } else { - dsl_dir_name(ds->ds_dir, name); - VERIFY0(dsl_dataset_get_snapname(ds)); - if (ds->ds_snapname[0]) { - VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - /* - * We use a "recursive" mutex so that we - * can call dprintf_ds() with ds_lock held. - */ - if (!MUTEX_HELD(&ds->ds_lock)) { - mutex_enter(&ds->ds_lock); - VERIFY3U(strlcat(name, ds->ds_snapname, - ZFS_MAX_DATASET_NAME_LEN), <, - ZFS_MAX_DATASET_NAME_LEN); - mutex_exit(&ds->ds_lock); - } else { - VERIFY3U(strlcat(name, ds->ds_snapname, - ZFS_MAX_DATASET_NAME_LEN), <, - ZFS_MAX_DATASET_NAME_LEN); - } - } - } -} - -int -dsl_dataset_namelen(dsl_dataset_t *ds) -{ - VERIFY0(dsl_dataset_get_snapname(ds)); - mutex_enter(&ds->ds_lock); - int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname); - mutex_exit(&ds->ds_lock); - return (len); -} - -void -dsl_dataset_rele(dsl_dataset_t *ds, void *tag) -{ - dmu_buf_rele(ds->ds_dbuf, tag); -} - -void -dsl_dataset_disown(dsl_dataset_t *ds, void *tag) -{ - ASSERT3P(ds->ds_owner, ==, tag); - ASSERT(ds->ds_dbuf != NULL); - - mutex_enter(&ds->ds_lock); - ds->ds_owner = NULL; - mutex_exit(&ds->ds_lock); - dsl_dataset_long_rele(ds, tag); - dsl_dataset_rele(ds, tag); -} - -boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) -{ - boolean_t gotit = FALSE; - - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - mutex_enter(&ds->ds_lock); - if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { - ds->ds_owner = tag; - dsl_dataset_long_hold(ds, tag); - gotit = TRUE; - } - mutex_exit(&ds->ds_lock); - return (gotit); -} - -boolean_t -dsl_dataset_has_owner(dsl_dataset_t *ds) -{ - boolean_t rv; - mutex_enter(&ds->ds_lock); - rv = (ds->ds_owner != NULL); - mutex_exit(&ds->ds_lock); - return (rv); -} - -static void -dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; - uint64_t zero = 0; - - VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); - - spa_feature_incr(spa, f, tx); - dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); - - VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, - sizeof (zero), 1, &zero, tx)); -} - -void -dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; - - VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); - - VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx)); - spa_feature_decr(spa, f, tx); -} - -uint64_t -dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - uint64_t flags, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dd->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; - uint64_t dsobj; - objset_t *mos = dp->dp_meta_objset; - - if (origin == NULL) - origin = dp->dp_origin_snap; - - ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); - ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); - - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - 
dsphys = dbuf->db_data; - bzero(dsphys, sizeof (dsl_dataset_phys_t)); - dsphys->ds_dir_obj = dd->dd_object; - dsphys->ds_flags = flags; - dsphys->ds_fsid_guid = unique_create(); - do { - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - } while (dsphys->ds_guid == 0); - dsphys->ds_snapnames_zapobj = - zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, - DMU_OT_NONE, 0, tx); - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; - - if (origin == NULL) { - dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); - } else { - dsl_dataset_t *ohds; /* head of the origin snapshot */ - - dsphys->ds_prev_snap_obj = origin->ds_object; - dsphys->ds_prev_snap_txg = - dsl_dataset_phys(origin)->ds_creation_txg; - dsphys->ds_referenced_bytes = - dsl_dataset_phys(origin)->ds_referenced_bytes; - dsphys->ds_compressed_bytes = - dsl_dataset_phys(origin)->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = - dsl_dataset_phys(origin)->ds_uncompressed_bytes; - rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG); - dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp; - rrw_exit(&origin->ds_bp_rwlock, FTAG); - - /* - * Inherit flags that describe the dataset's contents - * (INCONSISTENT) or properties (Case Insensitive). - */ - dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & - (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (origin->ds_feature_inuse[f]) - dsl_dataset_activate_feature(dsobj, f, tx); - } - - dmu_buf_will_dirty(origin->ds_dbuf, tx); - dsl_dataset_phys(origin)->ds_num_children++; - - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj, - FTAG, &ohds)); - dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, - dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); - dsl_dataset_rele(ohds, FTAG); - - if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { - if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) { - dsl_dataset_phys(origin)->ds_next_clones_obj = - zap_create(mos, - DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); - } - VERIFY0(zap_add_int(mos, - dsl_dataset_phys(origin)->ds_next_clones_obj, - dsobj, tx)); - } - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object; - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { - dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); - dsl_dir_phys(origin->ds_dir)->dd_clones = - zap_create(mos, - DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); - } - VERIFY0(zap_add_int(mos, - dsl_dir_phys(origin->ds_dir)->dd_clones, - dsobj, tx)); - } - } - - if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) - dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - - dmu_buf_rele(dbuf, FTAG); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj; - - return (dsobj); -} - -static void -dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - objset_t *os; - - VERIFY0(dmu_objset_from_ds(ds, &os)); - if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - zio_t *zio; - - bzero(&os->os_zil_header, sizeof (os->os_zil_header)); - - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dsl_dataset_sync(ds, zio, tx); - VERIFY0(zio_wait(zio)); - - /* dsl_dataset_sync_done will drop this reference. 
*/ - dmu_buf_add_ref(ds->ds_dbuf, ds); - dsl_dataset_sync_done(ds, tx); - } -} - -uint64_t -dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp = pdd->dd_pool; - uint64_t dsobj, ddobj; - dsl_dir_t *dd; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(lastname[0] != '@'); - - ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); - VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); - - dsobj = dsl_dataset_create_sync_dd(dd, origin, - flags & ~DS_CREATE_FLAG_NODIRTY, tx); - - dsl_deleg_set_create_perms(dd, tx, cr); - - /* - * Since we're creating a new node we know it's a leaf, so we can - * initialize the counts if the limit feature is active. - */ - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { - uint64_t cnt = 0; - objset_t *os = dd->dd_pool->dp_meta_objset; - - dsl_dir_zapify(dd, tx); - VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, - sizeof (cnt), 1, &cnt, tx)); - VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, - sizeof (cnt), 1, &cnt, tx)); - } - - dsl_dir_rele(dd, FTAG); - - /* - * If we are creating a clone, make sure we zero out any stale - * data from the origin snapshots zil header. - */ - if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_zero_zil(ds, tx); - dsl_dataset_rele(ds, FTAG); - } - - return (dsobj); -} - -#ifdef __FreeBSD__ -/* FreeBSD ioctl compat begin */ -struct destroyarg { - nvlist_t *nvl; - const char *snapname; -}; - -static int -dsl_check_snap_cb(const char *name, void *arg) -{ - struct destroyarg *da = arg; - dsl_dataset_t *ds; - char *dsname; - - dsname = kmem_asprintf("%s@%s", name, da->snapname); - fnvlist_add_boolean(da->nvl, dsname); - kmem_free(dsname, strlen(dsname) + 1); - - return (0); -} - -int -dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, - nvlist_t *snaps) -{ - struct destroyarg *da; - int err; - - da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP); - da->nvl = snaps; - da->snapname = snapname; - err = dmu_objset_find(fsname, dsl_check_snap_cb, da, - DS_FIND_CHILDREN); - kmem_free(da, sizeof (struct destroyarg)); - - return (err); -} -/* FreeBSD ioctl compat end */ -#endif /* __FreeBSD__ */ - -/* - * The unique space in the head dataset can be calculated by subtracting - * the space used in the most recent snapshot, that is still being used - * in this file system, from the space currently in use. To figure out - * the space in the most recent snapshot still in use, we need to take - * the total space used in the snapshot and subtract out the space that - * has been freed up since the snapshot was taken. 
- */ -void -dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) -{ - uint64_t mrs_used; - uint64_t dlused, dlcomp, dluncomp; - - ASSERT(!ds->ds_is_snapshot); - - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) - mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; - else - mrs_used = 0; - - dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); - - ASSERT3U(dlused, <=, mrs_used); - dsl_dataset_phys(ds)->ds_unique_bytes = - dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused); - - if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_UNIQUE_ACCURATE) - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; -} - -void -dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, - dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t count; - int err; - - ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2); - err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, - obj, tx); - /* - * The err should not be ENOENT, but a bug in a previous version - * of the code could cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a missing entry. - * If we knew that the pool was created after - * SPA_VERSION_NEXT_CLONES, we could assert that it isn't - * ENOENT. However, at least we can check that we don't have - * too many entries in the next_clones_obj even after failing to - * remove this one. - */ - if (err != ENOENT) - VERIFY0(err); - ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, - &count)); - ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2); -} - - -blkptr_t * -dsl_dataset_get_blkptr(dsl_dataset_t *ds) -{ - return (&dsl_dataset_phys(ds)->ds_bp); -} - -spa_t * -dsl_dataset_get_spa(dsl_dataset_t *ds) -{ - return (ds->ds_dir->dd_pool->dp_spa); -} - -void -dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp; - - if (ds == NULL) /* this is the meta-objset */ - return; - - ASSERT(ds->ds_objset != NULL); - - if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) - panic("dirtying snapshot!"); - - /* Must not dirty a dataset in the same txg where it got snapshotted. */ - ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); - - dp = ds->ds_dir->dd_pool; - if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(ds->ds_dbuf, ds); - } -} - -boolean_t -dsl_dataset_is_dirty(dsl_dataset_t *ds) -{ - for (int t = 0; t < TXG_SIZE; t++) { - if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, - ds, t)) - return (B_TRUE); - } - return (B_FALSE); -} - -static int -dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - uint64_t asize; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - /* - * If there's an fs-only reservation, any blocks that might become - * owned by the snapshot dataset must be accommodated by space - * outside of the reservation. - */ - ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); - asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); - if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (SET_ERROR(ENOSPC)); - - /* - * Propagate any reserved space for this snapshot to other - * snapshot checks in this sync group. 
- */ - if (asize > 0) - dsl_dir_willuse_space(ds->ds_dir, asize, tx); - - return (0); -} - -int -dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr) -{ - int error; - uint64_t value; - - ds->ds_trysnap_txg = tx->tx_txg; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - /* - * We don't allow multiple snapshots of the same txg. If there - * is already one, try again. - */ - if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) - return (SET_ERROR(EAGAIN)); - - /* - * Check for conflicting snapshot name. - */ - error = dsl_dataset_snap_lookup(ds, snapname, &value); - if (error == 0) - return (SET_ERROR(EEXIST)); - if (error != ENOENT) - return (error); - - /* - * We don't allow taking snapshots of inconsistent datasets, such as - * those into which we are currently receiving. However, if we are - * creating this snapshot as part of a receive, this check will be - * executed atomically with respect to the completion of the receive - * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this - * case we ignore this, knowing it will be fixed up for us shortly in - * dmu_recv_end_sync(). - */ - if (!recv && DS_IS_INCONSISTENT(ds)) - return (SET_ERROR(EBUSY)); - - /* - * Skip the check for temporary snapshots or if we have already checked - * the counts in dsl_dataset_snapshot_check. This means we really only - * check the count here when we're receiving a stream. - */ - if (cnt != 0 && cr != NULL) { - error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, - ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr); - if (error != 0) - return (error); - } - - error = dsl_dataset_snapshot_reserve_space(ds, tx); - if (error != 0) - return (error); - - return (0); -} - -int -dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_snapshot_arg_t *ddsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - nvpair_t *pair; - int rv = 0; - - /* - * Pre-compute how many total new snapshots will be created for each - * level in the tree and below. This is needed for validating the - * snapshot limit when either taking a recursive snapshot or when - * taking multiple snapshots. - * - * The problem is that the counts are not actually adjusted when - * we are checking, only when we finally sync. For a single snapshot, - * this is easy, the count will increase by 1 at each node up the tree, - * but its more complicated for the recursive/multiple snapshot case. - * - * The dsl_fs_ss_limit_check function does recursively check the count - * at each level up the tree but since it is validating each snapshot - * independently we need to be sure that we are validating the complete - * count for the entire set of snapshots. We do this by rolling up the - * counts for each component of the name into an nvlist and then - * checking each of those cases with the aggregated count. - * - * This approach properly handles not only the recursive snapshot - * case (where we get all of those on the ddsa_snaps list) but also - * the sibling case (e.g. snapshot a/b and a/c so that we will also - * validate the limit on 'a' using a count of 2). - * - * We validate the snapshot names in the third loop and only report - * name errors once. 
- */ - if (dmu_tx_is_syncing(tx)) { - nvlist_t *cnt_track = NULL; - cnt_track = fnvlist_alloc(); - - /* Rollup aggregated counts into the cnt_track list */ - for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); - pair != NULL; - pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { - char *pdelim; - uint64_t val; - char nm[MAXPATHLEN]; - - (void) strlcpy(nm, nvpair_name(pair), sizeof (nm)); - pdelim = strchr(nm, '@'); - if (pdelim == NULL) - continue; - *pdelim = '\0'; - - do { - if (nvlist_lookup_uint64(cnt_track, nm, - &val) == 0) { - /* update existing entry */ - fnvlist_add_uint64(cnt_track, nm, - val + 1); - } else { - /* add to list */ - fnvlist_add_uint64(cnt_track, nm, 1); - } - - pdelim = strrchr(nm, '/'); - if (pdelim != NULL) - *pdelim = '\0'; - } while (pdelim != NULL); - } - - /* Check aggregated counts at each level */ - for (pair = nvlist_next_nvpair(cnt_track, NULL); - pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { - int error = 0; - char *name; - uint64_t cnt = 0; - dsl_dataset_t *ds; - - name = nvpair_name(pair); - cnt = fnvpair_value_uint64(pair); - ASSERT(cnt > 0); - - error = dsl_dataset_hold(dp, name, FTAG, &ds); - if (error == 0) { - error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, - ZFS_PROP_SNAPSHOT_LIMIT, NULL, - ddsa->ddsa_cr); - dsl_dataset_rele(ds, FTAG); - } - - if (error != 0) { - if (ddsa->ddsa_errors != NULL) - fnvlist_add_int32(ddsa->ddsa_errors, - name, error); - rv = error; - /* only report one error for this check */ - break; - } - } - nvlist_free(cnt_track); - } - - for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); - pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { - int error = 0; - dsl_dataset_t *ds; - char *name, *atp; - char dsname[ZFS_MAX_DATASET_NAME_LEN]; - - name = nvpair_name(pair); - if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN) - error = SET_ERROR(ENAMETOOLONG); - if (error == 0) { - atp = strchr(name, '@'); - if (atp == NULL) - error = SET_ERROR(EINVAL); - if (error == 0) - (void) strlcpy(dsname, name, atp - name + 1); - } - if (error == 0) - error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (error == 0) { - /* passing 0/NULL skips dsl_fs_ss_limit_check */ - error = dsl_dataset_snapshot_check_impl(ds, - atp + 1, tx, B_FALSE, 0, NULL); - dsl_dataset_rele(ds, FTAG); - } - - if (error != 0) { - if (ddsa->ddsa_errors != NULL) { - fnvlist_add_int32(ddsa->ddsa_errors, - name, error); - } - rv = error; - } - } - - return (rv); -} - -void -dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; - uint64_t dsobj, crtxg; - objset_t *mos = dp->dp_meta_objset; - objset_t *os; - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - /* - * If we are on an old pool, the zil must not be active, in which - * case it will be zeroed. Usually zil_suspend() accomplishes this. - */ - ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || - dmu_objset_from_ds(ds, &os) != 0 || - bcmp(&os->os_phys->os_zil_header, &zero_zil, - sizeof (zero_zil)) == 0); - - /* Should not snapshot a dirty dataset. 
*/ - ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, - ds, tx->tx_txg)); - - dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); - - /* - * The origin's ds_creation_txg has to be < TXG_INITIAL - */ - if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) - crtxg = 1; - else - crtxg = tx->tx_txg; - - dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; - bzero(dsphys, sizeof (dsl_dataset_phys_t)); - dsphys->ds_dir_obj = ds->ds_dir->dd_object; - dsphys->ds_fsid_guid = unique_create(); - do { - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - } while (dsphys->ds_guid == 0); - dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - dsphys->ds_next_snap_obj = ds->ds_object; - dsphys->ds_num_children = 1; - dsphys->ds_creation_time = gethrestime_sec(); - dsphys->ds_creation_txg = crtxg; - dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; - dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes; - dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = - dsl_dataset_phys(ds)->ds_uncompressed_bytes; - dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags; - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; - rrw_exit(&ds->ds_bp_rwlock, FTAG); - dmu_buf_rele(dbuf, FTAG); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_inuse[f]) - dsl_dataset_activate_feature(dsobj, f, tx); - } - - ASSERT3U(ds->ds_prev != 0, ==, - dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); - if (ds->ds_prev) { - uint64_t next_clones_obj = - dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj; - ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object || - dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1); - if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, - dsl_dataset_phys(ds->ds_prev)->ds_creation_txg); - dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj; - } else if (next_clones_obj != 0) { - dsl_dataset_remove_from_next_clones(ds->ds_prev, - dsphys->ds_next_snap_obj, tx); - VERIFY0(zap_add_int(mos, - next_clones_obj, dsobj, tx)); - } - } - - /* - * If we have a reference-reservation on this dataset, we will - * need to increase the amount of refreservation being charged - * since our unique space is going to zero. - */ - if (ds->ds_reserved) { - int64_t delta; - ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); - delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, - ds->ds_reserved); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, - delta, 0, 0, tx); - } - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_deadlist_obj = - dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, - dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_open(&ds->ds_deadlist, mos, - dsl_dataset_phys(ds)->ds_deadlist_obj); - dsl_deadlist_add_key(&ds->ds_deadlist, - dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); - - if (dsl_dataset_remap_deadlist_exists(ds)) { - uint64_t remap_deadlist_obj = - dsl_dataset_get_remap_deadlist_object(ds); - /* - * Move the remap_deadlist to the snapshot. 
The head - * will create a new remap deadlist on demand, from - * dsl_dataset_block_remapped(). - */ - dsl_dataset_unset_remap_deadlist_object(ds, tx); - dsl_deadlist_close(&ds->ds_remap_deadlist); - - dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); - VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST, - sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx)); - } - - ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); - dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; - dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; - dsl_dataset_phys(ds)->ds_unique_bytes = 0; - - if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - - VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, - snapname, 8, 1, &dsobj, tx)); - - if (ds->ds_prev) - dsl_dataset_rele(ds->ds_prev, ds); - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); - - dsl_scan_ds_snapshotted(ds, tx); - - dsl_dir_snap_cmtime_update(ds->ds_dir); - - spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); -} - -void -dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_snapshot_arg_t *ddsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - nvpair_t *pair; - - for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); - pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { - dsl_dataset_t *ds; - char *name, *atp; - char dsname[ZFS_MAX_DATASET_NAME_LEN]; - - name = nvpair_name(pair); - atp = strchr(name, '@'); - (void) strlcpy(dsname, name, atp - name + 1); - VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); - - dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); - if (ddsa->ddsa_props != NULL) { - dsl_props_set_sync_impl(ds->ds_prev, - ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); - } -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_create_minors(dp->dp_spa, name); -#endif - dsl_dataset_rele(ds, FTAG); - } -} - -/* - * The snapshots must all be in the same pool. - * All-or-nothing: if there are any failures, nothing will be modified. 
- */ -int -dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) -{ - dsl_dataset_snapshot_arg_t ddsa; - nvpair_t *pair; - boolean_t needsuspend; - int error; - spa_t *spa; - char *firstname; - nvlist_t *suspended = NULL; - - pair = nvlist_next_nvpair(snaps, NULL); - if (pair == NULL) - return (0); - firstname = nvpair_name(pair); - - error = spa_open(firstname, &spa, FTAG); - if (error != 0) - return (error); - needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); - spa_close(spa, FTAG); - - if (needsuspend) { - suspended = fnvlist_alloc(); - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - char fsname[ZFS_MAX_DATASET_NAME_LEN]; - char *snapname = nvpair_name(pair); - char *atp; - void *cookie; - - atp = strchr(snapname, '@'); - if (atp == NULL) { - error = SET_ERROR(EINVAL); - break; - } - (void) strlcpy(fsname, snapname, atp - snapname + 1); - - error = zil_suspend(fsname, &cookie); - if (error != 0) - break; - fnvlist_add_uint64(suspended, fsname, - (uintptr_t)cookie); - } - } - - ddsa.ddsa_snaps = snaps; - ddsa.ddsa_props = props; - ddsa.ddsa_errors = errors; - ddsa.ddsa_cr = CRED(); - - if (error == 0) { - error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, &ddsa, - fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL); - } - - if (suspended != NULL) { - for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; - pair = nvlist_next_nvpair(suspended, pair)) { - zil_resume((void *)(uintptr_t) - fnvpair_value_uint64(pair)); - } - fnvlist_free(suspended); - } - - return (error); -} - -typedef struct dsl_dataset_snapshot_tmp_arg { - const char *ddsta_fsname; - const char *ddsta_snapname; - minor_t ddsta_cleanup_minor; - const char *ddsta_htag; -} dsl_dataset_snapshot_tmp_arg_t; - -static int -dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - - error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); - if (error != 0) - return (error); - - /* NULL cred means no limit check for tmp snapshot */ - error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, - tx, B_FALSE, 0, NULL); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOTSUP)); - } - error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, - B_TRUE, tx); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); - - dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); - dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, - ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); - dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); - - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, - minor_t cleanup_minor, const char *htag) -{ - dsl_dataset_snapshot_tmp_arg_t ddsta; - int error; - spa_t *spa; - boolean_t needsuspend; - void *cookie; - - ddsta.ddsta_fsname = fsname; - ddsta.ddsta_snapname = snapname; - ddsta.ddsta_cleanup_minor = cleanup_minor; - 
ddsta.ddsta_htag = htag; - - error = spa_open(fsname, &spa, FTAG); - if (error != 0) - return (error); - needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); - spa_close(spa, FTAG); - - if (needsuspend) { - error = zil_suspend(fsname, &cookie); - if (error != 0) - return (error); - } - - error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, - dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED); - - if (needsuspend) - zil_resume(cookie); - return (error); -} - -void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(ds->ds_objset != NULL); - ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); - - /* - * in case we had to change ds_fsid_guid when we opened it, - * sync it out now. - */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; - - if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { - VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, - &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); - VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, - &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); - VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, - &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); - ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; - ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; - ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; - } - - dmu_objset_sync(ds->ds_objset, zio, tx); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_activation_needed[f]) { - if (ds->ds_feature_inuse[f]) - continue; - dsl_dataset_activate_feature(ds->ds_object, f, tx); - ds->ds_feature_inuse[f] = B_TRUE; - } - } -} - -static int -deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_deadlist_t *dl = arg; - dsl_deadlist_insert(dl, bp, tx); - return (0); -} - -void -dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - objset_t *os = ds->ds_objset; - - bplist_iterate(&ds->ds_pending_deadlist, - deadlist_enqueue_cb, &ds->ds_deadlist, tx); - - if (os->os_synced_dnodes != NULL) { - multilist_destroy(os->os_synced_dnodes); - os->os_synced_dnodes = NULL; - } - - ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); - - dmu_buf_rele(ds->ds_dbuf, ds); -} - -int -get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val) -{ - uint64_t count = 0; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; - - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - - /* - * There may be missing entries in ds_next_clones_obj - * due to a bug in a previous version of the code. - * Only trust it if it has the right number of entries. 
- */ - if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { - VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, - &count)); - } - if (count != dsl_dataset_phys(ds)->ds_num_children - 1) { - return (ENOENT); - } - for (zap_cursor_init(&zc, mos, - dsl_dataset_phys(ds)->ds_next_clones_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *clone; - char buf[ZFS_MAX_DATASET_NAME_LEN]; - VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone)); - dsl_dir_name(clone->ds_dir, buf); - fnvlist_add_boolean(val, buf); - dsl_dataset_rele(clone, FTAG); - } - zap_cursor_fini(&zc); - return (0); -} - -void -get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) -{ - nvlist_t *propval = fnvlist_alloc(); - nvlist_t *val; - - /* - * We use nvlist_alloc() instead of fnvlist_alloc() because the - * latter would allocate the list with NV_UNIQUE_NAME flag. - * As a result, every time a clone name is appended to the list - * it would be (linearly) searched for for a duplicate name. - * We already know that all clone names must be unique and we - * want avoid the quadratic complexity of double-checking that - * because we can have a large number of clones. - */ - VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP)); - - if (get_clones_stat_impl(ds, val) == 0) { - fnvlist_add_nvlist(propval, ZPROP_VALUE, val); - fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), - propval); - } - - nvlist_free(val); - nvlist_free(propval); -} - -/* - * Returns a string that represents the receive resume stats token. It should - * be freed with strfree(). - */ -char * -get_receive_resume_stats_impl(dsl_dataset_t *ds) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dsl_dataset_has_resume_receive_state(ds)) { - char *str; - void *packed; - uint8_t *compressed; - uint64_t val; - nvlist_t *token_nv = fnvlist_alloc(); - size_t packed_size, compressed_size; - - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "fromguid", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "object", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "offset", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "bytes", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "toguid", val); - } - char buf[256]; - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { - fnvlist_add_string(token_nv, "toname", buf); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_LARGEBLOCK) == 0) { - fnvlist_add_boolean(token_nv, "largeblockok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_EMBEDOK) == 0) { - fnvlist_add_boolean(token_nv, "embedok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_COMPRESSOK) == 0) { - fnvlist_add_boolean(token_nv, "compressok"); - } - packed = fnvlist_pack(token_nv, &packed_size); - fnvlist_free(token_nv); - compressed = kmem_alloc(packed_size, KM_SLEEP); - - compressed_size = gzip_compress(packed, compressed, - packed_size, packed_size, 6); - - zio_cksum_t cksum; 
- fletcher_4_native(compressed, compressed_size, NULL, &cksum); - - str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP); - for (int i = 0; i < compressed_size; i++) { - (void) sprintf(str + i * 2, "%02x", compressed[i]); - } - str[compressed_size * 2] = '\0'; - char *propval = kmem_asprintf("%u-%llx-%llx-%s", - ZFS_SEND_RESUME_TOKEN_VERSION, - (longlong_t)cksum.zc_word[0], - (longlong_t)packed_size, str); - kmem_free(packed, packed_size); - kmem_free(str, compressed_size * 2 + 1); - kmem_free(compressed, packed_size); - return (propval); - } - return (spa_strdup("")); -} - -/* - * Returns a string that represents the receive resume stats token of the - * dataset's child. It should be freed with strfree(). - */ -char * -get_child_receive_stats(dsl_dataset_t *ds) -{ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - dsl_dataset_t *recv_ds; - dsl_dataset_name(ds, recvname); - if (strlcat(recvname, "/", sizeof (recvname)) < - sizeof (recvname) && - strlcat(recvname, recv_clone_name, sizeof (recvname)) < - sizeof (recvname) && - dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG, - &recv_ds) == 0) { - char *propval = get_receive_resume_stats_impl(recv_ds); - dsl_dataset_rele(recv_ds, FTAG); - return (propval); - } - return (spa_strdup("")); -} - -static void -get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) -{ - char *propval = get_receive_resume_stats_impl(ds); - if (strcmp(propval, "") != 0) { - dsl_prop_nvlist_add_string(nv, - ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); - } else { - char *childval = get_child_receive_stats(ds); - if (strcmp(childval, "") != 0) { - dsl_prop_nvlist_add_string(nv, - ZFS_PROP_RECEIVE_RESUME_TOKEN, childval); - } - strfree(childval); - } - strfree(propval); -} - -uint64_t -dsl_get_refratio(dsl_dataset_t *ds) -{ - uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 : - (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 / - dsl_dataset_phys(ds)->ds_compressed_bytes); - return (ratio); -} - -uint64_t -dsl_get_logicalreferenced(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_uncompressed_bytes); -} - -uint64_t -dsl_get_compressratio(dsl_dataset_t *ds) -{ - if (ds->ds_is_snapshot) { - return (dsl_get_refratio(ds)); - } else { - dsl_dir_t *dd = ds->ds_dir; - mutex_enter(&dd->dd_lock); - uint64_t val = dsl_dir_get_compressratio(dd); - mutex_exit(&dd->dd_lock); - return (val); - } -} - -uint64_t -dsl_get_used(dsl_dataset_t *ds) -{ - if (ds->ds_is_snapshot) { - return (dsl_dataset_phys(ds)->ds_unique_bytes); - } else { - dsl_dir_t *dd = ds->ds_dir; - mutex_enter(&dd->dd_lock); - uint64_t val = dsl_dir_get_used(dd); - mutex_exit(&dd->dd_lock); - return (val); - } -} - -uint64_t -dsl_get_creation(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_creation_time); -} - -uint64_t -dsl_get_creationtxg(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_creation_txg); -} - -uint64_t -dsl_get_refquota(dsl_dataset_t *ds) -{ - return (ds->ds_quota); -} - -uint64_t -dsl_get_refreservation(dsl_dataset_t *ds) -{ - return (ds->ds_reserved); -} - -uint64_t -dsl_get_guid(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_guid); -} - -uint64_t -dsl_get_unique(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_unique_bytes); -} - -uint64_t -dsl_get_objsetid(dsl_dataset_t *ds) -{ - return (ds->ds_object); -} - -uint64_t -dsl_get_userrefs(dsl_dataset_t *ds) -{ - return (ds->ds_userrefs); -} - -uint64_t -dsl_get_defer_destroy(dsl_dataset_t *ds) -{ - return (DS_IS_DEFER_DESTROY(ds) ? 
1 : 0); -} - -uint64_t -dsl_get_referenced(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_referenced_bytes); -} - -uint64_t -dsl_get_numclones(dsl_dataset_t *ds) -{ - ASSERT(ds->ds_is_snapshot); - return (dsl_dataset_phys(ds)->ds_num_children - 1); -} - -uint64_t -dsl_get_inconsistent(dsl_dataset_t *ds) -{ - return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ? - 1 : 0); -} - -uint64_t -dsl_get_available(dsl_dataset_t *ds) -{ - uint64_t refdbytes = dsl_get_referenced(ds); - uint64_t availbytes = dsl_dir_space_available(ds->ds_dir, - NULL, 0, TRUE); - if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { - availbytes += - ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; - } - if (ds->ds_quota != 0) { - /* - * Adjust available bytes according to refquota - */ - if (refdbytes < ds->ds_quota) { - availbytes = MIN(availbytes, - ds->ds_quota - refdbytes); - } else { - availbytes = 0; - } - } - return (availbytes); -} - -int -dsl_get_written(dsl_dataset_t *ds, uint64_t *written) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dsl_dataset_t *prev; - int err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); - if (err == 0) { - uint64_t comp, uncomp; - err = dsl_dataset_space_written(prev, ds, written, - &comp, &uncomp); - dsl_dataset_rele(prev, FTAG); - } - return (err); -} - -/* - * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN. - */ -int -dsl_get_prev_snap(dsl_dataset_t *ds, char *snap) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) { - dsl_dataset_name(ds->ds_prev, snap); - return (0); - } else { - return (ENOENT); - } -} - -/* - * Returns the mountpoint property and source for the given dataset in the value - * and source buffers. The value buffer must be at least as large as MAXPATHLEN - * and the source buffer as least as large a ZFS_MAX_DATASET_NAME_LEN. - * Returns 0 on success and an error on failure. - */ -int -dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, - char *source) -{ - int error; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* Retrieve the mountpoint value stored in the zap opbject */ - error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1, - ZAP_MAXVALUELEN, value, source); - if (error != 0) { - return (error); - } - - /* - * Process the dsname and source to find the full mountpoint string. - * Can be skipped for 'legacy' or 'none'. - */ - if (value[0] == '/') { - char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - char *root = buf; - const char *relpath; - - /* - * If we inherit the mountpoint, even from a dataset - * with a received value, the source will be the path of - * the dataset we inherit from. If source is - * ZPROP_SOURCE_VAL_RECVD, the received value is not - * inherited. - */ - if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { - relpath = ""; - } else { - ASSERT0(strncmp(dsname, source, strlen(source))); - relpath = dsname + strlen(source); - if (relpath[0] == '/') - relpath++; - } - - spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN); - - /* - * Special case an alternate root of '/'. This will - * avoid having multiple leading slashes in the - * mountpoint path. - */ - if (strcmp(root, "/") == 0) - root++; - - /* - * If the mountpoint is '/' then skip over this - * if we are obtaining either an alternate root or - * an inherited mountpoint. 
- */ - char *mnt = value; - if (value[1] == '\0' && (root[0] != '\0' || - relpath[0] != '\0')) - mnt = value + 1; - - if (relpath[0] == '\0') { - (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s", - root, mnt); - } else { - (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s", - root, mnt, relpath[0] == '@' ? "" : "/", - relpath); - } - kmem_free(buf, ZAP_MAXVALUELEN); - } - - return (0); -} - -void -dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - ASSERT(dsl_pool_config_held(dp)); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, - dsl_get_refratio(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, - dsl_get_logicalreferenced(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - dsl_get_compressratio(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, - dsl_get_used(ds)); - - if (ds->ds_is_snapshot) { - get_clones_stat(ds, nv); - } else { - char buf[ZFS_MAX_DATASET_NAME_LEN]; - if (dsl_get_prev_snap(ds, buf) == 0) - dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, - buf); - dsl_dir_stats(ds->ds_dir, nv); - } - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, - dsl_get_available(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, - dsl_get_referenced(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, - dsl_get_creation(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, - dsl_get_creationtxg(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, - dsl_get_refquota(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, - dsl_get_refreservation(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, - dsl_get_guid(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, - dsl_get_unique(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, - dsl_get_objsetid(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, - dsl_get_userrefs(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, - dsl_get_defer_destroy(ds)); - - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - uint64_t written; - if (dsl_get_written(ds, &written) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, - written); - } - } - - if (!dsl_dataset_is_snapshot(ds)) { - /* - * A failed "newfs" (e.g. full) resumable receive leaves - * the stats set on this dataset. Check here for the prop. - */ - get_receive_resume_stats(ds, nv); - - /* - * A failed incremental resumable receive leaves the - * stats set on our child named "%recv". Check the child - * for the prop. 
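For readers tracing the mountpoint logic removed above: the value stored in the ZAP is only a prefix, and the visible path is assembled from the pool altroot, the stored property, and the part of the dataset name below the dataset the value is inherited from. The standalone sketch below reproduces just that composition step with plain libc; compose_mountpoint() and the sample pool layout are illustrative names invented for the sketch, not part of the tree.

#include <assert.h>
#include <stdio.h>
#include <string.h>

/*
 * Compose the effective mountpoint from the altroot, the stored
 * mountpoint property, the dataset name, and the property source
 * (the dataset the value is inherited from).
 */
static void
compose_mountpoint(const char *root, const char *value, const char *dsname,
    const char *source, char *out, size_t outlen)
{
	const char *relpath;
	const char *mnt = value;

	/* Path of this dataset below the dataset the value comes from. */
	assert(strncmp(dsname, source, strlen(source)) == 0);
	relpath = dsname + strlen(source);
	if (relpath[0] == '/')
		relpath++;

	/* An altroot of "/" would only add a duplicate leading slash. */
	if (strcmp(root, "/") == 0)
		root++;

	/* A mountpoint of "/" is dropped when anything is appended to it. */
	if (value[1] == '\0' && (root[0] != '\0' || relpath[0] != '\0'))
		mnt = value + 1;

	if (relpath[0] == '\0')
		(void) snprintf(out, outlen, "%s%s", root, mnt);
	else
		(void) snprintf(out, outlen, "%s%s%s%s", root, mnt,
		    relpath[0] == '@' ? "" : "/", relpath);
}

int
main(void)
{
	char buf[256];

	/* tank/home/user inherits mountpoint=/export/home from tank/home. */
	compose_mountpoint("", "/export/home", "tank/home/user", "tank/home",
	    buf, sizeof (buf));
	(void) printf("%s\n", buf);	/* /export/home/user */

	/* Same dataset seen through an import with altroot=/mnt. */
	compose_mountpoint("/mnt", "/export/home", "tank/home/user",
	    "tank/home", buf, sizeof (buf));
	(void) printf("%s\n", buf);	/* /mnt/export/home/user */
	return (0);
}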
- */ - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - dsl_dataset_t *recv_ds; - dsl_dataset_name(ds, recvname); - if (strlcat(recvname, "/", sizeof (recvname)) < - sizeof (recvname) && - strlcat(recvname, recv_clone_name, sizeof (recvname)) < - sizeof (recvname) && - dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { - get_receive_resume_stats(recv_ds, nv); - dsl_dataset_rele(recv_ds, FTAG); - } - } -} - -void -dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - ASSERT(dsl_pool_config_held(dp)); - - stat->dds_creation_txg = dsl_get_creationtxg(ds); - stat->dds_inconsistent = dsl_get_inconsistent(ds); - stat->dds_guid = dsl_get_guid(ds); - stat->dds_origin[0] = '\0'; - if (ds->ds_is_snapshot) { - stat->dds_is_snapshot = B_TRUE; - stat->dds_num_clones = dsl_get_numclones(ds); - } else { - stat->dds_is_snapshot = B_FALSE; - stat->dds_num_clones = 0; - - if (dsl_dir_is_clone(ds->ds_dir)) { - dsl_dir_get_origin(ds->ds_dir, stat->dds_origin); - } - } -} - -uint64_t -dsl_dataset_fsid_guid(dsl_dataset_t *ds) -{ - return (ds->ds_fsid_guid); -} - -void -dsl_dataset_space(dsl_dataset_t *ds, - uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp) -{ - *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes; - *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); - if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) - *availbytesp += - ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; - if (ds->ds_quota != 0) { - /* - * Adjust available bytes according to refquota - */ - if (*refdbytesp < ds->ds_quota) - *availbytesp = MIN(*availbytesp, - ds->ds_quota - *refdbytesp); - else - *availbytesp = 0; - } - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - *availobjsp = DN_MAX_OBJECT - *usedobjsp; -} - -boolean_t -dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - uint64_t birth; - - ASSERT(dsl_pool_config_held(dp)); - if (snap == NULL) - return (B_FALSE); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - birth = dsl_dataset_get_blkptr(ds)->blk_birth; - rrw_exit(&ds->ds_bp_rwlock, FTAG); - if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { - objset_t *os, *os_snap; - /* - * It may be that only the ZIL differs, because it was - * reset in the head. Don't count that as being - * modified. - */ - if (dmu_objset_from_ds(ds, &os) != 0) - return (B_TRUE); - if (dmu_objset_from_ds(snap, &os_snap) != 0) - return (B_TRUE); - return (bcmp(&os->os_phys->os_meta_dnode, - &os_snap->os_phys->os_meta_dnode, - sizeof (os->os_phys->os_meta_dnode)) != 0); - } - return (B_FALSE); -} - -typedef struct dsl_dataset_rename_snapshot_arg { - const char *ddrsa_fsname; - const char *ddrsa_oldsnapname; - const char *ddrsa_newsnapname; - boolean_t ddrsa_recursive; - dmu_tx_t *ddrsa_tx; -} dsl_dataset_rename_snapshot_arg_t; - -/* ARGSUSED */ -static int -dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, - dsl_dataset_t *hds, void *arg) -{ - dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; - int error; - uint64_t val; - - error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); - if (error != 0) { - /* ignore nonexistent snapshots */ - return (error == ENOENT ? 
0 : error); - } - - /* new name should not exist */ - error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); - if (error == 0) - error = SET_ERROR(EEXIST); - else if (error == ENOENT) - error = 0; - - /* dataset name + 1 for the "@" + the new snapshot name must fit */ - if (dsl_dir_namelen(hds->ds_dir) + 1 + - strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN) - error = SET_ERROR(ENAMETOOLONG); - - return (error); -} - -static int -dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; - int error; - - error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); - if (error != 0) - return (error); - - if (ddrsa->ddrsa_recursive) { - error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, - dsl_dataset_rename_snapshot_check_impl, ddrsa, - DS_FIND_CHILDREN); - } else { - error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); - } - dsl_dataset_rele(hds, FTAG); - return (error); -} - -static int -dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, - dsl_dataset_t *hds, void *arg) -{ -#ifdef __FreeBSD__ -#ifdef _KERNEL - char *oldname, *newname; -#endif -#endif - dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; - dsl_dataset_t *ds; - uint64_t val; - dmu_tx_t *tx = ddrsa->ddrsa_tx; - int error; - - error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); - ASSERT(error == 0 || error == ENOENT); - if (error == ENOENT) { - /* ignore nonexistent snapshots */ - return (0); - } - - VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); - - /* log before we change the name */ - spa_history_log_internal_ds(ds, "rename", tx, - "-> @%s", ddrsa->ddrsa_newsnapname); - - VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx, - B_FALSE)); - mutex_enter(&ds->ds_lock); - (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); - mutex_exit(&ds->ds_lock); - VERIFY0(zap_add(dp->dp_meta_objset, - dsl_dataset_phys(hds)->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &ds->ds_object, tx)); - -#ifdef __FreeBSD__ -#ifdef _KERNEL - oldname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - newname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - snprintf(oldname, ZFS_MAX_DATASET_NAME_LEN, "%s@%s", - ddrsa->ddrsa_fsname, ddrsa->ddrsa_oldsnapname); - snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%s@%s", - ddrsa->ddrsa_fsname, ddrsa->ddrsa_newsnapname); - zfsvfs_update_fromname(oldname, newname); - zvol_rename_minors(dp->dp_spa, oldname, newname); - kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN); - kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN); -#endif -#endif - dsl_dataset_rele(ds, FTAG); - - return (0); -} - -static void -dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; - - VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); - ddrsa->ddrsa_tx = tx; - if (ddrsa->ddrsa_recursive) { - VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, - dsl_dataset_rename_snapshot_sync_impl, ddrsa, - DS_FIND_CHILDREN)); - } else { - VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); - } - dsl_dataset_rele(hds, FTAG); -} - -int -dsl_dataset_rename_snapshot(const char *fsname, - const char *oldsnapname, const char *newsnapname, boolean_t recursive) -{ - dsl_dataset_rename_snapshot_arg_t ddrsa; - - ddrsa.ddrsa_fsname = fsname; - ddrsa.ddrsa_oldsnapname = oldsnapname; - ddrsa.ddrsa_newsnapname = newsnapname; - 
ddrsa.ddrsa_recursive = recursive; - - return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, - dsl_dataset_rename_snapshot_sync, &ddrsa, - 1, ZFS_SPACE_CHECK_RESERVED)); -} - -/* - * If we're doing an ownership handoff, we need to make sure that there is - * only one long hold on the dataset. We're not allowed to change anything here - * so we don't permanently release the long hold or regular hold here. We want - * to do this only when syncing to avoid the dataset unexpectedly going away - * when we release the long hold. - */ -static int -dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) -{ - boolean_t held; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - if (owner != NULL) { - VERIFY3P(ds->ds_owner, ==, owner); - dsl_dataset_long_rele(ds, owner); - } - - held = dsl_dataset_long_held(ds); - - if (owner != NULL) - dsl_dataset_long_hold(ds, owner); - - if (held) - return (SET_ERROR(EBUSY)); - - return (0); -} - -int -dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_rollback_arg_t *ddra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int64_t unused_refres_delta; - int error; - - error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds); - if (error != 0) - return (error); - - /* must not be a snapshot */ - if (ds->ds_is_snapshot) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - /* must have a most recent snapshot */ - if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ESRCH)); - } - - /* - * No rollback to a snapshot created in the current txg, because - * the rollback may dirty the dataset and create blocks that are - * not reachable from the rootbp while having a birth txg that - * falls into the snapshot's range. - */ - if (dmu_tx_is_syncing(tx) && - dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EAGAIN)); - } - - /* - * If the expected target snapshot is specified, then check that - * the latest snapshot is it. - */ - if (ddra->ddra_tosnap != NULL) { - dsl_dataset_t *snapds; - - /* Check if the target snapshot exists at all. */ - error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds); - if (error != 0) { - /* - * ESRCH is used to signal that the target snapshot does - * not exist, while ENOENT is used to report that - * the rolled back dataset does not exist. - * ESRCH is also used to cover other cases where the - * target snapshot is not related to the dataset being - * rolled back such as being in a different pool. - */ - if (error == ENOENT || error == EXDEV) - error = SET_ERROR(ESRCH); - dsl_dataset_rele(ds, FTAG); - return (error); - } - ASSERT(snapds->ds_is_snapshot); - - /* Check if the snapshot is the latest snapshot indeed. */ - if (snapds != ds->ds_prev) { - /* - * Distinguish between the case where the only problem - * is intervening snapshots (EEXIST) vs the snapshot - * not being a valid target for rollback (ESRCH). 
- */ - if (snapds->ds_dir == ds->ds_dir || - (dsl_dir_is_clone(ds->ds_dir) && - dsl_dir_phys(ds->ds_dir)->dd_origin_obj == - snapds->ds_object)) { - error = SET_ERROR(EEXIST); - } else { - error = SET_ERROR(ESRCH); - } - dsl_dataset_rele(snapds, FTAG); - dsl_dataset_rele(ds, FTAG); - return (error); - } - dsl_dataset_rele(snapds, FTAG); - } - - /* must not have any bookmarks after the most recent snapshot */ - nvlist_t *proprequest = fnvlist_alloc(); - fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG)); - nvlist_t *bookmarks = fnvlist_alloc(); - error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks); - fnvlist_free(proprequest); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL); - pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) { - nvlist_t *valuenv = - fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair), - zfs_prop_to_name(ZFS_PROP_CREATETXG)); - uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value"); - if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) { - fnvlist_free(bookmarks); - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EEXIST)); - } - } - fnvlist_free(bookmarks); - - error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - /* - * Check if the snap we are rolling back to uses more than - * the refquota. - */ - if (ds->ds_quota != 0 && - dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EDQUOT)); - } - - /* - * When we do the clone swap, we will temporarily use more space - * due to the refreservation (the head will no longer have any - * unique space, so the entire amount of the refreservation will need - * to be free). We will immediately destroy the clone, freeing - * this space, but the freeing happens over many txg's. - */ - unused_refres_delta = (int64_t)MIN(ds->ds_reserved, - dsl_dataset_phys(ds)->ds_unique_bytes); - - if (unused_refres_delta > 0 && - unused_refres_delta > - dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOSPC)); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -void -dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_rollback_arg_t *ddra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds, *clone; - uint64_t cloneobj; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - - VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); - - dsl_dataset_name(ds->ds_prev, namebuf); - fnvlist_add_string(ddra->ddra_result, "target", namebuf); - - cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", - ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); - - VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); - - dsl_dataset_clone_swap_sync_impl(clone, ds, tx); - dsl_dataset_zero_zil(ds, tx); - - dsl_destroy_head_sync_impl(clone, tx); - - dsl_dataset_rele(clone, FTAG); - dsl_dataset_rele(ds, FTAG); -} - -/* - * Rolls back the given filesystem or volume to the most recent snapshot. - * The name of the most recent snapshot will be returned under key "target" - * in the result nvlist. - * - * If owner != NULL: - * - The existing dataset MUST be owned by the specified owner at entry - * - Upon return, dataset will still be held by the same owner, whether we - * succeed or not. - * - * This mode is required any time the existing filesystem is mounted. See - * notes above zfs_suspend_fs() for further details. 
- */ -int -dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, - nvlist_t *result) -{ - dsl_dataset_rollback_arg_t ddra; - - ddra.ddra_fsname = fsname; - ddra.ddra_tosnap = tosnap; - ddra.ddra_owner = owner; - ddra.ddra_result = result; - - return (dsl_sync_task(fsname, dsl_dataset_rollback_check, - dsl_dataset_rollback_sync, &ddra, - 1, ZFS_SPACE_CHECK_RESERVED)); -} - -struct promotenode { - list_node_t link; - dsl_dataset_t *ds; -}; - -static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); -static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, - void *tag); -static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); - -int -dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_promote_arg_t *ddpa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; - struct promotenode *snap; - dsl_dataset_t *origin_ds; - int err; - uint64_t unused; - uint64_t ss_mv_cnt; - size_t max_snap_len; - boolean_t conflicting_snaps; - - err = promote_hold(ddpa, dp, FTAG); - if (err != 0) - return (err); - - hds = ddpa->ddpa_clone; - snap = list_head(&ddpa->shared_snaps); - origin_ds = snap->ds; - max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1; - - snap = list_head(&ddpa->origin_snaps); - - if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) { - promote_rele(ddpa, FTAG); - return (SET_ERROR(EXDEV)); - } - - /* - * Compute and check the amount of space to transfer. Since this is - * so expensive, don't do the preliminary check. - */ - if (!dmu_tx_is_syncing(tx)) { - promote_rele(ddpa, FTAG); - return (0); - } - - /* compute origin's new unique space */ - snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, - origin_ds->ds_object); - dsl_deadlist_space_range(&snap->ds->ds_deadlist, - dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX, - &ddpa->unique, &unused, &unused); - - /* - * Walk the snapshots that we are moving - * - * Compute space to transfer. Consider the incremental changes - * to used by each snapshot: - * (my used) = (prev's used) + (blocks born) - (blocks killed) - * So each snapshot gave birth to: - * (blocks born) = (my used) - (prev's used) + (blocks killed) - * So a sequence would look like: - * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) - * Which simplifies to: - * uN + kN + kN-1 + ... + k1 + k0 - * Note however, if we stop before we reach the ORIGIN we get: - * uN + kN + kN-1 + ... + kM - uM-1 - */ - conflicting_snaps = B_FALSE; - ss_mv_cnt = 0; - ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes; - ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes; - ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes; - for (snap = list_head(&ddpa->shared_snaps); snap; - snap = list_next(&ddpa->shared_snaps, snap)) { - uint64_t val, dlused, dlcomp, dluncomp; - dsl_dataset_t *ds = snap->ds; - - ss_mv_cnt++; - - /* - * If there are long holds, we won't be able to evict - * the objset. 
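The space-transfer comment above compresses a telescoping sum, and a toy arithmetic check may make it easier to follow. The snapshot sizes and deadlist figures below are made up, and struct snap is a stand-in for the dsl_dataset/deadlist fields the real walk reads; both the per-snapshot sum and the telescoped shortcut land on the same total.

#include <stdio.h>
#include <stdint.h>

/* Per-snapshot accounting, oldest first (invented numbers). */
struct snap {
	uint64_t used;		/* bytes referenced by the snapshot */
	uint64_t killed;	/* bytes on its deadlist */
};

int
main(void)
{
	struct snap snaps[] = {
		{ .used = 100, .killed = 0 },	/* s0: first snapshot */
		{ .used = 140, .killed = 30 },	/* s1 */
		{ .used = 160, .killed = 50 },	/* s2: branch point */
	};
	int n = sizeof (snaps) / sizeof (snaps[0]);
	uint64_t born_sum = 0, shortcut;

	/* Blocks each snapshot gave birth to: u_i - u_{i-1} + k_i. */
	for (int i = 0; i < n; i++) {
		uint64_t prev = (i == 0) ? 0 : snaps[i - 1].used;
		born_sum += snaps[i].used - prev + snaps[i].killed;
	}

	/* The telescoped form used by the check: u_N + sum of all k_i. */
	shortcut = snaps[n - 1].used;
	for (int i = 0; i < n; i++)
		shortcut += snaps[i].killed;

	(void) printf("born_sum=%llu shortcut=%llu\n",
	    (unsigned long long)born_sum, (unsigned long long)shortcut);
	return (0);
}

With these numbers both loops print 240, matching uN + kN + ... + k0 from the comment.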
- */ - if (dsl_dataset_long_held(ds)) { - err = SET_ERROR(EBUSY); - goto out; - } - - /* Check that the snapshot name does not conflict */ - VERIFY0(dsl_dataset_get_snapname(ds)); - if (strlen(ds->ds_snapname) >= max_snap_len) { - err = SET_ERROR(ENAMETOOLONG); - goto out; - } - err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); - if (err == 0) { - fnvlist_add_boolean(ddpa->err_ds, - snap->ds->ds_snapname); - conflicting_snaps = B_TRUE; - } else if (err != ENOENT) { - goto out; - } - - /* The very first snapshot does not have a deadlist */ - if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0) - continue; - - dsl_deadlist_space(&ds->ds_deadlist, - &dlused, &dlcomp, &dluncomp); - ddpa->used += dlused; - ddpa->comp += dlcomp; - ddpa->uncomp += dluncomp; - } - - /* - * In order to return the full list of conflicting snapshots, we check - * whether there was a conflict after traversing all of them. - */ - if (conflicting_snaps) { - err = SET_ERROR(EEXIST); - goto out; - } - - /* - * If we are a clone of a clone then we never reached ORIGIN, - * so we need to subtract out the clone origin's used space. - */ - if (ddpa->origin_origin) { - ddpa->used -= - dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes; - ddpa->comp -= - dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes; - ddpa->uncomp -= - dsl_dataset_phys(ddpa->origin_origin)-> - ds_uncompressed_bytes; - } - - /* Check that there is enough space and limit headroom here */ - err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, - 0, ss_mv_cnt, ddpa->used, ddpa->cr); - if (err != 0) - goto out; - - /* - * Compute the amounts of space that will be used by snapshots - * after the promotion (for both origin and clone). For each, - * it is the amount of space that will be on all of their - * deadlists (that was not born before their new origin). - */ - if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { - uint64_t space; - - /* - * Note, typically this will not be a clone of a clone, - * so dd_origin_txg will be < TXG_INITIAL, so - * these snaplist_space() -> dsl_deadlist_space_range() - * calls will be fast because they do not have to - * iterate over all bps. 
- */ - snap = list_head(&ddpa->origin_snaps); - err = snaplist_space(&ddpa->shared_snaps, - snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); - if (err != 0) - goto out; - - err = snaplist_space(&ddpa->clone_snaps, - snap->ds->ds_dir->dd_origin_txg, &space); - if (err != 0) - goto out; - ddpa->cloneusedsnap += space; - } - if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags & - DD_FLAG_USED_BREAKDOWN) { - err = snaplist_space(&ddpa->origin_snaps, - dsl_dataset_phys(origin_ds)->ds_creation_txg, - &ddpa->originusedsnap); - if (err != 0) - goto out; - } - -out: - promote_rele(ddpa, FTAG); - return (err); -} - -void -dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_promote_arg_t *ddpa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; - struct promotenode *snap; - dsl_dataset_t *origin_ds; - dsl_dataset_t *origin_head; - dsl_dir_t *dd; - dsl_dir_t *odd = NULL; - uint64_t oldnext_obj; - int64_t delta; -#if defined(__FreeBSD__) && defined(_KERNEL) - char *oldname, *newname; -#endif - - VERIFY0(promote_hold(ddpa, dp, FTAG)); - hds = ddpa->ddpa_clone; - - ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE); - - snap = list_head(&ddpa->shared_snaps); - origin_ds = snap->ds; - dd = hds->ds_dir; - - snap = list_head(&ddpa->origin_snaps); - origin_head = snap->ds; - - /* - * We need to explicitly open odd, since origin_ds's dd will be - * changing. - */ - VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, - NULL, FTAG, &odd)); - - /* change origin's next snap */ - dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); - oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj; - snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, - origin_ds->ds_object); - dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object; - - /* change the origin's next clone */ - if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) { - dsl_dataset_remove_from_next_clones(origin_ds, - snap->ds->ds_object, tx); - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dataset_phys(origin_ds)->ds_next_clones_obj, - oldnext_obj, tx)); - } - - /* change origin */ - dmu_buf_will_dirty(dd->dd_dbuf, tx); - ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object); - dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj; - dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; - dmu_buf_will_dirty(odd->dd_dbuf, tx); - dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object; - origin_head->ds_dir->dd_origin_txg = - dsl_dataset_phys(origin_ds)->ds_creation_txg; - - /* change dd_clone entries */ - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - VERIFY0(zap_remove_int(dp->dp_meta_objset, - dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx)); - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, - hds->ds_object, tx)); - - VERIFY0(zap_remove_int(dp->dp_meta_objset, - dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, - origin_head->ds_object, tx)); - if (dsl_dir_phys(dd)->dd_clones == 0) { - dsl_dir_phys(dd)->dd_clones = - zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, - DMU_OT_NONE, 0, tx); - } - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - oldname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - newname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); -#endif - - /* move snapshots to this dir */ - for (snap = list_head(&ddpa->shared_snaps); snap; - snap = 
list_next(&ddpa->shared_snaps, snap)) { - dsl_dataset_t *ds = snap->ds; - - /* - * Property callbacks are registered to a particular - * dsl_dir. Since ours is changing, evict the objset - * so that they will be unregistered from the old dsl_dir. - */ - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - dsl_dataset_name(ds, oldname); -#endif - - /* move snap name entry */ - VERIFY0(dsl_dataset_get_snapname(ds)); - VERIFY0(dsl_dataset_snap_remove(origin_head, - ds->ds_snapname, tx, B_TRUE)); - VERIFY0(zap_add(dp->dp_meta_objset, - dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, - 8, 1, &ds->ds_object, tx)); - dsl_fs_ss_count_adjust(hds->ds_dir, 1, - DD_FIELD_SNAPSHOT_COUNT, tx); - - /* change containing dsl_dir */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object); - dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object; - ASSERT3P(ds->ds_dir, ==, odd); - dsl_dir_rele(ds->ds_dir, ds); - VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, - NULL, ds, &ds->ds_dir)); - -#if defined(__FreeBSD__) && defined(_KERNEL) - dsl_dataset_name(ds, newname); - zfsvfs_update_fromname(oldname, newname); - zvol_rename_minors(dp->dp_spa, oldname, newname); -#endif - - /* move any clone references */ - if (dsl_dataset_phys(ds)->ds_next_clones_obj && - spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - zap_cursor_t zc; - zap_attribute_t za; - - for (zap_cursor_init(&zc, dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *cnds; - uint64_t o; - - if (za.za_first_integer == oldnext_obj) { - /* - * We've already moved the - * origin's reference. - */ - continue; - } - - VERIFY0(dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &cnds)); - o = dsl_dir_phys(cnds->ds_dir)-> - dd_head_dataset_obj; - - VERIFY0(zap_remove_int(dp->dp_meta_objset, - dsl_dir_phys(odd)->dd_clones, o, tx)); - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_clones, o, tx)); - dsl_dataset_rele(cnds, FTAG); - } - zap_cursor_fini(&zc); - } - - ASSERT(!dsl_prop_hascb(ds)); - } - -#if defined(__FreeBSD__) && defined(_KERNEL) - kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN); - kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN); -#endif - /* - * Change space accounting. - * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either - * both be valid, or both be 0 (resulting in delta == 0). This - * is true for each of {clone,origin} independently. - */ - - delta = ddpa->cloneusedsnap - - dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, >=, 0); - ASSERT3U(ddpa->used, >=, delta); - dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(dd, DD_USED_HEAD, - ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); - - delta = ddpa->originusedsnap - - dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, <=, 0); - ASSERT3U(ddpa->used, >=, -delta); - dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(odd, DD_USED_HEAD, - -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); - - dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; - - /* log history record */ - spa_history_log_internal_ds(hds, "promote", tx, ""); - - dsl_dir_rele(odd, FTAG); - promote_rele(ddpa, FTAG); -} - -/* - * Make a list of dsl_dataset_t's for the snapshots between first_obj - * (exclusive) and last_obj (inclusive). 
The list will be in reverse - * order (last_obj will be the list_head()). If first_obj == 0, do all - * snapshots back to this dataset's origin. - */ -static int -snaplist_make(dsl_pool_t *dp, - uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) -{ - uint64_t obj = last_obj; - - list_create(l, sizeof (struct promotenode), - offsetof(struct promotenode, link)); - - while (obj != first_obj) { - dsl_dataset_t *ds; - struct promotenode *snap; - int err; - - err = dsl_dataset_hold_obj(dp, obj, tag, &ds); - ASSERT(err != ENOENT); - if (err != 0) - return (err); - - if (first_obj == 0) - first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj; - - snap = kmem_alloc(sizeof (*snap), KM_SLEEP); - snap->ds = ds; - list_insert_tail(l, snap); - obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - } - - return (0); -} - -static int -snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) -{ - struct promotenode *snap; - - *spacep = 0; - for (snap = list_head(l); snap; snap = list_next(l, snap)) { - uint64_t used, comp, uncomp; - dsl_deadlist_space_range(&snap->ds->ds_deadlist, - mintxg, UINT64_MAX, &used, &comp, &uncomp); - *spacep += used; - } - return (0); -} - -static void -snaplist_destroy(list_t *l, void *tag) -{ - struct promotenode *snap; - - if (l == NULL || !list_link_active(&l->list_head)) - return; - - while ((snap = list_tail(l)) != NULL) { - list_remove(l, snap); - dsl_dataset_rele(snap->ds, tag); - kmem_free(snap, sizeof (*snap)); - } - list_destroy(l); -} - -static int -promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) -{ - int error; - dsl_dir_t *dd; - struct promotenode *snap; - - error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, - &ddpa->ddpa_clone); - if (error != 0) - return (error); - dd = ddpa->ddpa_clone->ds_dir; - - if (ddpa->ddpa_clone->ds_is_snapshot || - !dsl_dir_is_clone(dd)) { - dsl_dataset_rele(ddpa->ddpa_clone, tag); - return (SET_ERROR(EINVAL)); - } - - error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj, - &ddpa->shared_snaps, tag); - if (error != 0) - goto out; - - error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, - &ddpa->clone_snaps, tag); - if (error != 0) - goto out; - - snap = list_head(&ddpa->shared_snaps); - ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj); - error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj, - dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj, - &ddpa->origin_snaps, tag); - if (error != 0) - goto out; - - if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) { - error = dsl_dataset_hold_obj(dp, - dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj, - tag, &ddpa->origin_origin); - if (error != 0) - goto out; - } -out: - if (error != 0) - promote_rele(ddpa, tag); - return (error); -} - -static void -promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) -{ - snaplist_destroy(&ddpa->shared_snaps, tag); - snaplist_destroy(&ddpa->clone_snaps, tag); - snaplist_destroy(&ddpa->origin_snaps, tag); - if (ddpa->origin_origin != NULL) - dsl_dataset_rele(ddpa->origin_origin, tag); - dsl_dataset_rele(ddpa->ddpa_clone, tag); -} - -/* - * Promote a clone. - * - * If it fails due to a conflicting snapshot name, "conflsnap" will be filled - * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.) - */ -int -dsl_dataset_promote(const char *name, char *conflsnap) -{ - dsl_dataset_promote_arg_t ddpa = { 0 }; - uint64_t numsnaps; - int error; - nvpair_t *snap_pair; - objset_t *os; - - /* - * We will modify space proportional to the number of - * snapshots. 
Compute numsnaps. - */ - error = dmu_objset_hold(name, FTAG, &os); - if (error != 0) - return (error); - error = zap_count(dmu_objset_pool(os)->dp_meta_objset, - dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj, - &numsnaps); - dmu_objset_rele(os, FTAG); - if (error != 0) - return (error); - - ddpa.ddpa_clonename = name; - ddpa.err_ds = fnvlist_alloc(); - ddpa.cr = CRED(); - - error = dsl_sync_task(name, dsl_dataset_promote_check, - dsl_dataset_promote_sync, &ddpa, - 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED); - - /* - * Return the first conflicting snapshot found. - */ - snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL); - if (snap_pair != NULL && conflsnap != NULL) - (void) strcpy(conflsnap, nvpair_name(snap_pair)); - - fnvlist_free(ddpa.err_ds); - return (error); -} - -int -dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) -{ - /* - * "slack" factor for received datasets with refquota set on them. - * See the bottom of this function for details on its use. - */ - uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation; - int64_t unused_refres_delta; - - /* they should both be heads */ - if (clone->ds_is_snapshot || - origin_head->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - /* if we are not forcing, the branch point should be just before them */ - if (!force && clone->ds_prev != origin_head->ds_prev) - return (SET_ERROR(EINVAL)); - - /* clone should be the clone (unless they are unrelated) */ - if (clone->ds_prev != NULL && - clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && - origin_head->ds_dir != clone->ds_prev->ds_dir) - return (SET_ERROR(EINVAL)); - - /* the clone should be a child of the origin */ - if (clone->ds_dir->dd_parent != origin_head->ds_dir) - return (SET_ERROR(EINVAL)); - - /* origin_head shouldn't be modified unless 'force' */ - if (!force && - dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev)) - return (SET_ERROR(ETXTBSY)); - - /* origin_head should have no long holds (e.g. is not mounted) */ - if (dsl_dataset_handoff_check(origin_head, owner, tx)) - return (SET_ERROR(EBUSY)); - - /* check amount of any unconsumed refreservation */ - unused_refres_delta = - (int64_t)MIN(origin_head->ds_reserved, - dsl_dataset_phys(origin_head)->ds_unique_bytes) - - (int64_t)MIN(origin_head->ds_reserved, - dsl_dataset_phys(clone)->ds_unique_bytes); - - if (unused_refres_delta > 0 && - unused_refres_delta > - dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) - return (SET_ERROR(ENOSPC)); - - /* - * The clone can't be too much over the head's refquota. - * - * To ensure that the entire refquota can be used, we allow one - * transaction to exceed the the refquota. Therefore, this check - * needs to also allow for the space referenced to be more than the - * refquota. The maximum amount of space that one transaction can use - * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this - * overage ensures that we are able to receive a filesystem that - * exceeds the refquota on the source system. - * - * So that overage is the refquota_slack we use below. 
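The "slack" described above feeds the EDQUOT test that follows immediately below. A minimal sketch of that test, with placeholder values standing in for DMU_MAX_ACCESS and spa_asize_inflation (the real constants differ and live elsewhere in the tree):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Placeholder values for the sketch only. */
#define	SKETCH_DMU_MAX_ACCESS	(64ULL << 20)
static const uint64_t sketch_asize_inflation = 24;

/*
 * One transaction may consume at most DMU_MAX_ACCESS * spa_asize_inflation
 * bytes on disk, so a received head may exceed the refquota by that much
 * and no more.
 */
static int
refquota_slack_check(uint64_t clone_referenced, uint64_t head_refquota)
{
	uint64_t slack = SKETCH_DMU_MAX_ACCESS * sketch_asize_inflation;

	if (head_refquota != 0 && clone_referenced > head_refquota + slack)
		return (EDQUOT);
	return (0);
}

int
main(void)
{
	uint64_t quota = 10ULL << 30;	/* 10 GiB refquota */

	/* 1 MiB over quota: within the slack, accepted. */
	(void) printf("%d\n", refquota_slack_check(quota + (1ULL << 20), quota));
	/* 4 GiB over quota: beyond the slack, EDQUOT. */
	(void) printf("%d\n", refquota_slack_check(quota + (4ULL << 30), quota));
	return (0);
}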
- */ - if (origin_head->ds_quota != 0 && - dsl_dataset_phys(clone)->ds_referenced_bytes > - origin_head->ds_quota + refquota_slack) - return (SET_ERROR(EDQUOT)); - - return (0); -} - -static void -dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone, - dsl_dataset_t *origin, dmu_tx_t *tx) -{ - uint64_t clone_remap_dl_obj, origin_remap_dl_obj; - dsl_pool_t *dp = dmu_tx_pool(tx); - - ASSERT(dsl_pool_sync_context(dp)); - - clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone); - origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin); - - if (clone_remap_dl_obj != 0) { - dsl_deadlist_close(&clone->ds_remap_deadlist); - dsl_dataset_unset_remap_deadlist_object(clone, tx); - } - if (origin_remap_dl_obj != 0) { - dsl_deadlist_close(&origin->ds_remap_deadlist); - dsl_dataset_unset_remap_deadlist_object(origin, tx); - } - - if (clone_remap_dl_obj != 0) { - dsl_dataset_set_remap_deadlist_object(origin, - clone_remap_dl_obj, tx); - dsl_deadlist_open(&origin->ds_remap_deadlist, - dp->dp_meta_objset, clone_remap_dl_obj); - } - if (origin_remap_dl_obj != 0) { - dsl_dataset_set_remap_deadlist_object(clone, - origin_remap_dl_obj, tx); - dsl_deadlist_open(&clone->ds_remap_deadlist, - dp->dp_meta_objset, origin_remap_dl_obj); - } -} - -void -dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - int64_t unused_refres_delta; - - ASSERT(clone->ds_reserved == 0); - /* - * NOTE: On DEBUG kernels there could be a race between this and - * the check function if spa_asize_inflation is adjusted... - */ - ASSERT(origin_head->ds_quota == 0 || - dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota + - DMU_MAX_ACCESS * spa_asize_inflation); - ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); - - /* - * Swap per-dataset feature flags. - */ - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (!(spa_feature_table[f].fi_flags & - ZFEATURE_FLAG_PER_DATASET)) { - ASSERT(!clone->ds_feature_inuse[f]); - ASSERT(!origin_head->ds_feature_inuse[f]); - continue; - } - - boolean_t clone_inuse = clone->ds_feature_inuse[f]; - boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f]; - - if (clone_inuse) { - dsl_dataset_deactivate_feature(clone->ds_object, f, tx); - clone->ds_feature_inuse[f] = B_FALSE; - } - if (origin_head_inuse) { - dsl_dataset_deactivate_feature(origin_head->ds_object, - f, tx); - origin_head->ds_feature_inuse[f] = B_FALSE; - } - if (clone_inuse) { - dsl_dataset_activate_feature(origin_head->ds_object, - f, tx); - origin_head->ds_feature_inuse[f] = B_TRUE; - } - if (origin_head_inuse) { - dsl_dataset_activate_feature(clone->ds_object, f, tx); - clone->ds_feature_inuse[f] = B_TRUE; - } - } - - dmu_buf_will_dirty(clone->ds_dbuf, tx); - dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - - if (clone->ds_objset != NULL) { - dmu_objset_evict(clone->ds_objset); - clone->ds_objset = NULL; - } - - if (origin_head->ds_objset != NULL) { - dmu_objset_evict(origin_head->ds_objset); - origin_head->ds_objset = NULL; - } - - unused_refres_delta = - (int64_t)MIN(origin_head->ds_reserved, - dsl_dataset_phys(origin_head)->ds_unique_bytes) - - (int64_t)MIN(origin_head->ds_reserved, - dsl_dataset_phys(clone)->ds_unique_bytes); - - /* - * Reset origin's unique bytes, if it exists. 
- */ - if (clone->ds_prev) { - dsl_dataset_t *origin = clone->ds_prev; - uint64_t comp, uncomp; - - dmu_buf_will_dirty(origin->ds_dbuf, tx); - dsl_deadlist_space_range(&clone->ds_deadlist, - dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX, - &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp); - } - - /* swap blkptrs */ - { - rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG); - rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG); - blkptr_t tmp; - tmp = dsl_dataset_phys(origin_head)->ds_bp; - dsl_dataset_phys(origin_head)->ds_bp = - dsl_dataset_phys(clone)->ds_bp; - dsl_dataset_phys(clone)->ds_bp = tmp; - rrw_exit(&origin_head->ds_bp_rwlock, FTAG); - rrw_exit(&clone->ds_bp_rwlock, FTAG); - } - - /* set dd_*_bytes */ - { - int64_t dused, dcomp, duncomp; - uint64_t cdl_used, cdl_comp, cdl_uncomp; - uint64_t odl_used, odl_comp, odl_uncomp; - - ASSERT3U(dsl_dir_phys(clone->ds_dir)-> - dd_used_breakdown[DD_USED_SNAP], ==, 0); - - dsl_deadlist_space(&clone->ds_deadlist, - &cdl_used, &cdl_comp, &cdl_uncomp); - dsl_deadlist_space(&origin_head->ds_deadlist, - &odl_used, &odl_comp, &odl_uncomp); - - dused = dsl_dataset_phys(clone)->ds_referenced_bytes + - cdl_used - - (dsl_dataset_phys(origin_head)->ds_referenced_bytes + - odl_used); - dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes + - cdl_comp - - (dsl_dataset_phys(origin_head)->ds_compressed_bytes + - odl_comp); - duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes + - cdl_uncomp - - (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes + - odl_uncomp); - - dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, - dused, dcomp, duncomp, tx); - dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, - -dused, -dcomp, -duncomp, tx); - - /* - * The difference in the space used by snapshots is the - * difference in snapshot space due to the head's - * deadlist (since that's the only thing that's - * changing that affects the snapused). - */ - dsl_deadlist_space_range(&clone->ds_deadlist, - origin_head->ds_dir->dd_origin_txg, UINT64_MAX, - &cdl_used, &cdl_comp, &cdl_uncomp); - dsl_deadlist_space_range(&origin_head->ds_deadlist, - origin_head->ds_dir->dd_origin_txg, UINT64_MAX, - &odl_used, &odl_comp, &odl_uncomp); - dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, - DD_USED_HEAD, DD_USED_SNAP, NULL); - } - - /* swap ds_*_bytes */ - SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes, - dsl_dataset_phys(clone)->ds_referenced_bytes); - SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes, - dsl_dataset_phys(clone)->ds_compressed_bytes); - SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes, - dsl_dataset_phys(clone)->ds_uncompressed_bytes); - SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes, - dsl_dataset_phys(clone)->ds_unique_bytes); - - /* apply any parent delta for change in unconsumed refreservation */ - dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, - unused_refres_delta, 0, 0, tx); - - /* - * Swap deadlists. 
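A small numeric trace of the dd_used hand-off performed by the two dsl_dir_diduse_space() calls above, folding the HEAD/SNAP breakdown together for simplicity and using invented byte counts: the delta added to the origin head's dir is exactly the delta subtracted from the clone's dir, so the pool-wide total is unchanged while the two dirs trade their accounting.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Invented byte counts before the swap. */
	int64_t clone_ref = 500, clone_dl = 40;		/* clone head + its deadlist */
	int64_t head_ref = 800, head_dl = 120;		/* origin head + its deadlist */

	/* Totals currently charged to each dsl_dir. */
	int64_t head_dir_used = head_ref + head_dl;
	int64_t clone_dir_used = clone_ref + clone_dl;

	/* The same signed delta is applied with opposite signs. */
	int64_t dused = (clone_ref + clone_dl) - (head_ref + head_dl);

	head_dir_used += dused;		/* origin head dir now carries the clone's blocks */
	clone_dir_used -= dused;	/* clone dir now carries the old head's blocks */

	(void) printf("dused=%lld head_dir=%lld clone_dir=%lld\n",
	    (long long)dused, (long long)head_dir_used,
	    (long long)clone_dir_used);
	return (0);
}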
- */ - dsl_deadlist_close(&clone->ds_deadlist); - dsl_deadlist_close(&origin_head->ds_deadlist); - SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj, - dsl_dataset_phys(clone)->ds_deadlist_obj); - dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, - dsl_dataset_phys(clone)->ds_deadlist_obj); - dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, - dsl_dataset_phys(origin_head)->ds_deadlist_obj); - dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); - - dsl_scan_ds_clone_swapped(origin_head, clone, tx); - - spa_history_log_internal_ds(clone, "clone swap", tx, - "parent=%s", origin_head->ds_dir->dd_myname); -} - -/* - * Given a pool name and a dataset object number in that pool, - * return the name of that dataset. - */ -int -dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int error; - - error = dsl_pool_hold(pname, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); - if (error == 0) { - dsl_dataset_name(ds, buf); - dsl_dataset_rele(ds, FTAG); - } - dsl_pool_rele(dp, FTAG); - - return (error); -} - -int -dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) -{ - int error = 0; - - ASSERT3S(asize, >, 0); - - /* - * *ref_rsrv is the portion of asize that will come from any - * unconsumed refreservation space. - */ - *ref_rsrv = 0; - - mutex_enter(&ds->ds_lock); - /* - * Make a space adjustment for reserved bytes. - */ - if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { - ASSERT3U(*used, >=, - ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); - *used -= - (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); - *ref_rsrv = - asize - MIN(asize, parent_delta(ds, asize + inflight)); - } - - if (!check_quota || ds->ds_quota == 0) { - mutex_exit(&ds->ds_lock); - return (0); - } - /* - * If they are requesting more space, and our current estimate - * is over quota, they get to try again unless the actual - * on-disk is over quota and there are no pending changes (which - * may free up space for us). 
- */ - if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >= - ds->ds_quota) { - if (inflight > 0 || - dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota) - error = SET_ERROR(ERESTART); - else - error = SET_ERROR(EDQUOT); - } - mutex_exit(&ds->ds_lock); - - return (error); -} - -typedef struct dsl_dataset_set_qr_arg { - const char *ddsqra_name; - zprop_source_t ddsqra_source; - uint64_t ddsqra_value; -} dsl_dataset_set_qr_arg_t; - - -/* ARGSUSED */ -static int -dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - uint64_t newval; - - if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) - return (SET_ERROR(ENOTSUP)); - - error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); - if (error != 0) - return (error); - - if (ds->ds_is_snapshot) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_prop_predict(ds->ds_dir, - zfs_prop_to_name(ZFS_PROP_REFQUOTA), - ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - if (newval == 0) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes || - newval < ds->ds_reserved) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOSPC)); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - uint64_t newval; - - VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - - dsl_prop_set_sync_impl(ds, - zfs_prop_to_name(ZFS_PROP_REFQUOTA), - ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, - &ddsqra->ddsqra_value, tx); - - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); - - if (ds->ds_quota != newval) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_quota = newval; - } - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, - uint64_t refquota) -{ - dsl_dataset_set_qr_arg_t ddsqra; - - ddsqra.ddsqra_name = dsname; - ddsqra.ddsqra_source = source; - ddsqra.ddsqra_value = refquota; - - return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, - dsl_dataset_set_refquota_sync, &ddsqra, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -static int -dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - uint64_t newval, unique; - - if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) - return (SET_ERROR(ENOTSUP)); - - error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); - if (error != 0) - return (error); - - if (ds->ds_is_snapshot) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_prop_predict(ds->ds_dir, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), - ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - /* - * If we are doing the preliminary check in open context, the - * space estimates may be inaccurate. 
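The retry-versus-fail reasoning in the comment above reduces to a three-way decision. A simplified sketch, with local result codes standing in for the ERESTART and EDQUOT errors returned by the original:

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for the kernel error codes used by the original. */
enum quota_result {
	QUOTA_OK,	/* under quota */
	QUOTA_RETRY,	/* ERESTART: pending frees may still bring us under */
	QUOTA_FULL	/* EDQUOT: on-disk already over quota, nothing in flight */
};

static enum quota_result
refquota_decision(uint64_t referenced, uint64_t inflight, uint64_t quota)
{
	if (quota == 0 || referenced + inflight < quota)
		return (QUOTA_OK);
	if (inflight > 0 || referenced < quota)
		return (QUOTA_RETRY);
	return (QUOTA_FULL);
}

int
main(void)
{
	(void) printf("%d\n", refquota_decision(90, 20, 100));	/* QUOTA_RETRY */
	(void) printf("%d\n", refquota_decision(120, 0, 100));	/* QUOTA_FULL */
	(void) printf("%d\n", refquota_decision(50, 10, 100));	/* QUOTA_OK */
	return (0);
}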
- */ - if (!dmu_tx_is_syncing(tx)) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - mutex_enter(&ds->ds_lock); - if (!DS_UNIQUE_IS_ACCURATE(ds)) - dsl_dataset_recalc_head_uniq(ds); - unique = dsl_dataset_phys(ds)->ds_unique_bytes; - mutex_exit(&ds->ds_lock); - - if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { - uint64_t delta = MAX(unique, newval) - - MAX(unique, ds->ds_reserved); - - if (delta > - dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || - (ds->ds_quota > 0 && newval > ds->ds_quota)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOSPC)); - } - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -void -dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, - zprop_source_t source, uint64_t value, dmu_tx_t *tx) -{ - uint64_t newval; - uint64_t unique; - int64_t delta; - - dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), - source, sizeof (value), 1, &value, tx); - - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_dir->dd_lock); - mutex_enter(&ds->ds_lock); - ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); - unique = dsl_dataset_phys(ds)->ds_unique_bytes; - delta = MAX(0, (int64_t)(newval - unique)) - - MAX(0, (int64_t)(ds->ds_reserved - unique)); - ds->ds_reserved = newval; - mutex_exit(&ds->ds_lock); - - dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); - mutex_exit(&ds->ds_dir->dd_lock); -} - -static void -dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - dsl_dataset_set_refreservation_sync_impl(ds, - ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, - uint64_t refreservation) -{ - dsl_dataset_set_qr_arg_t ddsqra; - - ddsqra.ddsqra_name = dsname; - ddsqra.ddsqra_source = source; - ddsqra.ddsqra_value = refreservation; - - return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, - dsl_dataset_set_refreservation_sync, &ddsqra, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -/* - * Return (in *usedp) the amount of space written in new that is not - * present in oldsnap. New may be a snapshot or the head. Old must be - * a snapshot before new, in new's filesystem (or its origin). If not then - * fail and return EINVAL. - * - * The written space is calculated by considering two components: First, we - * ignore any freed space, and calculate the written as new's used space - * minus old's used space. Next, we add in the amount of space that was freed - * between the two snapshots, thus reducing new's used space relative to old's. - * Specifically, this is the space that was born before old->ds_creation_txg, - * and freed before new (ie. on new's deadlist or a previous deadlist). 
- * - * space freed [---------------------] - * snapshots ---O-------O--------O-------O------ - * oldsnap new - */ -int -dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - int err = 0; - uint64_t snapobj; - dsl_pool_t *dp = new->ds_dir->dd_pool; - - ASSERT(dsl_pool_config_held(dp)); - - *usedp = 0; - *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; - *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes; - - *compp = 0; - *compp += dsl_dataset_phys(new)->ds_compressed_bytes; - *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes; - - *uncompp = 0; - *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; - *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes; - - snapobj = new->ds_object; - while (snapobj != oldsnap->ds_object) { - dsl_dataset_t *snap; - uint64_t used, comp, uncomp; - - if (snapobj == new->ds_object) { - snap = new; - } else { - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); - if (err != 0) - break; - } - - if (dsl_dataset_phys(snap)->ds_prev_snap_txg == - dsl_dataset_phys(oldsnap)->ds_creation_txg) { - /* - * The blocks in the deadlist can not be born after - * ds_prev_snap_txg, so get the whole deadlist space, - * which is more efficient (especially for old-format - * deadlists). Unfortunately the deadlist code - * doesn't have enough information to make this - * optimization itself. - */ - dsl_deadlist_space(&snap->ds_deadlist, - &used, &comp, &uncomp); - } else { - dsl_deadlist_space_range(&snap->ds_deadlist, - 0, dsl_dataset_phys(oldsnap)->ds_creation_txg, - &used, &comp, &uncomp); - } - *usedp += used; - *compp += comp; - *uncompp += uncomp; - - /* - * If we get to the beginning of the chain of snapshots - * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap - * was not a snapshot of/before new. - */ - snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; - if (snap != new) - dsl_dataset_rele(snap, FTAG); - if (snapobj == 0) { - err = SET_ERROR(EINVAL); - break; - } - - } - return (err); -} - -/* - * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, - * lastsnap, and all snapshots in between are deleted. - * - * blocks that would be freed [---------------------------] - * snapshots ---O-------O--------O-------O--------O - * firstsnap lastsnap - * - * This is the set of blocks that were born after the snap before firstsnap, - * (birth > firstsnap->prev_snap_txg) and died before the snap after the - * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). - * We calculate this by iterating over the relevant deadlists (from the snap - * after lastsnap, backward to the snap after firstsnap), summing up the - * space on the deadlist that was born after the snap before firstsnap. - */ -int -dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - dsl_dataset_t *lastsnap, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - int err = 0; - uint64_t snapobj; - dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; - - ASSERT(firstsnap->ds_is_snapshot); - ASSERT(lastsnap->ds_is_snapshot); - - /* - * Check that the snapshots are in the same dsl_dir, and firstsnap - * is before lastsnap. 
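The two-component calculation described in the comment above (the referenced-byte difference, plus blocks born before the old snapshot that died in between) can be checked with toy numbers. The figures below are invented, and the simple loop stands in for the per-deadlist dsl_deadlist_space_range() walk in dsl_dataset_space_written():

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/*
 * written = new.referenced - old.referenced
 *         + (space that was born before old's creation txg and freed
 *            between old and new, summed over the relevant deadlists)
 */
int
main(void)
{
	uint64_t old_referenced = 300, new_referenced = 450;

	/* Deadlist space born before oldsnap, one entry per snapshot in (old, new]. */
	uint64_t freed_since_old[] = { 25, 10 };

	int64_t written = (int64_t)new_referenced - (int64_t)old_referenced;
	for (size_t i = 0;
	    i < sizeof (freed_since_old) / sizeof (freed_since_old[0]); i++)
		written += freed_since_old[i];

	(void) printf("written=%lld\n", (long long)written);	/* 185 */
	return (0);
}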
- */ - if (firstsnap->ds_dir != lastsnap->ds_dir || - dsl_dataset_phys(firstsnap)->ds_creation_txg > - dsl_dataset_phys(lastsnap)->ds_creation_txg) - return (SET_ERROR(EINVAL)); - - *usedp = *compp = *uncompp = 0; - - snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj; - while (snapobj != firstsnap->ds_object) { - dsl_dataset_t *ds; - uint64_t used, comp, uncomp; - - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); - if (err != 0) - break; - - dsl_deadlist_space_range(&ds->ds_deadlist, - dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX, - &used, &comp, &uncomp); - *usedp += used; - *compp += comp; - *uncompp += uncomp; - - snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - ASSERT3U(snapobj, !=, 0); - dsl_dataset_rele(ds, FTAG); - } - return (err); -} - -/* - * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. - * For example, they could both be snapshots of the same filesystem, and - * 'earlier' is before 'later'. Or 'earlier' could be the origin of - * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's - * filesystem. Or 'earlier' could be the origin's origin. - * - * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg. - */ -boolean_t -dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, - uint64_t earlier_txg) -{ - dsl_pool_t *dp = later->ds_dir->dd_pool; - int error; - boolean_t ret; - - ASSERT(dsl_pool_config_held(dp)); - ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); - - if (earlier_txg == 0) - earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; - - if (later->ds_is_snapshot && - earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) - return (B_FALSE); - - if (later->ds_dir == earlier->ds_dir) - return (B_TRUE); - if (!dsl_dir_is_clone(later->ds_dir)) - return (B_FALSE); - - if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object) - return (B_TRUE); - dsl_dataset_t *origin; - error = dsl_dataset_hold_obj(dp, - dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); - if (error != 0) - return (B_FALSE); - ret = dsl_dataset_is_before(origin, earlier, earlier_txg); - dsl_dataset_rele(origin, FTAG); - return (ret); -} - -void -dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); -} - -boolean_t -dsl_dataset_is_zapified(dsl_dataset_t *ds) -{ - dmu_object_info_t doi; - - dmu_object_info_from_db(ds->ds_dbuf, &doi); - return (doi.doi_type == DMU_OTN_ZAP_METADATA); -} - -boolean_t -dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) -{ - return (dsl_dataset_is_zapified(ds) && - zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); -} - -uint64_t -dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds) -{ - uint64_t remap_deadlist_obj; - int err; - - if (!dsl_dataset_is_zapified(ds)) - return (0); - - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, - DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1, - &remap_deadlist_obj); - - if (err != 0) { - VERIFY3S(err, ==, ENOENT); - return (0); - } - - ASSERT(remap_deadlist_obj != 0); - return (remap_deadlist_obj); -} - -boolean_t -dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds) -{ - EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist), - dsl_dataset_get_remap_deadlist_object(ds) != 0); - return (dsl_deadlist_is_open(&ds->ds_remap_deadlist)); -} - -static void -dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t 
obj, - dmu_tx_t *tx) -{ - ASSERT(obj != 0); - dsl_dataset_zapify(ds, tx); - VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, - DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx)); -} - -static void -dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx)); -} - -void -dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - uint64_t remap_deadlist_object; - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dsl_dataset_remap_deadlist_exists(ds)); - - remap_deadlist_object = ds->ds_remap_deadlist.dl_object; - dsl_deadlist_close(&ds->ds_remap_deadlist); - dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx); - dsl_dataset_unset_remap_deadlist_object(ds, tx); - spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); -} - -void -dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - uint64_t remap_deadlist_obj; - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock)); - /* - * Currently we only create remap deadlists when there are indirect - * vdevs with referenced mappings. - */ - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - remap_deadlist_obj = dsl_deadlist_clone( - &ds->ds_deadlist, UINT64_MAX, - dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); - dsl_dataset_set_remap_deadlist_object(ds, - remap_deadlist_obj, tx); - dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa), - remap_deadlist_obj); - spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c +++ /dev/null @@ -1,561 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include - -/* - * Deadlist concurrency: - * - * Deadlists can only be modified from the syncing thread. - * - * Except for dsl_deadlist_insert(), it can only be modified with the - * dp_config_rwlock held with RW_WRITER. - * - * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can - * be called concurrently, from open context, with the dl_config_rwlock held - * with RW_READER. 
- * - * Therefore, we only need to provide locking between dsl_deadlist_insert() and - * the accessors, protecting: - * dl_phys->dl_used,comp,uncomp - * and protecting the dl_tree from being loaded. - * The locking is provided by dl_lock. Note that locking on the bpobj_t - * provides its own locking, and dl_oldfmt is immutable. - */ - -static int -dsl_deadlist_compare(const void *arg1, const void *arg2) -{ - const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; - const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; - - return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); -} - -static void -dsl_deadlist_load_tree(dsl_deadlist_t *dl) -{ - zap_cursor_t zc; - zap_attribute_t za; - - ASSERT(MUTEX_HELD(&dl->dl_lock)); - - ASSERT(!dl->dl_oldfmt); - if (dl->dl_havetree) - return; - - avl_create(&dl->dl_tree, dsl_deadlist_compare, - sizeof (dsl_deadlist_entry_t), - offsetof(dsl_deadlist_entry_t, dle_node)); - for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); - dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, - za.za_first_integer)); - avl_add(&dl->dl_tree, dle); - } - zap_cursor_fini(&zc); - dl->dl_havetree = B_TRUE; -} - -void -dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) -{ - dmu_object_info_t doi; - - ASSERT(!dsl_deadlist_is_open(dl)); - - mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); - dl->dl_os = os; - dl->dl_object = object; - VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); - dmu_object_info_from_db(dl->dl_dbuf, &doi); - if (doi.doi_type == DMU_OT_BPOBJ) { - dmu_buf_rele(dl->dl_dbuf, dl); - dl->dl_dbuf = NULL; - dl->dl_oldfmt = B_TRUE; - VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); - return; - } - - dl->dl_oldfmt = B_FALSE; - dl->dl_phys = dl->dl_dbuf->db_data; - dl->dl_havetree = B_FALSE; -} - -boolean_t -dsl_deadlist_is_open(dsl_deadlist_t *dl) -{ - return (dl->dl_os != NULL); -} - -void -dsl_deadlist_close(dsl_deadlist_t *dl) -{ - void *cookie = NULL; - dsl_deadlist_entry_t *dle; - - ASSERT(dsl_deadlist_is_open(dl)); - - if (dl->dl_oldfmt) { - dl->dl_oldfmt = B_FALSE; - bpobj_close(&dl->dl_bpobj); - dl->dl_os = NULL; - dl->dl_object = 0; - return; - } - - if (dl->dl_havetree) { - while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) - != NULL) { - bpobj_close(&dle->dle_bpobj); - kmem_free(dle, sizeof (*dle)); - } - avl_destroy(&dl->dl_tree); - } - dmu_buf_rele(dl->dl_dbuf, dl); - mutex_destroy(&dl->dl_lock); - dl->dl_dbuf = NULL; - dl->dl_phys = NULL; - dl->dl_os = NULL; - dl->dl_object = 0; -} - -uint64_t -dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) -{ - if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) - return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); - return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, - sizeof (dsl_deadlist_phys_t), tx)); -} - -void -dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) -{ - dmu_object_info_t doi; - zap_cursor_t zc; - zap_attribute_t za; - - VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); - if (doi.doi_type == DMU_OT_BPOBJ) { - bpobj_free(os, dlobj, tx); - return; - } - - for (zap_cursor_init(&zc, os, dlobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t obj = za.za_first_integer; - if (obj == dmu_objset_pool(os)->dp_empty_bpobj) - bpobj_decr_empty(os, tx); - else - bpobj_free(os, obj, tx); - } - 
zap_cursor_fini(&zc); - VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); -} - -static void -dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, - const blkptr_t *bp, dmu_tx_t *tx) -{ - ASSERT(MUTEX_HELD(&dl->dl_lock)); - if (dle->dle_bpobj.bpo_object == - dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { - uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - bpobj_close(&dle->dle_bpobj); - bpobj_decr_empty(dl->dl_os, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, - dle->dle_mintxg, obj, tx)); - } - bpobj_enqueue(&dle->dle_bpobj, bp, tx); -} - -static void -dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, - uint64_t obj, dmu_tx_t *tx) -{ - ASSERT(MUTEX_HELD(&dl->dl_lock)); - if (dle->dle_bpobj.bpo_object != - dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { - bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); - } else { - bpobj_close(&dle->dle_bpobj); - bpobj_decr_empty(dl->dl_os, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, - dle->dle_mintxg, obj, tx)); - } -} - -void -dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; - avl_index_t where; - - if (dl->dl_oldfmt) { - bpobj_enqueue(&dl->dl_bpobj, bp, tx); - return; - } - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - - dmu_buf_will_dirty(dl->dl_dbuf, tx); - dl->dl_phys->dl_used += - bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); - dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); - dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); - - dle_tofind.dle_mintxg = bp->blk_birth; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); - if (dle == NULL) - dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); - else - dle = AVL_PREV(&dl->dl_tree, dle); - dle_enqueue(dl, dle, bp, tx); - mutex_exit(&dl->dl_lock); -} - -/* - * Insert new key in deadlist, which must be > all current entries. - * mintxg is not inclusive. - */ -void -dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) -{ - uint64_t obj; - dsl_deadlist_entry_t *dle; - - if (dl->dl_oldfmt) - return; - - dle = kmem_alloc(sizeof (*dle), KM_SLEEP); - dle->dle_mintxg = mintxg; - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - - obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); - avl_add(&dl->dl_tree, dle); - - VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, - mintxg, obj, tx)); - mutex_exit(&dl->dl_lock); -} - -/* - * Remove this key, merging its entries into the previous key. - */ -void -dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) -{ - dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle, *dle_prev; - - if (dl->dl_oldfmt) - return; - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - - dle_tofind.dle_mintxg = mintxg; - dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); - dle_prev = AVL_PREV(&dl->dl_tree, dle); - - dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); - - avl_remove(&dl->dl_tree, dle); - bpobj_close(&dle->dle_bpobj); - kmem_free(dle, sizeof (*dle)); - - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); - mutex_exit(&dl->dl_lock); -} - -/* - * Walk ds's snapshots to regenerate generate ZAP & AVL. 
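One way to read the bucket selection in dsl_deadlist_insert() above: a freed block born at txg B is charged to the entry with the largest dle_mintxg strictly below B (an exact key match steps back with AVL_PREV, because the bucket keyed K holds blocks born after K). A minimal standalone sketch of that rule over a plain sorted array, purely illustrative and not part of the ZFS sources:

#include <stddef.h>
#include <stdint.h>

/*
 * Pick the deadlist bucket for a block born at "birth", given the entry keys
 * (mintxg values) in ascending order.  Mirrors the avl_find() / AVL_PREV() /
 * avl_nearest(AVL_BEFORE) logic above; assumes at least one key lies below
 * "birth".
 */
static size_t
deadlist_bucket_for(const uint64_t *keys, size_t nkeys, uint64_t birth)
{
	size_t bucket = 0;

	for (size_t i = 0; i < nkeys; i++) {
		if (keys[i] < birth)	/* strictly below: key == birth steps back */
			bucket = i;
		else
			break;
	}
	return (bucket);
}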
- */ -static void -dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, - uint64_t mrs_obj, dmu_tx_t *tx) -{ - dsl_deadlist_t dl = { 0 }; - dsl_pool_t *dp = dmu_objset_pool(os); - - dsl_deadlist_open(&dl, os, dlobj); - if (dl.dl_oldfmt) { - dsl_deadlist_close(&dl); - return; - } - - while (mrs_obj != 0) { - dsl_dataset_t *ds; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); - dsl_deadlist_add_key(&dl, - dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); - mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; - dsl_dataset_rele(ds, FTAG); - } - dsl_deadlist_close(&dl); -} - -uint64_t -dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, - uint64_t mrs_obj, dmu_tx_t *tx) -{ - dsl_deadlist_entry_t *dle; - uint64_t newobj; - - newobj = dsl_deadlist_alloc(dl->dl_os, tx); - - if (dl->dl_oldfmt) { - dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); - return (newobj); - } - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - uint64_t obj; - - if (dle->dle_mintxg >= maxtxg) - break; - - obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, - dle->dle_mintxg, obj, tx)); - } - mutex_exit(&dl->dl_lock); - return (newobj); -} - -void -dsl_deadlist_space(dsl_deadlist_t *dl, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - ASSERT(dsl_deadlist_is_open(dl)); - if (dl->dl_oldfmt) { - VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, - usedp, compp, uncompp)); - return; - } - - mutex_enter(&dl->dl_lock); - *usedp = dl->dl_phys->dl_used; - *compp = dl->dl_phys->dl_comp; - *uncompp = dl->dl_phys->dl_uncomp; - mutex_exit(&dl->dl_lock); -} - -/* - * return space used in the range (mintxg, maxtxg]. - * Includes maxtxg, does not include mintxg. - * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is - * larger than any bp in the deadlist (eg. UINT64_MAX)). - */ -void -dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - dsl_deadlist_entry_t *dle; - dsl_deadlist_entry_t dle_tofind; - avl_index_t where; - - if (dl->dl_oldfmt) { - VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, - mintxg, maxtxg, usedp, compp, uncompp)); - return; - } - - *usedp = *compp = *uncompp = 0; - - mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - dle_tofind.dle_mintxg = mintxg; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); - /* - * If we don't find this mintxg, there shouldn't be anything - * after it either. 
- */ - ASSERT(dle != NULL || - avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); - - for (; dle && dle->dle_mintxg < maxtxg; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - uint64_t used, comp, uncomp; - - VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, - &used, &comp, &uncomp)); - - *usedp += used; - *compp += comp; - *uncompp += uncomp; - } - mutex_exit(&dl->dl_lock); -} - -static void -dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, - dmu_tx_t *tx) -{ - dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; - avl_index_t where; - uint64_t used, comp, uncomp; - bpobj_t bpo; - - ASSERT(MUTEX_HELD(&dl->dl_lock)); - - VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); - VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); - bpobj_close(&bpo); - - dsl_deadlist_load_tree(dl); - - dmu_buf_will_dirty(dl->dl_dbuf, tx); - dl->dl_phys->dl_used += used; - dl->dl_phys->dl_comp += comp; - dl->dl_phys->dl_uncomp += uncomp; - - dle_tofind.dle_mintxg = birth; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); - if (dle == NULL) - dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); - dle_enqueue_subobj(dl, dle, obj, tx); -} - -static int -dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_deadlist_t *dl = arg; - dsl_deadlist_insert(dl, bp, tx); - return (0); -} - -/* - * Merge the deadlist pointed to by 'obj' into dl. obj will be left as - * an empty deadlist. - */ -void -dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) -{ - zap_cursor_t zc; - zap_attribute_t za; - dmu_buf_t *bonus; - dsl_deadlist_phys_t *dlp; - dmu_object_info_t doi; - - VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); - if (doi.doi_type == DMU_OT_BPOBJ) { - bpobj_t bpo; - VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); - VERIFY3U(0, ==, bpobj_iterate(&bpo, - dsl_deadlist_insert_cb, dl, tx)); - bpobj_close(&bpo); - return; - } - - mutex_enter(&dl->dl_lock); - for (zap_cursor_init(&zc, dl->dl_os, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t mintxg = zfs_strtonum(za.za_name, NULL); - dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); - } - zap_cursor_fini(&zc); - - VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); - dlp = bonus->db_data; - dmu_buf_will_dirty(bonus, tx); - bzero(dlp, sizeof (*dlp)); - dmu_buf_rele(bonus, FTAG); - mutex_exit(&dl->dl_lock); -} - -/* - * Remove entries on dl that are >= mintxg, and put them on the bpobj. 
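The (mintxg, maxtxg] contract of dsl_deadlist_space_range() can be restated with a small example: with entries keyed 0, 100 and 200, a query for (100, UINT64_MAX] starts at the entry keyed 100 and sums every bucket whose key stays below the upper bound, i.e. all blocks born after txg 100. A standalone sketch of that summation, independent of the ZFS types and purely illustrative:

#include <stddef.h>
#include <stdint.h>

/*
 * Sum per-bucket space for the range (mintxg, maxtxg]: start at the entry
 * keyed exactly "mintxg" (the caller guarantees it exists, or that no key
 * follows it) and accumulate while the key is below "maxtxg", mirroring the
 * loop in dsl_deadlist_space_range() above.
 */
static uint64_t
deadlist_space_range(const uint64_t *keys, const uint64_t *used, size_t nkeys,
    uint64_t mintxg, uint64_t maxtxg)
{
	uint64_t sum = 0;
	size_t i = 0;

	while (i < nkeys && keys[i] != mintxg)	/* find the starting entry */
		i++;
	for (; i < nkeys && keys[i] < maxtxg; i++)
		sum += used[i];
	return (sum);
}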
- */ -void -dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, - dmu_tx_t *tx) -{ - dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; - avl_index_t where; - - ASSERT(!dl->dl_oldfmt); - - mutex_enter(&dl->dl_lock); - dmu_buf_will_dirty(dl->dl_dbuf, tx); - dsl_deadlist_load_tree(dl); - - dle_tofind.dle_mintxg = mintxg; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); - if (dle == NULL) - dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); - while (dle) { - uint64_t used, comp, uncomp; - dsl_deadlist_entry_t *dle_next; - - bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); - - VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, - &used, &comp, &uncomp)); - ASSERT3U(dl->dl_phys->dl_used, >=, used); - ASSERT3U(dl->dl_phys->dl_comp, >=, comp); - ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); - dl->dl_phys->dl_used -= used; - dl->dl_phys->dl_comp -= comp; - dl->dl_phys->dl_uncomp -= uncomp; - - VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, - dle->dle_mintxg, tx)); - - dle_next = AVL_NEXT(&dl->dl_tree, dle); - avl_remove(&dl->dl_tree, dle); - bpobj_close(&dle->dle_bpobj); - kmem_free(dle, sizeof (*dle)); - dle = dle_next; - } - mutex_exit(&dl->dl_lock); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c +++ /dev/null @@ -1,760 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - */ - -/* - * DSL permissions are stored in a two level zap attribute - * mechanism. The first level identifies the "class" of - * entry. The class is identified by the first 2 letters of - * the attribute. The second letter "l" or "d" identifies whether - * it is a local or descendent permission. The first letter - * identifies the type of entry. - * - * ul$ identifies permissions granted locally for this userid. - * ud$ identifies permissions granted on descendent datasets for - * this userid. - * Ul$ identifies permission sets granted locally for this userid. - * Ud$ identifies permission sets granted on descendent datasets for - * this userid. - * gl$ identifies permissions granted locally for this groupid. - * gd$ identifies permissions granted on descendent datasets for - * this groupid. - * Gl$ identifies permission sets granted locally for this groupid. - * Gd$ identifies permission sets granted on descendent datasets for - * this groupid. - * el$ identifies permissions granted locally for everyone. - * ed$ identifies permissions granted on descendent datasets - * for everyone. 
- * El$ identifies permission sets granted locally for everyone. - * Ed$ identifies permission sets granted to descendent datasets for - * everyone. - * c-$ identifies permission to create at dataset creation time. - * C-$ identifies permission sets to grant locally at dataset creation - * time. - * s-$@ permissions defined in specified set @ - * S-$@ Sets defined in named set @ - * - * Each of the above entities points to another zap attribute that contains one - * attribute for each allowed permission, such as create, destroy,... - * All of the "upper" case class types will specify permission set names - * rather than permissions. - * - * Basically it looks something like this: - * ul$12 -> ZAP OBJ -> permissions... - * - * The ZAP OBJ is referred to as the jump object. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_deleg.h" - -/* - * Validate that user is allowed to delegate specified permissions. - * - * In order to delegate "create" you must have "create" - * and "allow". - */ -int -dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) -{ - nvpair_t *whopair = NULL; - int error; - - if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) - return (error); - - while (whopair = nvlist_next_nvpair(nvp, whopair)) { - nvlist_t *perms; - nvpair_t *permpair = NULL; - - VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); - - while (permpair = nvlist_next_nvpair(perms, permpair)) { - const char *perm = nvpair_name(permpair); - - if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) - return (SET_ERROR(EPERM)); - - if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) - return (error); - } - } - return (0); -} - -/* - * Validate that user is allowed to unallow specified permissions. They - * must have the 'allow' permission, and even then can only unallow - * perms for their uid. 
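To make the naming scheme above concrete: granting uid 12 the "create" permission locally results in a first-level attribute "ul$12" whose value is the object number of a jump object, and that jump object in turn carries one attribute named "create". The helper below is only an approximation of how such a key is spelled (the real encoding lives in zfs_deleg_whokey()); the uid and layout shown are taken from the "ul$12 -> ZAP OBJ -> permissions" sketch above:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/*
 * Approximate spelling of a first-level delegation key: a who-type character
 * ('u', 'g', 'e', ...), a locality character ('l' or 'd'), a '$' separator
 * and the numeric id.  ('u', 'l', 12) -> "ul$12".
 */
static void
deleg_whokey(char *buf, size_t len, char who_type, char locality, uint64_t id)
{
	(void) snprintf(buf, len, "%c%c$%" PRIu64, who_type, locality, id);
}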
- */ -int -dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) -{ - nvpair_t *whopair = NULL; - int error; - char idstr[32]; - - if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) - return (error); - - (void) snprintf(idstr, sizeof (idstr), "%lld", - (longlong_t)crgetuid(cr)); - - while (whopair = nvlist_next_nvpair(nvp, whopair)) { - zfs_deleg_who_type_t type = nvpair_name(whopair)[0]; - - if (type != ZFS_DELEG_USER && - type != ZFS_DELEG_USER_SETS) - return (SET_ERROR(EPERM)); - - if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) - return (SET_ERROR(EPERM)); - } - return (0); -} - -typedef struct dsl_deleg_arg { - const char *dda_name; - nvlist_t *dda_nvlist; -} dsl_deleg_arg_t; - -static void -dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) -{ - dsl_deleg_arg_t *dda = arg; - dsl_dir_t *dd; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - nvpair_t *whopair = NULL; - uint64_t zapobj; - - VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); - - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - if (zapobj == 0) { - dmu_buf_will_dirty(dd->dd_dbuf, tx); - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, - DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); - } - - while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { - const char *whokey = nvpair_name(whopair); - nvlist_t *perms; - nvpair_t *permpair = NULL; - uint64_t jumpobj; - - perms = fnvpair_value_nvlist(whopair); - - if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { - jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, - zapobj, whokey, tx); - } - - while (permpair = nvlist_next_nvpair(perms, permpair)) { - const char *perm = nvpair_name(permpair); - uint64_t n = 0; - - VERIFY(zap_update(mos, jumpobj, - perm, 8, 1, &n, tx) == 0); - spa_history_log_internal_dd(dd, "permission update", tx, - "%s %s", whokey, perm); - } - } - dsl_dir_rele(dd, FTAG); -} - -static void -dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) -{ - dsl_deleg_arg_t *dda = arg; - dsl_dir_t *dd; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - nvpair_t *whopair = NULL; - uint64_t zapobj; - - VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - if (zapobj == 0) { - dsl_dir_rele(dd, FTAG); - return; - } - - while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { - const char *whokey = nvpair_name(whopair); - nvlist_t *perms; - nvpair_t *permpair = NULL; - uint64_t jumpobj; - - if (nvpair_value_nvlist(whopair, &perms) != 0) { - if (zap_lookup(mos, zapobj, whokey, 8, - 1, &jumpobj) == 0) { - (void) zap_remove(mos, zapobj, whokey, tx); - VERIFY(0 == zap_destroy(mos, jumpobj, tx)); - } - spa_history_log_internal_dd(dd, "permission who remove", - tx, "%s", whokey); - continue; - } - - if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) - continue; - - while (permpair = nvlist_next_nvpair(perms, permpair)) { - const char *perm = nvpair_name(permpair); - uint64_t n = 0; - - (void) zap_remove(mos, jumpobj, perm, tx); - if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { - (void) zap_remove(mos, zapobj, - whokey, tx); - VERIFY(0 == zap_destroy(mos, - jumpobj, tx)); - } - spa_history_log_internal_dd(dd, "permission remove", tx, - "%s %s", whokey, perm); - } - } - dsl_dir_rele(dd, FTAG); -} - -static int -dsl_deleg_check(void *arg, dmu_tx_t *tx) -{ - dsl_deleg_arg_t *dda = arg; - dsl_dir_t *dd; - int error; - - if (spa_version(dmu_tx_pool(tx)->dp_spa) < - SPA_VERSION_DELEGATED_PERMS) { - return (SET_ERROR(ENOTSUP)); - } 
- - error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL); - if (error == 0) - dsl_dir_rele(dd, FTAG); - return (error); -} - -int -dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) -{ - dsl_deleg_arg_t dda; - - /* nvp must already have been verified to be valid */ - - dda.dda_name = ddname; - dda.dda_nvlist = nvp; - - return (dsl_sync_task(ddname, dsl_deleg_check, - unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, - &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED)); -} - -/* - * Find all 'allow' permissions from a given point and then continue - * traversing up to the root. - * - * This function constructs an nvlist of nvlists. - * each setpoint is an nvlist composed of an nvlist of an nvlist - * of the individual * users/groups/everyone/create - * permissions. - * - * The nvlist will look like this. - * - * { source fsname -> { whokeys { permissions,...}, ...}} - * - * The fsname nvpairs will be arranged in a bottom up order. For example, - * if we have the following structure a/b/c then the nvpairs for the fsnames - * will be ordered a/b/c, a/b, a. - */ -int -dsl_deleg_get(const char *ddname, nvlist_t **nvp) -{ - dsl_dir_t *dd, *startdd; - dsl_pool_t *dp; - int error; - objset_t *mos; - - error = dsl_pool_hold(ddname, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - dp = startdd->dd_pool; - mos = dp->dp_meta_objset; - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - for (dd = startdd; dd != NULL; dd = dd->dd_parent) { - zap_cursor_t basezc; - zap_attribute_t baseza; - nvlist_t *sp_nvp; - uint64_t n; - char source[ZFS_MAX_DATASET_NAME_LEN]; - - if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 || - zap_count(mos, - dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0) - continue; - - sp_nvp = fnvlist_alloc(); - for (zap_cursor_init(&basezc, mos, - dsl_dir_phys(dd)->dd_deleg_zapobj); - zap_cursor_retrieve(&basezc, &baseza) == 0; - zap_cursor_advance(&basezc)) { - zap_cursor_t zc; - zap_attribute_t za; - nvlist_t *perms_nvp; - - ASSERT(baseza.za_integer_length == 8); - ASSERT(baseza.za_num_integers == 1); - - perms_nvp = fnvlist_alloc(); - for (zap_cursor_init(&zc, mos, baseza.za_first_integer); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - fnvlist_add_boolean(perms_nvp, za.za_name); - } - zap_cursor_fini(&zc); - fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp); - fnvlist_free(perms_nvp); - } - - zap_cursor_fini(&basezc); - - dsl_dir_name(dd, source); - fnvlist_add_nvlist(*nvp, source, sp_nvp); - nvlist_free(sp_nvp); - } - - dsl_dir_rele(startdd, FTAG); - dsl_pool_rele(dp, FTAG); - return (0); -} - -/* - * Routines for dsl_deleg_access() -- access checking. - */ -typedef struct perm_set { - avl_node_t p_node; - boolean_t p_matched; - char p_setname[ZFS_MAX_DELEG_NAME]; -} perm_set_t; - -static int -perm_set_compare(const void *arg1, const void *arg2) -{ - const perm_set_t *node1 = (const perm_set_t *)arg1; - const perm_set_t *node2 = (const perm_set_t *)arg2; - int val; - - val = strcmp(node1->p_setname, node2->p_setname); - - return (AVL_ISIGN(val)); -} - -/* - * Determine whether a specified permission exists. - * - * First the base attribute has to be retrieved. i.e. ul$12 - * Once the base object has been retrieved the actual permission - * is lookup up in the zap object the base object points to. 
- * - * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if - * there is no perm in that jumpobj. - */ -static int -dsl_check_access(objset_t *mos, uint64_t zapobj, - char type, char checkflag, void *valp, const char *perm) -{ - int error; - uint64_t jumpobj, zero; - char whokey[ZFS_MAX_DELEG_NAME]; - - zfs_deleg_whokey(whokey, type, checkflag, valp); - error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); - if (error == 0) { - error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); - if (error == ENOENT) - error = SET_ERROR(EPERM); - } - return (error); -} - -/* - * check a specified user/group for a requested permission - */ -static int -dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, - int checkflag, cred_t *cr) -{ - const gid_t *gids; - int ngids; - int i; - uint64_t id; - - /* check for user */ - id = crgetuid(cr); - if (dsl_check_access(mos, zapobj, - ZFS_DELEG_USER, checkflag, &id, perm) == 0) - return (0); - - /* check for users primary group */ - id = crgetgid(cr); - if (dsl_check_access(mos, zapobj, - ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) - return (0); - - /* check for everyone entry */ - id = -1; - if (dsl_check_access(mos, zapobj, - ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0) - return (0); - - /* check each supplemental group user is a member of */ - ngids = crgetngroups(cr); - gids = crgetgroups(cr); - for (i = 0; i != ngids; i++) { - id = gids[i]; - if (dsl_check_access(mos, zapobj, - ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) - return (0); - } - - return (SET_ERROR(EPERM)); -} - -/* - * Iterate over the sets specified in the specified zapobj - * and load them into the permsets avl tree. - */ -static int -dsl_load_sets(objset_t *mos, uint64_t zapobj, - char type, char checkflag, void *valp, avl_tree_t *avl) -{ - zap_cursor_t zc; - zap_attribute_t za; - perm_set_t *permnode; - avl_index_t idx; - uint64_t jumpobj; - int error; - char whokey[ZFS_MAX_DELEG_NAME]; - - zfs_deleg_whokey(whokey, type, checkflag, valp); - - error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); - if (error != 0) - return (error); - - for (zap_cursor_init(&zc, mos, jumpobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP); - (void) strlcpy(permnode->p_setname, za.za_name, - sizeof (permnode->p_setname)); - permnode->p_matched = B_FALSE; - - if (avl_find(avl, permnode, &idx) == NULL) { - avl_insert(avl, permnode, idx); - } else { - kmem_free(permnode, sizeof (perm_set_t)); - } - } - zap_cursor_fini(&zc); - return (0); -} - -/* - * Load all permissions user based on cred belongs to. - */ -static void -dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, - char checkflag, cred_t *cr) -{ - const gid_t *gids; - int ngids, i; - uint64_t id; - - id = crgetuid(cr); - (void) dsl_load_sets(mos, zapobj, - ZFS_DELEG_USER_SETS, checkflag, &id, avl); - - id = crgetgid(cr); - (void) dsl_load_sets(mos, zapobj, - ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); - - (void) dsl_load_sets(mos, zapobj, - ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl); - - ngids = crgetngroups(cr); - gids = crgetgroups(cr); - for (i = 0; i != ngids; i++) { - id = gids[i]; - (void) dsl_load_sets(mos, zapobj, - ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); - } -} - -/* - * Check if user has requested permission. 
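The lookup order in dsl_check_user_access() above is worth spelling out: the caller's uid is tried first, then the primary group, then the "everyone" entry (id -1), and finally each supplemental group, with the first successful lookup granting access. A minimal standalone model of that order, where check() stands in for a ZAP lookup against the delegation object and is a hypothetical callback, not a ZFS interface:

#include <errno.h>
#include <stdint.h>

typedef int (*deleg_check_fn_t)(char who_type, uint64_t id);

/*
 * First match wins; mirrors the sequence of dsl_check_access() calls in
 * dsl_check_user_access() above.  Returns 0 on a hit, EPERM otherwise.
 */
static int
check_user_access(deleg_check_fn_t check, uint64_t uid, uint64_t gid,
    const uint64_t *groups, int ngroups)
{
	if (check('u', uid) == 0)		/* user entries (ul$/ud$) */
		return (0);
	if (check('g', gid) == 0)		/* primary group (gl$/gd$) */
		return (0);
	if (check('e', (uint64_t)-1) == 0)	/* everyone (el$/ed$) */
		return (0);
	for (int i = 0; i < ngroups; i++)	/* supplemental groups */
		if (check('g', groups[i]) == 0)
			return (0);
	return (EPERM);
}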
- */ -int -dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) -{ - dsl_dir_t *dd; - dsl_pool_t *dp; - void *cookie; - int error; - char checkflag; - objset_t *mos; - avl_tree_t permsets; - perm_set_t *setnode; - - dp = ds->ds_dir->dd_pool; - mos = dp->dp_meta_objset; - - if (dsl_delegation_on(mos) == B_FALSE) - return (SET_ERROR(ECANCELED)); - - if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < - SPA_VERSION_DELEGATED_PERMS) - return (SET_ERROR(EPERM)); - - if (ds->ds_is_snapshot) { - /* - * Snapshots are treated as descendents only, - * local permissions do not apply. - */ - checkflag = ZFS_DELEG_DESCENDENT; - } else { - checkflag = ZFS_DELEG_LOCAL; - } - - avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), - offsetof(perm_set_t, p_node)); - - ASSERT(dsl_pool_config_held(dp)); - for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, - checkflag = ZFS_DELEG_DESCENDENT) { - uint64_t zapobj; - boolean_t expanded; - - /* - * If not in global zone then make sure - * the zoned property is set - */ - if (!INGLOBALZONE(curthread)) { - uint64_t zoned; - - if (dsl_prop_get_dd(dd, - zfs_prop_to_name(ZFS_PROP_ZONED), - 8, 1, &zoned, NULL, B_FALSE) != 0) - break; - if (!zoned) - break; - } - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - - if (zapobj == 0) - continue; - - dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); -again: - expanded = B_FALSE; - for (setnode = avl_first(&permsets); setnode; - setnode = AVL_NEXT(&permsets, setnode)) { - if (setnode->p_matched == B_TRUE) - continue; - - /* See if this set directly grants this permission */ - error = dsl_check_access(mos, zapobj, - ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); - if (error == 0) - goto success; - if (error == EPERM) - setnode->p_matched = B_TRUE; - - /* See if this set includes other sets */ - error = dsl_load_sets(mos, zapobj, - ZFS_DELEG_NAMED_SET_SETS, 0, - setnode->p_setname, &permsets); - if (error == 0) - setnode->p_matched = expanded = B_TRUE; - } - /* - * If we expanded any sets, that will define more sets, - * which we need to check. - */ - if (expanded) - goto again; - - error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); - if (error == 0) - goto success; - } - error = SET_ERROR(EPERM); -success: - - cookie = NULL; - while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) - kmem_free(setnode, sizeof (perm_set_t)); - - return (error); -} - -int -dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int error; - - error = dsl_pool_hold(dsname, FTAG, &dp); - if (error != 0) - return (error); - error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (error == 0) { - error = dsl_deleg_access_impl(ds, perm, cr); - dsl_dataset_rele(ds, FTAG); - } - dsl_pool_rele(dp, FTAG); - - return (error); -} - -/* - * Other routines. - */ - -static void -copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, - boolean_t dosets, uint64_t uid, dmu_tx_t *tx) -{ - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t jumpobj, pjumpobj; - uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - zap_cursor_t zc; - zap_attribute_t za; - char whokey[ZFS_MAX_DELEG_NAME]; - - zfs_deleg_whokey(whokey, - dosets ? 
ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, - ZFS_DELEG_LOCAL, NULL); - if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) - return; - - if (zapobj == 0) { - dmu_buf_will_dirty(dd->dd_dbuf, tx); - zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, - DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); - } - - zfs_deleg_whokey(whokey, - dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, - ZFS_DELEG_LOCAL, &uid); - if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { - jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); - } - - for (zap_cursor_init(&zc, mos, pjumpobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t zero = 0; - ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); - - VERIFY(zap_update(mos, jumpobj, za.za_name, - 8, 1, &zero, tx) == 0); - } - zap_cursor_fini(&zc); -} - -/* - * set all create time permission on new dataset. - */ -void -dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) -{ - dsl_dir_t *dd; - uint64_t uid = crgetuid(cr); - - if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < - SPA_VERSION_DELEGATED_PERMS) - return; - - for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { - uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; - - if (pzapobj == 0) - continue; - - copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); - copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); - } -} - -int -dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) -{ - zap_cursor_t zc; - zap_attribute_t za; - - if (zapobj == 0) - return (0); - - for (zap_cursor_init(&zc, mos, zapobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); - VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); - } - zap_cursor_fini(&zc); - VERIFY(0 == zap_destroy(mos, zapobj, tx)); - return (0); -} - -boolean_t -dsl_delegation_on(objset_t *os) -{ - return (!!spa_delegation(os->os_spa)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c +++ /dev/null @@ -1,1097 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2013 by Joyent, Inc. All rights reserved. 
- * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(__FreeBSD__) && defined(_KERNEL) -#include -#endif - - -int -dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) -{ - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - if (dsl_dataset_long_held(ds)) - return (SET_ERROR(EBUSY)); - - /* - * Only allow deferred destroy on pools that support it. - * NOTE: deferred destroy is only supported on snapshots. - */ - if (defer) { - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < - SPA_VERSION_USERREFS) - return (SET_ERROR(ENOTSUP)); - return (0); - } - - /* - * If this snapshot has an elevated user reference count, - * we can't destroy it yet. - */ - if (ds->ds_userrefs > 0) - return (SET_ERROR(EBUSY)); - - /* - * Can't delete a branch point. - */ - if (dsl_dataset_phys(ds)->ds_num_children > 1) - return (SET_ERROR(EEXIST)); - - return (0); -} - -int -dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_snapshot_arg_t *ddsa = arg; - const char *dsname = ddsa->ddsa_name; - boolean_t defer = ddsa->ddsa_defer; - - dsl_pool_t *dp = dmu_tx_pool(tx); - int error = 0; - dsl_dataset_t *ds; - - error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - - /* - * If the snapshot does not exist, silently ignore it, and - * dsl_destroy_snapshot_sync() will be a no-op - * (it's "already destroyed"). - */ - if (error == ENOENT) - return (0); - - if (error == 0) { - error = dsl_destroy_snapshot_check_impl(ds, defer); - dsl_dataset_rele(ds, FTAG); - } - - return (error); -} - -struct process_old_arg { - dsl_dataset_t *ds; - dsl_dataset_t *ds_prev; - boolean_t after_branch_point; - zio_t *pio; - uint64_t used, comp, uncomp; -}; - -static int -process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - struct process_old_arg *poa = arg; - dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; - - ASSERT(!BP_IS_HOLE(bp)); - - if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { - dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); - if (poa->ds_prev && !poa->after_branch_point && - bp->blk_birth > - dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { - dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += - bp_get_dsize_sync(dp->dp_spa, bp); - } - } else { - poa->used += bp_get_dsize_sync(dp->dp_spa, bp); - poa->comp += BP_GET_PSIZE(bp); - poa->uncomp += BP_GET_UCSIZE(bp); - dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); - } - return (0); -} - -static void -process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, - dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) -{ - struct process_old_arg poa = { 0 }; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t deadlist_obj; - - ASSERT(ds->ds_deadlist.dl_oldfmt); - ASSERT(ds_next->ds_deadlist.dl_oldfmt); - - poa.ds = ds; - poa.ds_prev = ds_prev; - poa.after_branch_point = after_branch_point; - poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, - process_old_cb, &poa, tx)); - VERIFY0(zio_wait(poa.pio)); - ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes); - - /* change snapused */ - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -poa.used, -poa.comp, -poa.uncomp, tx); - - /* swap next's deadlist to our deadlist */ - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_close(&ds_next->ds_deadlist); - deadlist_obj = 
dsl_dataset_phys(ds)->ds_deadlist_obj; - dsl_dataset_phys(ds)->ds_deadlist_obj = - dsl_dataset_phys(ds_next)->ds_deadlist_obj; - dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj; - dsl_deadlist_open(&ds->ds_deadlist, mos, - dsl_dataset_phys(ds)->ds_deadlist_obj); - dsl_deadlist_open(&ds_next->ds_deadlist, mos, - dsl_dataset_phys(ds_next)->ds_deadlist_obj); -} - -static void -dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; - - /* - * If it is the old version, dd_clones doesn't exist so we can't - * find the clones, but dsl_deadlist_remove_key() is a no-op so it - * doesn't matter. - */ - if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0) - return; - - for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *clone; - - VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone)); - if (clone->ds_dir->dd_origin_txg > mintxg) { - dsl_deadlist_remove_key(&clone->ds_deadlist, - mintxg, tx); - if (dsl_dataset_remap_deadlist_exists(clone)) { - dsl_deadlist_remove_key( - &clone->ds_remap_deadlist, mintxg, tx); - } - dsl_dataset_remove_clones_key(clone, mintxg, tx); - } - dsl_dataset_rele(clone, FTAG); - } - zap_cursor_fini(&zc); -} - -static void -dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next, - dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* Move blocks to be obsoleted to pool's obsolete list. */ - if (dsl_dataset_remap_deadlist_exists(ds_next)) { - if (!bpobj_is_open(&dp->dp_obsolete_bpobj)) - dsl_pool_create_obsolete_bpobj(dp, tx); - - dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist, - &dp->dp_obsolete_bpobj, - dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); - } - - /* Merge our deadlist into next's and free it. */ - if (dsl_dataset_remap_deadlist_exists(ds)) { - uint64_t remap_deadlist_object = - dsl_dataset_get_remap_deadlist_object(ds); - ASSERT(remap_deadlist_object != 0); - - mutex_enter(&ds_next->ds_remap_deadlist_lock); - if (!dsl_dataset_remap_deadlist_exists(ds_next)) - dsl_dataset_create_remap_deadlist(ds_next, tx); - mutex_exit(&ds_next->ds_remap_deadlist_lock); - - dsl_deadlist_merge(&ds_next->ds_remap_deadlist, - remap_deadlist_object, tx); - dsl_dataset_destroy_remap_deadlist(ds, tx); - } -} - -void -dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) -{ - int err; - int after_branch_point = FALSE; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - dsl_dataset_t *ds_prev = NULL; - uint64_t obj; - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - ASSERT(zfs_refcount_is_zero(&ds->ds_longholds)); - - if (defer && - (ds->ds_userrefs > 0 || - dsl_dataset_phys(ds)->ds_num_children > 1)) { - ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; - spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); - return; - } - - ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); - - /* We need to log before removing it from the namespace. 
*/ - spa_history_log_internal_ds(ds, "destroy", tx, ""); - - dsl_scan_ds_destroyed(ds, tx); - - obj = ds->ds_object; - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_inuse[f]) { - dsl_dataset_deactivate_feature(obj, f, tx); - ds->ds_feature_inuse[f] = B_FALSE; - } - } - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - ASSERT3P(ds->ds_prev, ==, NULL); - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev)); - after_branch_point = - (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj); - - dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); - if (after_branch_point && - dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) { - dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); - if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { - VERIFY0(zap_add_int(mos, - dsl_dataset_phys(ds_prev)-> - ds_next_clones_obj, - dsl_dataset_phys(ds)->ds_next_snap_obj, - tx)); - } - } - if (!after_branch_point) { - dsl_dataset_phys(ds_prev)->ds_next_snap_obj = - dsl_dataset_phys(ds)->ds_next_snap_obj; - } - } - - dsl_dataset_t *ds_next; - uint64_t old_unique; - uint64_t used = 0, comp = 0, uncomp = 0; - - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next)); - ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj); - - old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes; - - dmu_buf_will_dirty(ds_next->ds_dbuf, tx); - dsl_dataset_phys(ds_next)->ds_prev_snap_obj = - dsl_dataset_phys(ds)->ds_prev_snap_obj; - dsl_dataset_phys(ds_next)->ds_prev_snap_txg = - dsl_dataset_phys(ds)->ds_prev_snap_txg; - ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, - ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0); - - if (ds_next->ds_deadlist.dl_oldfmt) { - process_old_deadlist(ds, ds_prev, ds_next, - after_branch_point, tx); - } else { - /* Adjust prev's unique space. */ - if (ds_prev && !after_branch_point) { - dsl_deadlist_space_range(&ds_next->ds_deadlist, - dsl_dataset_phys(ds_prev)->ds_prev_snap_txg, - dsl_dataset_phys(ds)->ds_prev_snap_txg, - &used, &comp, &uncomp); - dsl_dataset_phys(ds_prev)->ds_unique_bytes += used; - } - - /* Adjust snapused. */ - dsl_deadlist_space_range(&ds_next->ds_deadlist, - dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX, - &used, &comp, &uncomp); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -used, -comp, -uncomp, tx); - - /* Move blocks to be freed to pool's free list. */ - dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, - &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg, - tx); - dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, - DD_USED_HEAD, used, comp, uncomp, tx); - - /* Merge our deadlist into next's and free it. */ - dsl_deadlist_merge(&ds_next->ds_deadlist, - dsl_dataset_phys(ds)->ds_deadlist_obj, tx); - } - - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_deadlist_obj = 0; - - dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx); - - /* Collapse range in clone heads */ - dsl_dataset_remove_clones_key(ds, - dsl_dataset_phys(ds)->ds_creation_txg, tx); - - if (ds_next->ds_is_snapshot) { - dsl_dataset_t *ds_nextnext; - - /* - * Update next's unique to include blocks which - * were previously shared by only this snapshot - * and it. Those blocks will be born after the - * prev snap and before this snap, and will have - * died after the next snap and before the one - * after that (ie. be on the snap after next's - * deadlist). 
- */ - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds_next)->ds_next_snap_obj, - FTAG, &ds_nextnext)); - dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, - dsl_dataset_phys(ds)->ds_prev_snap_txg, - dsl_dataset_phys(ds)->ds_creation_txg, - &used, &comp, &uncomp); - dsl_dataset_phys(ds_next)->ds_unique_bytes += used; - dsl_dataset_rele(ds_nextnext, FTAG); - ASSERT3P(ds_next->ds_prev, ==, NULL); - - /* Collapse range in this head. */ - dsl_dataset_t *hds; - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); - dsl_deadlist_remove_key(&hds->ds_deadlist, - dsl_dataset_phys(ds)->ds_creation_txg, tx); - if (dsl_dataset_remap_deadlist_exists(hds)) { - dsl_deadlist_remove_key(&hds->ds_remap_deadlist, - dsl_dataset_phys(ds)->ds_creation_txg, tx); - } - dsl_dataset_rele(hds, FTAG); - - } else { - ASSERT3P(ds_next->ds_prev, ==, ds); - dsl_dataset_rele(ds_next->ds_prev, ds_next); - ds_next->ds_prev = NULL; - if (ds_prev) { - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, - ds_next, &ds_next->ds_prev)); - } - - dsl_dataset_recalc_head_uniq(ds_next); - - /* - * Reduce the amount of our unconsumed refreservation - * being charged to our parent by the amount of - * new unique data we have gained. - */ - if (old_unique < ds_next->ds_reserved) { - int64_t mrsdelta; - uint64_t new_unique = - dsl_dataset_phys(ds_next)->ds_unique_bytes; - - ASSERT(old_unique <= new_unique); - mrsdelta = MIN(new_unique - old_unique, - ds_next->ds_reserved - old_unique); - dsl_dir_diduse_space(ds->ds_dir, - DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); - } - } - dsl_dataset_rele(ds_next, FTAG); - - /* - * This must be done after the dsl_traverse(), because it will - * re-open the objset. - */ - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - - /* remove from snapshot namespace */ - dsl_dataset_t *ds_head; - ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0); - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head)); - VERIFY0(dsl_dataset_get_snapname(ds)); -#ifdef ZFS_DEBUG - { - uint64_t val; - - err = dsl_dataset_snap_lookup(ds_head, - ds->ds_snapname, &val); - ASSERT0(err); - ASSERT3U(val, ==, obj); - } -#endif - VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE)); - dsl_dataset_rele(ds_head, FTAG); - - if (ds_prev != NULL) - dsl_dataset_rele(ds_prev, FTAG); - - spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - - if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { - uint64_t count; - ASSERT0(zap_count(mos, - dsl_dataset_phys(ds)->ds_next_clones_obj, &count) && - count == 0); - VERIFY0(dmu_object_free(mos, - dsl_dataset_phys(ds)->ds_next_clones_obj, tx)); - } - if (dsl_dataset_phys(ds)->ds_props_obj != 0) - VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj, - tx)); - if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) - VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, - tx)); - -#if defined(__FreeBSD__) && defined(_KERNEL) - char dsname[ZFS_MAX_DATASET_NAME_LEN]; - - dsl_dataset_name(ds, dsname); - zvol_remove_minors(dp->dp_spa, dsname); -#endif - - dsl_dir_rele(ds->ds_dir, ds); - ds->ds_dir = NULL; - dmu_object_free_zapified(mos, obj, tx); -} - -void -dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_snapshot_arg_t *ddsa = arg; - const char *dsname = ddsa->ddsa_name; - boolean_t defer = ddsa->ddsa_defer; - - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - int error = dsl_dataset_hold(dp, 
dsname, FTAG, &ds); - if (error == ENOENT) - return; - ASSERT0(error); - dsl_destroy_snapshot_sync_impl(ds, defer, tx); - dsl_dataset_rele(ds, FTAG); -} - -/* - * The semantics of this function are described in the comment above - * lzc_destroy_snaps(). To summarize: - * - * The snapshots must all be in the same pool. - * - * Snapshots that don't exist will be silently ignored (considered to be - * "already deleted"). - * - * On success, all snaps will be destroyed and this will return 0. - * On failure, no snaps will be destroyed, the errlist will be filled in, - * and this will return an errno. - */ -int -dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, - nvlist_t *errlist) -{ - if (nvlist_next_nvpair(snaps, NULL) == NULL) - return (0); - - /* - * lzc_destroy_snaps() is documented to take an nvlist whose - * values "don't matter". We need to convert that nvlist to - * one that we know can be converted to LUA. We also don't - * care about any duplicate entries because the nvlist will - * be converted to a LUA table which should take care of this. - */ - nvlist_t *snaps_normalized; - VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP)); - for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL); - pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { - fnvlist_add_boolean_value(snaps_normalized, - nvpair_name(pair), B_TRUE); - } - - nvlist_t *arg; - VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP)); - fnvlist_add_nvlist(arg, "snaps", snaps_normalized); - fnvlist_free(snaps_normalized); - fnvlist_add_boolean_value(arg, "defer", defer); - - nvlist_t *wrapper; - VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP)); - fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg); - fnvlist_free(arg); - - const char *program = - "arg = ...\n" - "snaps = arg['snaps']\n" - "defer = arg['defer']\n" - "errors = { }\n" - "has_errors = false\n" - "for snap, v in pairs(snaps) do\n" - " errno = zfs.check.destroy{snap, defer=defer}\n" - " zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n" - " if errno == ENOENT then\n" - " snaps[snap] = nil\n" - " elseif errno ~= 0 then\n" - " errors[snap] = errno\n" - " has_errors = true\n" - " end\n" - "end\n" - "if has_errors then\n" - " return errors\n" - "end\n" - "for snap, v in pairs(snaps) do\n" - " errno = zfs.sync.destroy{snap, defer=defer}\n" - " assert(errno == 0)\n" - "end\n" - "return { }\n"; - - nvlist_t *result = fnvlist_alloc(); - int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)), - program, - B_TRUE, - 0, - zfs_lua_max_memlimit, - nvlist_next_nvpair(wrapper, NULL), result); - if (error != 0) { - char *errorstr = NULL; - (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr); - if (errorstr != NULL) { - zfs_dbgmsg(errorstr); - } - return (error); - } - fnvlist_free(wrapper); - - /* - * lzc_destroy_snaps() is documented to fill the errlist with - * int32 values, so we need to covert the int64 values that are - * returned from LUA. 
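From userland this path is reached through libzfs_core: the caller hands lzc_destroy_snaps() an nvlist keyed by snapshot name (the values are ignored, as noted above) and gets any per-snapshot errors back in the errlist. A minimal sketch of such a caller; the pool and snapshot names are invented for the example:

#include <libzfs_core.h>
#include <libnvpair.h>
#include <stdio.h>

int
main(void)
{
	nvlist_t *snaps, *errlist = NULL;
	int err;

	if (libzfs_core_init() != 0)
		return (1);

	/* Only the names in "snaps" matter; the values are ignored. */
	snaps = fnvlist_alloc();
	fnvlist_add_boolean(snaps, "tank/home@old1");
	fnvlist_add_boolean(snaps, "tank/home@old2");

	/* Passing B_TRUE instead would defer the destroy, as "zfs destroy -d" does. */
	err = lzc_destroy_snaps(snaps, B_FALSE, &errlist);
	if (err != 0)
		(void) fprintf(stderr, "lzc_destroy_snaps: %d\n", err);

	fnvlist_free(snaps);
	if (errlist != NULL)
		nvlist_free(errlist);
	libzfs_core_fini();
	return (err != 0);
}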
- */ - int rv = 0; - nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN); - for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL); - pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) { - int32_t val = (int32_t)fnvpair_value_int64(pair); - if (rv == 0) - rv = val; - fnvlist_add_int32(errlist, nvpair_name(pair), val); - } - fnvlist_free(result); - return (rv); -} - -int -dsl_destroy_snapshot(const char *name, boolean_t defer) -{ - int error; - nvlist_t *nvl = fnvlist_alloc(); - nvlist_t *errlist = fnvlist_alloc(); - - fnvlist_add_boolean(nvl, name); - error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); - fnvlist_free(errlist); - fnvlist_free(nvl); - return (error); -} - -struct killarg { - dsl_dataset_t *ds; - dmu_tx_t *tx; -}; - -/* ARGSUSED */ -static int -kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct killarg *ka = arg; - dmu_tx_t *tx = ka->tx; - - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return (0); - - if (zb->zb_level == ZB_ZIL_LEVEL) { - ASSERT(zilog != NULL); - /* - * It's a block in the intent log. It has no - * accounting, so just free it. - */ - dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); - } else { - ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, - dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); - (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); - } - - return (0); -} - -static void -old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - struct killarg ka; - - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * NB: this should be very quick, because we already - * freed all the objects in open context. - */ - ka.ds = ds; - ka.tx = tx; - VERIFY0(traverse_dataset(ds, - dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST, - kill_blkptr, &ka)); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - dsl_dataset_phys(ds)->ds_unique_bytes == 0); -} - -int -dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) -{ - int error; - uint64_t count; - objset_t *mos; - - ASSERT(!ds->ds_is_snapshot); - if (ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - if (zfs_refcount_count(&ds->ds_longholds) != expected_holds) - return (SET_ERROR(EBUSY)); - - mos = ds->ds_dir->dd_pool->dp_meta_objset; - - /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) - */ - if (ds->ds_prev != NULL && - dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) - return (SET_ERROR(EBUSY)); - - /* - * Can't delete if there are children of this fs. - */ - error = zap_count(mos, - dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count); - if (error != 0) - return (error); - if (count != 0) - return (SET_ERROR(EEXIST)); - - if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && - dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && - ds->ds_prev->ds_userrefs == 0) { - /* We need to remove the origin snapshot as well. 
*/ - if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds)) - return (SET_ERROR(EBUSY)); - } - return (0); -} - -int -dsl_destroy_head_check(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_head_arg_t *ddha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - - error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); - if (error != 0) - return (error); - - error = dsl_destroy_head_check_impl(ds, 0); - dsl_dataset_rele(ds, FTAG); - return (error); -} - -static void -dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) -{ - dsl_dir_t *dd; - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - dd_used_t t; - - ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); - - VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); - - ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj); - - /* - * Decrement the filesystem count for all parent filesystems. - * - * When we receive an incremental stream into a filesystem that already - * exists, a temporary clone is created. We never count this temporary - * clone, whose name begins with a '%'. - */ - if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL) - dsl_fs_ss_count_adjust(dd->dd_parent, -1, - DD_FIELD_FILESYSTEM_COUNT, tx); - - /* - * Remove our reservation. The impl() routine avoids setting the - * actual property, which would require the (already destroyed) ds. - */ - dsl_dir_set_reservation_sync_impl(dd, 0, tx); - - ASSERT0(dsl_dir_phys(dd)->dd_used_bytes); - ASSERT0(dsl_dir_phys(dd)->dd_reserved); - for (t = 0; t < DD_USED_NUM; t++) - ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]); - - VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx)); - VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx)); - if (dsl_dir_phys(dd)->dd_clones != 0) - VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx)); - VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx)); - VERIFY0(zap_remove(mos, - dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, - dd->dd_myname, tx)); - - dsl_dir_rele(dd, FTAG); - dmu_object_free_zapified(mos, ddobj, tx); -} - -void -dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - uint64_t obj, ddobj, prevobj = 0; - boolean_t rmorigin; - - ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); - ASSERT(ds->ds_prev == NULL || - dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - /* We need to log before removing it from the namespace. */ - spa_history_log_internal_ds(ds, "destroy", tx, ""); - - rmorigin = (dsl_dir_is_clone(ds->ds_dir) && - DS_IS_DEFER_DESTROY(ds->ds_prev) && - dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && - ds->ds_prev->ds_userrefs == 0); - - /* Remove our reservation. 
*/ - if (ds->ds_reserved != 0) { - dsl_dataset_set_refreservation_sync_impl(ds, - (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), - 0, tx); - ASSERT0(ds->ds_reserved); - } - - obj = ds->ds_object; - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (ds->ds_feature_inuse[f]) { - dsl_dataset_deactivate_feature(obj, f, tx); - ds->ds_feature_inuse[f] = B_FALSE; - } - } - - dsl_scan_ds_destroyed(ds, tx); - - if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - /* This is a clone */ - ASSERT(ds->ds_prev != NULL); - ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=, - obj); - ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj); - - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) { - dsl_dataset_remove_from_next_clones(ds->ds_prev, - obj, tx); - } - - ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1); - dsl_dataset_phys(ds->ds_prev)->ds_num_children--; - } - - /* - * Destroy the deadlist. Unless it's a clone, the - * deadlist should be empty since the dataset has no snapshots. - * (If it's a clone, it's safe to ignore the deadlist contents - * since they are still referenced by the origin snapshot.) - */ - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_deadlist_obj = 0; - - if (dsl_dataset_remap_deadlist_exists(ds)) - dsl_dataset_destroy_remap_deadlist(ds, tx); - - objset_t *os; - VERIFY0(dmu_objset_from_ds(ds, &os)); - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { - old_synchronous_dataset_destroy(ds, tx); - } else { - /* - * Move the bptree into the pool's list of trees to - * clean up and update space accounting information. - */ - uint64_t used, comp, uncomp; - - zil_destroy_sync(dmu_objset_zil(os), tx); - - if (!spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_ASYNC_DESTROY)) { - dsl_scan_t *scn = dp->dp_scan; - spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, - tx); - dp->dp_bptree_obj = bptree_alloc(mos, tx); - VERIFY0(zap_add(mos, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, - &dp->dp_bptree_obj, tx)); - ASSERT(!scn->scn_async_destroying); - scn->scn_async_destroying = B_TRUE; - } - - used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; - comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; - uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; - - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - dsl_dataset_phys(ds)->ds_unique_bytes == used); - - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - bptree_add(mos, dp->dp_bptree_obj, - &dsl_dataset_phys(ds)->ds_bp, - dsl_dataset_phys(ds)->ds_prev_snap_txg, - used, comp, uncomp, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - -used, -comp, -uncomp, tx); - dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - used, comp, uncomp, tx); - } - - if (ds->ds_prev != NULL) { - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - VERIFY0(zap_remove_int(mos, - dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones, - ds->ds_object, tx)); - } - prevobj = ds->ds_prev->ds_object; - dsl_dataset_rele(ds->ds_prev, ds); - ds->ds_prev = NULL; - } - - /* - * This must be done after the dsl_traverse(), because it will - * re-open the objset. 
- */ - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - - /* Erase the link in the dir */ - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0; - ddobj = ds->ds_dir->dd_object; - ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0); - VERIFY0(zap_destroy(mos, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx)); - - if (ds->ds_bookmarks != 0) { - VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); - spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); - } - - spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - - ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj); - ASSERT0(dsl_dataset_phys(ds)->ds_props_obj); - ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj); - dsl_dir_rele(ds->ds_dir, ds); - ds->ds_dir = NULL; - dmu_object_free_zapified(mos, obj, tx); - - dsl_dir_destroy_sync(ddobj, tx); - - if (rmorigin) { - dsl_dataset_t *prev; - VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); - dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); - dsl_dataset_rele(prev, FTAG); - } -} - -void -dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_head_arg_t *ddha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); - dsl_destroy_head_sync_impl(ds, tx); -#if defined(__FreeBSD__) && defined(_KERNEL) - zvol_remove_minors(dp->dp_spa, ddha->ddha_name); -#endif - dsl_dataset_rele(ds, FTAG); -} - -static void -dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) -{ - dsl_destroy_head_arg_t *ddha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); - - /* Mark it as inconsistent on-disk, in case we crash */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_log_internal_ds(ds, "destroy begin", tx, ""); - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_destroy_head(const char *name) -{ - dsl_destroy_head_arg_t ddha; - int error; - spa_t *spa; - boolean_t isenabled; - -#ifdef _KERNEL - zfs_destroy_unmount_origin(name); -#endif - - error = spa_open(name, &spa, FTAG); - if (error != 0) - return (error); - isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY); - spa_close(spa, FTAG); - - ddha.ddha_name = name; - - if (!isenabled) { - objset_t *os; - - error = dsl_sync_task(name, dsl_destroy_head_check, - dsl_destroy_head_begin_sync, &ddha, - 0, ZFS_SPACE_CHECK_DESTROY); - if (error != 0) - return (error); - - /* - * Head deletion is processed in one txg on old pools; - * remove the objects from open context so that the txg sync - * is not too long. - */ - error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); - if (error == 0) { - uint64_t prev_snap_txg = - dsl_dataset_phys(dmu_objset_ds(os))-> - ds_prev_snap_txg; - for (uint64_t obj = 0; error == 0; - error = dmu_object_next(os, &obj, FALSE, - prev_snap_txg)) - (void) dmu_free_long_object(os, obj); - /* sync out all frees */ - txg_wait_synced(dmu_objset_pool(os), 0); - dmu_objset_disown(os, FTAG); - } - } - - return (dsl_sync_task(name, dsl_destroy_head_check, - dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY)); -} - -/* - * Note, this function is used as the callback for dmu_objset_find(). We - * always return 0 so that we will continue to find and process - * inconsistent datasets, even if we encounter an error trying to - * process one of them. 
- */ -/* ARGSUSED */ -int -dsl_destroy_inconsistent(const char *dsname, void *arg) -{ - objset_t *os; - - if (dmu_objset_hold(dsname, FTAG, &os) == 0) { - boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os)); - - /* - * If the dataset is inconsistent because a resumable receive - * has failed, then do not destroy it. - */ - if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os))) - need_destroy = B_FALSE; - - dmu_objset_rele(os, FTAG); - if (need_destroy) - (void) dsl_destroy_head(dsname); - } - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ /dev/null @@ -1,2184 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek . - * All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014 Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2018, loli10K . All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#endif -#include -#include -#include -#include "zfs_namecheck.h" -#include "zfs_prop.h" - -/* - * Filesystem and Snapshot Limits - * ------------------------------ - * - * These limits are used to restrict the number of filesystems and/or snapshots - * that can be created at a given level in the tree or below. A typical - * use-case is with a delegated dataset where the administrator wants to ensure - * that a user within the zone is not creating too many additional filesystems - * or snapshots, even though they're not exceeding their space quota. - * - * The filesystem and snapshot counts are stored as extensible properties. This - * capability is controlled by a feature flag and must be enabled to be used. - * Once enabled, the feature is not active until the first limit is set. At - * that point, future operations to create/destroy filesystems or snapshots - * will validate and update the counts. - * - * Because the count properties will not exist before the feature is active, - * the counts are updated when a limit is first set on an uninitialized - * dsl_dir node in the tree (The filesystem/snapshot count on a node includes - * all of the nested filesystems/snapshots. Thus, a new leaf node has a - * filesystem count of 0 and a snapshot count of 0. 
Non-existent filesystem and - * snapshot count properties on a node indicate uninitialized counts on that - * node.) When first setting a limit on an uninitialized node, the code starts - * at the filesystem with the new limit and descends into all sub-filesystems - * to add the count properties. - * - * In practice this is lightweight since a limit is typically set when the - * filesystem is created and thus has no children. Once valid, changing the - * limit value won't require a re-traversal since the counts are already valid. - * When recursively fixing the counts, if a node with a limit is encountered - * during the descent, the counts are known to be valid and there is no need to - * descend into that filesystem's children. The counts on filesystems above the - * one with the new limit will still be uninitialized, unless a limit is - * eventually set on one of those filesystems. The counts are always recursively - * updated when a limit is set on a dataset, unless there is already a limit. - * When a new limit value is set on a filesystem with an existing limit, it is - * possible for the new limit to be less than the current count at that level - * since a user who can change the limit is also allowed to exceed the limit. - * - * Once the feature is active, then whenever a filesystem or snapshot is - * created, the code recurses up the tree, validating the new count against the - * limit at each initialized level. In practice, most levels will not have a - * limit set. If there is a limit at any initialized level up the tree, the - * check must pass or the creation will fail. Likewise, when a filesystem or - * snapshot is destroyed, the counts are recursively adjusted all the way up - * the initialized nodes in the tree. Renaming a filesystem into a different point - * in the tree will first validate, then update the counts on each branch up to - * the common ancestor. A receive will also validate the counts and then update - * them. - * - * An exception to the above behavior is that the limit is not enforced if the - * user has permission to modify the limit. This is primarily so that - * recursive snapshots in the global zone always work. We want to prevent a - * denial-of-service in which a lower level delegated dataset could max out its - * limit and thus block recursive snapshots from being taken in the global zone. - * Because of this, it is possible for the snapshot count to be over the limit - * and snapshots taken in the global zone could cause a lower level dataset to - * hit or exceed its limit. The administrator taking the global zone recursive - * snapshot should be aware of this side-effect and behave accordingly. - * For consistency, the filesystem limit is also not enforced if the user can - * modify the limit. - * - * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() - * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in - * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by - * dsl_dir_init_fs_ss_count(). - * - * There is a special case when we receive a filesystem that already exists. In - * this case a temporary clone name of %X is created (see dmu_recv_begin). We - * never update the filesystem counts for temporary clones. - * - * Likewise, we do not update the snapshot counts for temporary snapshots, - * such as those created by zfs diff.
- */ - -extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); - -static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); - -typedef struct ddulrt_arg { - dsl_dir_t *ddulrta_dd; - uint64_t ddlrta_txg; -} ddulrt_arg_t; - -static void -dsl_dir_evict_async(void *dbu) -{ - dsl_dir_t *dd = dbu; - dsl_pool_t *dp = dd->dd_pool; - int t; - - dd->dd_dbuf = NULL; - - for (t = 0; t < TXG_SIZE; t++) { - ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); - ASSERT(dd->dd_tempreserved[t] == 0); - ASSERT(dd->dd_space_towrite[t] == 0); - } - - if (dd->dd_parent) - dsl_dir_async_rele(dd->dd_parent, dd); - - spa_async_close(dd->dd_pool->dp_spa, dd); - - dsl_prop_fini(dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); -} - -int -dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **ddp) -{ - dmu_buf_t *dbuf; - dsl_dir_t *dd; - int err; - - ASSERT(dsl_pool_config_held(dp)); - - err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); - if (err != 0) - return (err); - dd = dmu_buf_get_user(dbuf); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbuf, &doi); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); - ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); - } -#endif - if (dd == NULL) { - dsl_dir_t *winner; - - dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); - dd->dd_object = ddobj; - dd->dd_dbuf = dbuf; - dd->dd_pool = dp; - mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); - dsl_prop_init(dd); - - dsl_dir_snap_cmtime_update(dd); - - if (dsl_dir_phys(dd)->dd_parent_obj) { - err = dsl_dir_hold_obj(dp, - dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, - &dd->dd_parent); - if (err != 0) - goto errout; - if (tail) { -#ifdef ZFS_DEBUG - uint64_t foundobj; - - err = zap_lookup(dp->dp_meta_objset, - dsl_dir_phys(dd->dd_parent)-> - dd_child_dir_zapobj, tail, - sizeof (foundobj), 1, &foundobj); - ASSERT(err || foundobj == ddobj); -#endif - (void) strcpy(dd->dd_myname, tail); - } else { - err = zap_value_search(dp->dp_meta_objset, - dsl_dir_phys(dd->dd_parent)-> - dd_child_dir_zapobj, - ddobj, 0, dd->dd_myname); - } - if (err != 0) - goto errout; - } else { - (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); - } - - if (dsl_dir_is_clone(dd)) { - dmu_buf_t *origin_bonus; - dsl_dataset_phys_t *origin_phys; - - /* - * We can't open the origin dataset, because - * that would require opening this dsl_dir. - * Just look at its phys directly instead. - */ - err = dmu_bonus_hold(dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_origin_obj, FTAG, - &origin_bonus); - if (err != 0) - goto errout; - origin_phys = origin_bonus->db_data; - dd->dd_origin_txg = - origin_phys->ds_creation_txg; - dmu_buf_rele(origin_bonus, FTAG); - } - - dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, - &dd->dd_dbuf); - winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); - if (winner != NULL) { - if (dd->dd_parent) - dsl_dir_rele(dd->dd_parent, dd); - dsl_prop_fini(dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dd = winner; - } else { - spa_open_ref(dp->dp_spa, dd); - } - } - - /* - * The dsl_dir_t has both open-to-close and instantiate-to-evict - * holds on the spa. We need the open-to-close holds because - * otherwise the spa_refcnt wouldn't change when we open a - * dir which the spa also has open, so we could incorrectly - * think it was OK to unload/export/destroy the pool. We need - * the instantiate-to-evict hold because the dsl_dir_t has a - * pointer to the dd_pool, which has a pointer to the spa_t. 
- */ - spa_open_ref(dp->dp_spa, tag); - ASSERT3P(dd->dd_pool, ==, dp); - ASSERT3U(dd->dd_object, ==, ddobj); - ASSERT3P(dd->dd_dbuf, ==, dbuf); - *ddp = dd; - return (0); - -errout: - if (dd->dd_parent) - dsl_dir_rele(dd->dd_parent, dd); - dsl_prop_fini(dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); -} - -void -dsl_dir_rele(dsl_dir_t *dd, void *tag) -{ - dprintf_dd(dd, "%s\n", ""); - spa_close(dd->dd_pool->dp_spa, tag); - dmu_buf_rele(dd->dd_dbuf, tag); -} - -/* - * Remove a reference to the given dsl dir that is being asynchronously - * released. Async releases occur from a taskq performing eviction of - * dsl datasets and dirs. This process is identical to a normal release - * with the exception of using the async API for releasing the reference on - * the spa. - */ -void -dsl_dir_async_rele(dsl_dir_t *dd, void *tag) -{ - dprintf_dd(dd, "%s\n", ""); - spa_async_close(dd->dd_pool->dp_spa, tag); - dmu_buf_rele(dd->dd_dbuf, tag); -} - -/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ -void -dsl_dir_name(dsl_dir_t *dd, char *buf) -{ - if (dd->dd_parent) { - dsl_dir_name(dd->dd_parent, buf); - VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, - ZFS_MAX_DATASET_NAME_LEN); - } else { - buf[0] = '\0'; - } - if (!MUTEX_HELD(&dd->dd_lock)) { - /* - * recursive mutex so that we can use - * dprintf_dd() with dd_lock held - */ - mutex_enter(&dd->dd_lock); - VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - mutex_exit(&dd->dd_lock); - } else { - VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - } -} - -/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ -int -dsl_dir_namelen(dsl_dir_t *dd) -{ - int result = 0; - - if (dd->dd_parent) { - /* parent's name + 1 for the "/" */ - result = dsl_dir_namelen(dd->dd_parent) + 1; - } - - if (!MUTEX_HELD(&dd->dd_lock)) { - /* see dsl_dir_name */ - mutex_enter(&dd->dd_lock); - result += strlen(dd->dd_myname); - mutex_exit(&dd->dd_lock); - } else { - result += strlen(dd->dd_myname); - } - - return (result); -} - -static int -getcomponent(const char *path, char *component, const char **nextp) -{ - char *p; - - if ((path == NULL) || (path[0] == '\0')) - return (SET_ERROR(ENOENT)); - /* This would be a good place to reserve some namespace... */ - p = strpbrk(path, "/@"); - if (p && (p[1] == '/' || p[1] == '@')) { - /* two separators in a row */ - return (SET_ERROR(EINVAL)); - } - if (p == NULL || p == path) { - /* - * if the first thing is an @ or /, it had better be an - * @ and it had better not have any more ats or slashes, - * and it had better have something after the @. - */ - if (p != NULL && - (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) - return (SET_ERROR(EINVAL)); - if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - (void) strcpy(component, path); - p = NULL; - } else if (p[0] == '/') { - if (p - path >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; - p++; - } else if (p[0] == '@') { - /* - * if the next separator is an @, there better not be - * any more slashes. 
- */ - if (strchr(path, '/')) - return (SET_ERROR(EINVAL)); - if (p - path >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; - } else { - panic("invalid p=%p", (void *)p); - } - *nextp = p; - return (0); -} - -/* - * Return the dsl_dir_t, and possibly the last component which couldn't - * be found in *tail. The name must be in the specified dsl_pool_t. This - * thread must hold the dp_config_rwlock for the pool. Returns NULL if the - * path is bogus, or if tail==NULL and we couldn't parse the whole name. - * (*tail)[0] == '@' means that the last component is a snapshot. - */ -int -dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, - dsl_dir_t **ddp, const char **tailp) -{ - char buf[ZFS_MAX_DATASET_NAME_LEN]; - const char *spaname, *next, *nextnext = NULL; - int err; - dsl_dir_t *dd; - uint64_t ddobj; - - err = getcomponent(name, buf, &next); - if (err != 0) - return (err); - - /* Make sure the name is in the specified pool. */ - spaname = spa_name(dp->dp_spa); - if (strcmp(buf, spaname) != 0) - return (SET_ERROR(EXDEV)); - - ASSERT(dsl_pool_config_held(dp)); - - err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); - if (err != 0) { - return (err); - } - - while (next != NULL) { - dsl_dir_t *child_dd; - err = getcomponent(next, buf, &nextnext); - if (err != 0) - break; - ASSERT(next[0] != '\0'); - if (next[0] == '@') - break; - dprintf("looking up %s in obj%lld\n", - buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); - - err = zap_lookup(dp->dp_meta_objset, - dsl_dir_phys(dd)->dd_child_dir_zapobj, - buf, sizeof (ddobj), 1, &ddobj); - if (err != 0) { - if (err == ENOENT) - err = 0; - break; - } - - err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); - if (err != 0) - break; - dsl_dir_rele(dd, tag); - dd = child_dd; - next = nextnext; - } - - if (err != 0) { - dsl_dir_rele(dd, tag); - return (err); - } - - /* - * It's an error if there's more than one component left, or - * tailp==NULL and there's any component left. - */ - if (next != NULL && - (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { - /* bad path name */ - dsl_dir_rele(dd, tag); - dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); - err = SET_ERROR(ENOENT); - } - if (tailp != NULL) - *tailp = next; - *ddp = dd; - return (err); -} - -/* - * If the counts are already initialized for this filesystem and its - * descendants then do nothing, otherwise initialize the counts. - * - * The counts on this filesystem, and those below, may be uninitialized due to - * either the use of a pre-existing pool which did not support the - * filesystem/snapshot limit feature, or one in which the feature had not yet - * been enabled. - * - * Recursively descend the filesystem tree and update the filesystem/snapshot - * counts on each filesystem below, then update the cumulative count on the - * current filesystem. If the filesystem already has a count set on it, - * then we know that its counts, and the counts on the filesystems below it, - * are already correct, so we don't have to update this filesystem. 
- */ -static void -dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) -{ - uint64_t my_fs_cnt = 0; - uint64_t my_ss_cnt = 0; - dsl_pool_t *dp = dd->dd_pool; - objset_t *os = dp->dp_meta_objset; - zap_cursor_t *zc; - zap_attribute_t *za; - dsl_dataset_t *ds; - - ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); - ASSERT(dsl_pool_config_held(dp)); - ASSERT(dmu_tx_is_syncing(tx)); - - dsl_dir_zapify(dd, tx); - - /* - * If the filesystem count has already been initialized then we - * don't need to recurse down any further. - */ - if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) - return; - - zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - - /* Iterate my child dirs */ - for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); - zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { - dsl_dir_t *chld_dd; - uint64_t count; - - VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, - &chld_dd)); - - /* - * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and - * temporary datasets. - */ - if (chld_dd->dd_myname[0] == '$' || - chld_dd->dd_myname[0] == '%') { - dsl_dir_rele(chld_dd, FTAG); - continue; - } - - my_fs_cnt++; /* count this child */ - - dsl_dir_init_fs_ss_count(chld_dd, tx); - - VERIFY0(zap_lookup(os, chld_dd->dd_object, - DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); - my_fs_cnt += count; - VERIFY0(zap_lookup(os, chld_dd->dd_object, - DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); - my_ss_cnt += count; - - dsl_dir_rele(chld_dd, FTAG); - } - zap_cursor_fini(zc); - /* Count my snapshots (we counted children's snapshots above) */ - VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, - dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); - - for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); - zap_cursor_retrieve(zc, za) == 0; - zap_cursor_advance(zc)) { - /* Don't count temporary snapshots */ - if (za->za_name[0] != '%') - my_ss_cnt++; - } - zap_cursor_fini(zc); - - dsl_dataset_rele(ds, FTAG); - - kmem_free(zc, sizeof (zap_cursor_t)); - kmem_free(za, sizeof (zap_attribute_t)); - - /* we're in a sync task, update counts */ - dmu_buf_will_dirty(dd->dd_dbuf, tx); - VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, - sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); - VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, - sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); -} - -static int -dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) -{ - char *ddname = (char *)arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - dsl_dir_t *dd; - int error; - - error = dsl_dataset_hold(dp, ddname, FTAG, &ds); - if (error != 0) - return (error); - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - dd = ds->ds_dir; - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && - dsl_dir_is_zapified(dd) && - zap_contains(dp->dp_meta_objset, dd->dd_object, - DD_FIELD_FILESYSTEM_COUNT) == 0) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EALREADY)); - } - - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) -{ - char *ddname = (char *)arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - spa_t *spa; - - VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); - - spa = dsl_dataset_get_spa(ds); - - if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { - /* - * Since the feature 
was not active and we're now setting a - * limit, increment the feature-active counter so that the - * feature becomes active for the first time. - * - * We are already in a sync task so we can update the MOS. - */ - spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); - } - - /* - * Since we are now setting a non-UINT64_MAX limit on the filesystem, - * we need to ensure the counts are correct. Descend down the tree from - * this point and update all of the counts to be accurate. - */ - dsl_dir_init_fs_ss_count(ds->ds_dir, tx); - - dsl_dataset_rele(ds, FTAG); -} - -/* - * Make sure the feature is enabled and activate it if necessary. - * Since we're setting a limit, ensure the on-disk counts are valid. - * This is only called by the ioctl path when setting a limit value. - * - * We do not need to validate the new limit, since users who can change the - * limit are also allowed to exceed the limit. - */ -int -dsl_dir_activate_fs_ss_limit(const char *ddname) -{ - int error; - - error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, - dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, - ZFS_SPACE_CHECK_RESERVED); - - if (error == EALREADY) - error = 0; - - return (error); -} - -/* - * Used to determine if the filesystem_limit or snapshot_limit should be - * enforced. We allow the limit to be exceeded if the user has permission to - * write the property value. We pass in the creds that we got in the open - * context since we will always be the GZ root in syncing context. We also have - * to handle the case where we are allowed to change the limit on the current - * dataset, but there may be another limit in the tree above. - * - * We can never modify these two properties within a non-global zone. In - * addition, the other checks are modeled on zfs_secpolicy_write_perms. We - * can't use that function since we are already holding the dp_config_rwlock. - * In addition, we already have the dd and dealing with snapshots is simplified - * in this code. 
- */ - -typedef enum { - ENFORCE_ALWAYS, - ENFORCE_NEVER, - ENFORCE_ABOVE -} enforce_res_t; - -static enforce_res_t -dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) -{ - enforce_res_t enforce = ENFORCE_ALWAYS; - uint64_t obj; - dsl_dataset_t *ds; - uint64_t zoned; - - ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || - prop == ZFS_PROP_SNAPSHOT_LIMIT); - -#ifdef _KERNEL -#ifdef __FreeBSD__ - if (jailed(cr)) -#else - if (crgetzoneid(cr) != GLOBAL_ZONEID) -#endif - return (ENFORCE_ALWAYS); - - if (secpolicy_zfs(cr) == 0) - return (ENFORCE_NEVER); -#endif - - if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) - return (ENFORCE_ALWAYS); - - ASSERT(dsl_pool_config_held(dd->dd_pool)); - - if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) - return (ENFORCE_ALWAYS); - - if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { - /* Only root can access zoned fs's from the GZ */ - enforce = ENFORCE_ALWAYS; - } else { - if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) - enforce = ENFORCE_ABOVE; - } - - dsl_dataset_rele(ds, FTAG); - return (enforce); -} - -static void -dsl_dir_update_last_remap_txg_sync(void *varg, dmu_tx_t *tx) -{ - ddulrt_arg_t *arg = varg; - uint64_t last_remap_txg; - dsl_dir_t *dd = arg->ddulrta_dd; - objset_t *mos = dd->dd_pool->dp_meta_objset; - - dsl_dir_zapify(dd, tx); - if (zap_lookup(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (last_remap_txg), 1, &last_remap_txg) != 0 || - last_remap_txg < arg->ddlrta_txg) { - VERIFY0(zap_update(mos, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (arg->ddlrta_txg), 1, &arg->ddlrta_txg, tx)); - } -} - -int -dsl_dir_update_last_remap_txg(dsl_dir_t *dd, uint64_t txg) -{ - ddulrt_arg_t arg; - arg.ddulrta_dd = dd; - arg.ddlrta_txg = txg; - - return (dsl_sync_task(spa_name(dd->dd_pool->dp_spa), - NULL, dsl_dir_update_last_remap_txg_sync, &arg, - 1, ZFS_SPACE_CHECK_RESERVED)); -} - -/* - * Check if adding additional child filesystem(s) would exceed any filesystem - * limits or adding additional snapshot(s) would exceed any snapshot limits. - * The prop argument indicates which limit to check. - * - * Note that all filesystem limits up to the root (or the highest - * initialized) filesystem or the given ancestor must be satisfied. - */ -int -dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, - dsl_dir_t *ancestor, cred_t *cr) -{ - objset_t *os = dd->dd_pool->dp_meta_objset; - uint64_t limit, count; - char *count_prop; - enforce_res_t enforce; - int err = 0; - - ASSERT(dsl_pool_config_held(dd->dd_pool)); - ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || - prop == ZFS_PROP_SNAPSHOT_LIMIT); - - /* - * If we're allowed to change the limit, don't enforce the limit - * e.g. this can happen if a snapshot is taken by an administrative - * user in the global zone (i.e. a recursive snapshot by root). - * However, we must handle the case of delegated permissions where we - * are allowed to change the limit on the current dataset, but there - * is another limit in the tree above. - */ - enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); - if (enforce == ENFORCE_NEVER) - return (0); - - /* - * e.g. if renaming a dataset with no snapshots, count adjustment - * is 0. - */ - if (delta == 0) - return (0); - - if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { - /* - * We don't enforce the limit for temporary snapshots. This is - * indicated by a NULL cred_t argument. 
- */ - if (cr == NULL) - return (0); - - count_prop = DD_FIELD_SNAPSHOT_COUNT; - } else { - count_prop = DD_FIELD_FILESYSTEM_COUNT; - } - - /* - * If an ancestor has been provided, stop checking the limit once we - * hit that dir. We need this during rename so that we don't overcount - * the check once we recurse up to the common ancestor. - */ - if (ancestor == dd) - return (0); - - /* - * If we hit an uninitialized node while recursing up the tree, we can - * stop since we know there is no limit here (or above). The counts are - * not valid on this node and we know we won't touch this node's counts. - */ - if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, - count_prop, sizeof (count), 1, &count) == ENOENT) - return (0); - - err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, - B_FALSE); - if (err != 0) - return (err); - - /* Is there a limit which we've hit? */ - if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) - return (SET_ERROR(EDQUOT)); - - if (dd->dd_parent != NULL) - err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, - ancestor, cr); - - return (err); -} - -/* - * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all - * parents. When a new filesystem/snapshot is created, increment the count on - * all parents, and when a filesystem/snapshot is destroyed, decrement the - * count. - */ -void -dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, - dmu_tx_t *tx) -{ - int err; - objset_t *os = dd->dd_pool->dp_meta_objset; - uint64_t count; - - ASSERT(dsl_pool_config_held(dd->dd_pool)); - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || - strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); - - /* - * When we receive an incremental stream into a filesystem that already - * exists, a temporary clone is created. We don't count this temporary - * clone, whose name begins with a '%'. We also ignore hidden ($FREE, - * $MOS & $ORIGIN) objsets. - */ - if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && - strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) - return; - - /* - * e.g. if renaming a dataset with no snapshots, count adjustment is 0 - */ - if (delta == 0) - return; - - /* - * If we hit an uninitialized node while recursing up the tree, we can - * stop since we know the counts are not valid on this node and we - * know we shouldn't touch this node's counts. An uninitialized count - * on the node indicates that either the feature has not yet been - * activated or there are no limits on this part of the tree. - */ - if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, - prop, sizeof (count), 1, &count)) == ENOENT) - return; - VERIFY0(err); - - count += delta; - /* Use a signed verify to make sure we're not neg. 
*/ - VERIFY3S(count, >=, 0); - - VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, - tx)); - - /* Roll up this additional count into our ancestors */ - if (dd->dd_parent != NULL) - dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); -} - -uint64_t -dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, - dmu_tx_t *tx) -{ - objset_t *mos = dp->dp_meta_objset; - uint64_t ddobj; - dsl_dir_phys_t *ddphys; - dmu_buf_t *dbuf; - - ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, - DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); - if (pds) { - VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, - name, sizeof (uint64_t), 1, &ddobj, tx)); - } else { - /* it's the root dir */ - VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); - } - VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - ddphys = dbuf->db_data; - - ddphys->dd_creation_time = gethrestime_sec(); - if (pds) { - ddphys->dd_parent_obj = pds->dd_object; - - /* update the filesystem counts */ - dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); - } - ddphys->dd_props_zapobj = zap_create(mos, - DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - ddphys->dd_child_dir_zapobj = zap_create(mos, - DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) - ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; - dmu_buf_rele(dbuf, FTAG); - - return (ddobj); -} - -boolean_t -dsl_dir_is_clone(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_origin_obj && - (dd->dd_pool->dp_origin_snap == NULL || - dsl_dir_phys(dd)->dd_origin_obj != - dd->dd_pool->dp_origin_snap->ds_object)); -} - - -uint64_t -dsl_dir_get_used(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_bytes); -} - -uint64_t -dsl_dir_get_compressed(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_compressed_bytes); -} - -uint64_t -dsl_dir_get_quota(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_quota); -} - -uint64_t -dsl_dir_get_reservation(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_reserved); -} - -uint64_t -dsl_dir_get_compressratio(dsl_dir_t *dd) -{ - /* a fixed point number, 100x the ratio */ - return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 
100 : - (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / - dsl_dir_phys(dd)->dd_compressed_bytes)); -} - -uint64_t -dsl_dir_get_logicalused(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_uncompressed_bytes); -} - -uint64_t -dsl_dir_get_usedsnap(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); -} - -uint64_t -dsl_dir_get_usedds(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); -} - -uint64_t -dsl_dir_get_usedrefreserv(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); -} - -uint64_t -dsl_dir_get_usedchild(dsl_dir_t *dd) -{ - return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + - dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); -} - -void -dsl_dir_get_origin(dsl_dir_t *dd, char *buf) -{ - dsl_dataset_t *ds; - VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, - dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); - - dsl_dataset_name(ds, buf); - - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) -{ - if (dsl_dir_is_zapified(dd)) { - objset_t *os = dd->dd_pool->dp_meta_objset; - return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, - sizeof (*count), 1, count)); - } else { - return (ENOENT); - } -} - -int -dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) -{ - if (dsl_dir_is_zapified(dd)) { - objset_t *os = dd->dd_pool->dp_meta_objset; - return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, - sizeof (*count), 1, count)); - } else { - return (ENOENT); - } -} - -int -dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count) -{ - if (dsl_dir_is_zapified(dd)) { - objset_t *os = dd->dd_pool->dp_meta_objset; - return (zap_lookup(os, dd->dd_object, DD_FIELD_LAST_REMAP_TXG, - sizeof (*count), 1, count)); - } else { - return (ENOENT); - } -} - -void -dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) -{ - mutex_enter(&dd->dd_lock); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, - dsl_dir_get_quota(dd)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, - dsl_dir_get_reservation(dd)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, - dsl_dir_get_logicalused(dd)); - if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, - dsl_dir_get_usedsnap(dd)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, - dsl_dir_get_usedds(dd)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, - dsl_dir_get_usedrefreserv(dd)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, - dsl_dir_get_usedchild(dd)); - } - mutex_exit(&dd->dd_lock); - - uint64_t count; - if (dsl_dir_get_filesystem_count(dd, &count) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, - count); - } - if (dsl_dir_get_snapshot_count(dd, &count) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, - count); - } - if (dsl_dir_get_remaptxg(dd, &count) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REMAPTXG, - count); - } - - if (dsl_dir_is_clone(dd)) { - char buf[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dir_get_origin(dd, buf); - dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); - } - -} - -void -dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dd->dd_pool; - - ASSERT(dsl_dir_phys(dd)); - - if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(dd->dd_dbuf, dd); - } -} - -static int64_t -parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) -{ - uint64_t old_accounted = MAX(used, 
dsl_dir_phys(dd)->dd_reserved); - uint64_t new_accounted = - MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); - return (new_accounted - old_accounted); -} - -void -dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - - mutex_enter(&dd->dd_lock); - ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); - dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, - dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); - dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; - mutex_exit(&dd->dd_lock); - - /* release the hold from dsl_dir_dirty */ - dmu_buf_rele(dd->dd_dbuf, dd); -} - -static uint64_t -dsl_dir_space_towrite(dsl_dir_t *dd) -{ - uint64_t space = 0; - - ASSERT(MUTEX_HELD(&dd->dd_lock)); - - for (int i = 0; i < TXG_SIZE; i++) { - space += dd->dd_space_towrite[i & TXG_MASK]; - ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); - } - return (space); -} - -/* - * How much space would dd have available if ancestor had delta applied - * to it? If ondiskonly is set, we're only interested in what's - * on-disk, not estimated pending changes. - */ -uint64_t -dsl_dir_space_available(dsl_dir_t *dd, - dsl_dir_t *ancestor, int64_t delta, int ondiskonly) -{ - uint64_t parentspace, myspace, quota, used; - - /* - * If there are no restrictions otherwise, assume we have - * unlimited space available. - */ - quota = UINT64_MAX; - parentspace = UINT64_MAX; - - if (dd->dd_parent != NULL) { - parentspace = dsl_dir_space_available(dd->dd_parent, - ancestor, delta, ondiskonly); - } - - mutex_enter(&dd->dd_lock); - if (dsl_dir_phys(dd)->dd_quota != 0) - quota = dsl_dir_phys(dd)->dd_quota; - used = dsl_dir_phys(dd)->dd_used_bytes; - if (!ondiskonly) - used += dsl_dir_space_towrite(dd); - - if (dd->dd_parent == NULL) { - uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, - ZFS_SPACE_CHECK_NORMAL); - quota = MIN(quota, poolsize); - } - - if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { - /* - * We have some space reserved, in addition to what our - * parent gave us. - */ - parentspace += dsl_dir_phys(dd)->dd_reserved - used; - } - - if (dd == ancestor) { - ASSERT(delta <= 0); - ASSERT(used >= -delta); - used += delta; - if (parentspace != UINT64_MAX) - parentspace -= delta; - } - - if (used > quota) { - /* over quota */ - myspace = 0; - } else { - /* - * the lesser of the space provided by our parent and - * the space left in our quota - */ - myspace = MIN(parentspace, quota - used); - } - - mutex_exit(&dd->dd_lock); - - return (myspace); -} - -struct tempreserve { - list_node_t tr_node; - dsl_dir_t *tr_ds; - uint64_t tr_size; -}; - -static int -dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, - boolean_t ignorequota, list_t *tr_list, - dmu_tx_t *tx, boolean_t first) -{ - uint64_t txg = tx->tx_txg; - uint64_t quota; - struct tempreserve *tr; - int retval = EDQUOT; - uint64_t ref_rsrv = 0; - - ASSERT3U(txg, !=, 0); - ASSERT3S(asize, >, 0); - - mutex_enter(&dd->dd_lock); - - /* - * Check against the dsl_dir's quota. We don't add in the delta - * when checking for over-quota because they get one free hit. - */ - uint64_t est_inflight = dsl_dir_space_towrite(dd); - for (int i = 0; i < TXG_SIZE; i++) - est_inflight += dd->dd_tempreserved[i]; - uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; - - /* - * On the first iteration, fetch the dataset's used-on-disk and - * refreservation values. Also, if checkrefquota is set, test if - * allocating this space would exceed the dataset's refquota. 
- */ - if (first && tx->tx_objset) { - int error; - dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; - - error = dsl_dataset_check_quota(ds, !netfree, - asize, est_inflight, &used_on_disk, &ref_rsrv); - if (error != 0) { - mutex_exit(&dd->dd_lock); - return (error); - } - } - - /* - * If this transaction will result in a net free of space, - * we want to let it through. - */ - if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) - quota = UINT64_MAX; - else - quota = dsl_dir_phys(dd)->dd_quota; - - /* - * Adjust the quota against the actual pool size at the root - * minus any outstanding deferred frees. - * To ensure that it's possible to remove files from a full - * pool without inducing transient overcommits, we throttle - * netfree transactions against a quota that is slightly larger, - * but still within the pool's allocation slop. In cases where - * we're very close to full, this will allow a steady trickle of - * removes to get through. - */ - uint64_t deferred = 0; - if (dd->dd_parent == NULL) { - uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, - (netfree) ? - ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); - - if (avail < quota) { - quota = avail; - retval = ENOSPC; - } - } - - /* - * If they are requesting more space, and our current estimate - * is over quota, they get to try again unless the actual - * on-disk is over quota and there are no pending changes (which - * may free up space for us). - */ - if (used_on_disk + est_inflight >= quota) { - if (est_inflight > 0 || used_on_disk < quota || - (retval == ENOSPC && used_on_disk < quota + deferred)) - retval = ERESTART; - dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " - "quota=%lluK tr=%lluK err=%d\n", - used_on_disk>>10, est_inflight>>10, - quota>>10, asize>>10, retval); - mutex_exit(&dd->dd_lock); - return (SET_ERROR(retval)); - } - - /* We need to up our estimated delta before dropping dd_lock */ - dd->dd_tempreserved[txg & TXG_MASK] += asize; - - uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, - asize - ref_rsrv); - mutex_exit(&dd->dd_lock); - - tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); - tr->tr_ds = dd; - tr->tr_size = asize; - list_insert_tail(tr_list, tr); - - /* see if it's OK with our parent */ - if (dd->dd_parent != NULL && parent_rsrv != 0) { - boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); - - return (dsl_dir_tempreserve_impl(dd->dd_parent, - parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE)); - } else { - return (0); - } -} - -/* - * Reserve space in this dsl_dir, to be used in this tx's txg. - * After the space has been dirtied (and dsl_dir_willuse_space() - * has been called), the reservation should be canceled, using - * dsl_dir_tempreserve_clear(). 
- */ -int -dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) -{ - int err; - list_t *tr_list; - - if (asize == 0) { - *tr_cookiep = NULL; - return (0); - } - - tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); - list_create(tr_list, sizeof (struct tempreserve), - offsetof(struct tempreserve, tr_node)); - ASSERT3S(asize, >, 0); - - err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); - if (err == 0) { - struct tempreserve *tr; - - tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); - tr->tr_size = lsize; - list_insert_tail(tr_list, tr); - } else { - if (err == EAGAIN) { - /* - * If arc_memory_throttle() detected that pageout - * is running and we are low on memory, we delay new - * non-pageout transactions to give pageout an - * advantage. - * - * It is unfortunate to be delaying while the caller's - * locks are held. - */ - txg_delay(dd->dd_pool, tx->tx_txg, - MSEC2NSEC(10), MSEC2NSEC(10)); - err = SET_ERROR(ERESTART); - } - } - - if (err == 0) { - err = dsl_dir_tempreserve_impl(dd, asize, netfree, - B_FALSE, tr_list, tx, B_TRUE); - } - - if (err != 0) - dsl_dir_tempreserve_clear(tr_list, tx); - else - *tr_cookiep = tr_list; - - return (err); -} - -/* - * Clear a temporary reservation that we previously made with - * dsl_dir_tempreserve_space(). - */ -void -dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) -{ - int txgidx = tx->tx_txg & TXG_MASK; - list_t *tr_list = tr_cookie; - struct tempreserve *tr; - - ASSERT3U(tx->tx_txg, !=, 0); - - if (tr_cookie == NULL) - return; - - while ((tr = list_head(tr_list)) != NULL) { - if (tr->tr_ds) { - mutex_enter(&tr->tr_ds->dd_lock); - ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, - tr->tr_size); - tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; - mutex_exit(&tr->tr_ds->dd_lock); - } else { - arc_tempreserve_clear(tr->tr_size); - } - list_remove(tr_list, tr); - kmem_free(tr, sizeof (struct tempreserve)); - } - - kmem_free(tr_list, sizeof (list_t)); -} - -/* - * This should be called from open context when we think we're going to write - * or free space, for example when dirtying data. Be conservative; it's okay - * to write less space or free more, but we don't want to write more or free - * less than the amount specified. - */ -void -dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) -{ - int64_t parent_space; - uint64_t est_used; - - mutex_enter(&dd->dd_lock); - if (space > 0) - dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - - est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; - parent_space = parent_delta(dd, est_used, space); - mutex_exit(&dd->dd_lock); - - /* Make sure that we clean up dd_space_to* */ - dsl_dir_dirty(dd, tx); - - /* XXX this is potentially expensive and unnecessary... */ - if (parent_space && dd->dd_parent) - dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); -} - -/* call from syncing context when we actually write/free space for this dd */ -void -dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) -{ - int64_t accounted_delta; - - /* - * dsl_dataset_set_refreservation_sync_impl() calls this with - * dd_lock held, so that it can atomically update - * ds->ds_reserved and the dsl_dir accounting, so that - * dsl_dataset_check_quota() can see dataset and dir accounting - * consistently. 
- */ - boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(type < DD_USED_NUM); - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - - if (needlock) - mutex_enter(&dd->dd_lock); - accounted_delta = - parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); - ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); - ASSERT(compressed >= 0 || - dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); - ASSERT(uncompressed >= 0 || - dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); - dsl_dir_phys(dd)->dd_used_bytes += used; - dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; - dsl_dir_phys(dd)->dd_compressed_bytes += compressed; - - if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { - ASSERT(used > 0 || - dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); - dsl_dir_phys(dd)->dd_used_breakdown[type] += used; -#ifdef DEBUG - dd_used_t t; - uint64_t u = 0; - for (t = 0; t < DD_USED_NUM; t++) - u += dsl_dir_phys(dd)->dd_used_breakdown[t]; - ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); -#endif - } - if (needlock) - mutex_exit(&dd->dd_lock); - - if (dd->dd_parent != NULL) { - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, - accounted_delta, compressed, uncompressed, tx); - dsl_dir_transfer_space(dd->dd_parent, - used - accounted_delta, - DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL); - } -} - -void -dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) -{ - ASSERT(tx == NULL || dmu_tx_is_syncing(tx)); - ASSERT(oldtype < DD_USED_NUM); - ASSERT(newtype < DD_USED_NUM); - - if (delta == 0 || - !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) - return; - - if (tx != NULL) - dmu_buf_will_dirty(dd->dd_dbuf, tx); - mutex_enter(&dd->dd_lock); - ASSERT(delta > 0 ? - dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : - dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); - ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); - dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; - dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; - mutex_exit(&dd->dd_lock); -} - -typedef struct dsl_dir_set_qr_arg { - const char *ddsqra_name; - zprop_source_t ddsqra_source; - uint64_t ddsqra_value; -} dsl_dir_set_qr_arg_t; - -static int -dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) -{ - dsl_dir_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - uint64_t towrite, newval; - - error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); - if (error != 0) - return (error); - - error = dsl_prop_predict(ds->ds_dir, "quota", - ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - if (newval == 0) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - mutex_enter(&ds->ds_dir->dd_lock); - /* - * If we are doing the preliminary check in open context, and - * there are pending changes, then don't fail it, since the - * pending changes could under-estimate the amount of space to be - * freed up. 
- */ - towrite = dsl_dir_space_towrite(ds->ds_dir); - if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || - newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { - error = SET_ERROR(ENOSPC); - } - mutex_exit(&ds->ds_dir->dd_lock); - dsl_dataset_rele(ds, FTAG); - return (error); -} - -static void -dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dir_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - uint64_t newval; - - VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - - if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { - dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), - ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, - &ddsqra->ddsqra_value, tx); - - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); - } else { - newval = ddsqra->ddsqra_value; - spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", - zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); - } - - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - mutex_enter(&ds->ds_dir->dd_lock); - dsl_dir_phys(ds->ds_dir)->dd_quota = newval; - mutex_exit(&ds->ds_dir->dd_lock); - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) -{ - dsl_dir_set_qr_arg_t ddsqra; - - ddsqra.ddsqra_name = ddname; - ddsqra.ddsqra_source = source; - ddsqra.ddsqra_value = quota; - - return (dsl_sync_task(ddname, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, &ddsqra, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -int -dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) -{ - dsl_dir_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - dsl_dir_t *dd; - uint64_t newval, used, avail; - int error; - - error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); - if (error != 0) - return (error); - dd = ds->ds_dir; - - /* - * If we are doing the preliminary check in open context, the - * space estimates may be inaccurate. 
- */ - if (!dmu_tx_is_syncing(tx)) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - error = dsl_prop_predict(ds->ds_dir, - zfs_prop_to_name(ZFS_PROP_RESERVATION), - ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - mutex_enter(&dd->dd_lock); - used = dsl_dir_phys(dd)->dd_used_bytes; - mutex_exit(&dd->dd_lock); - - if (dd->dd_parent) { - avail = dsl_dir_space_available(dd->dd_parent, - NULL, 0, FALSE); - } else { - avail = dsl_pool_adjustedsize(dd->dd_pool, - ZFS_SPACE_CHECK_NORMAL) - used; - } - - if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { - uint64_t delta = MAX(used, newval) - - MAX(used, dsl_dir_phys(dd)->dd_reserved); - - if (delta > avail || - (dsl_dir_phys(dd)->dd_quota > 0 && - newval > dsl_dir_phys(dd)->dd_quota)) - error = SET_ERROR(ENOSPC); - } - - dsl_dataset_rele(ds, FTAG); - return (error); -} - -void -dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) -{ - uint64_t used; - int64_t delta; - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - - mutex_enter(&dd->dd_lock); - used = dsl_dir_phys(dd)->dd_used_bytes; - delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); - dsl_dir_phys(dd)->dd_reserved = value; - - if (dd->dd_parent != NULL) { - /* Roll up this additional usage into our ancestors */ - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, - delta, 0, 0, tx); - } - mutex_exit(&dd->dd_lock); -} - -static void -dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dir_set_qr_arg_t *ddsqra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - uint64_t newval; - - VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - - if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { - dsl_prop_set_sync_impl(ds, - zfs_prop_to_name(ZFS_PROP_RESERVATION), - ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, - &ddsqra->ddsqra_value, tx); - - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); - } else { - newval = ddsqra->ddsqra_value; - spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", - zfs_prop_to_name(ZFS_PROP_RESERVATION), - (longlong_t)newval); - } - - dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dir_set_reservation(const char *ddname, zprop_source_t source, - uint64_t reservation) -{ - dsl_dir_set_qr_arg_t ddsqra; - - ddsqra.ddsqra_name = ddname; - ddsqra.ddsqra_source = source; - ddsqra.ddsqra_value = reservation; - - return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, &ddsqra, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -static dsl_dir_t * -closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) -{ - for (; ds1; ds1 = ds1->dd_parent) { - dsl_dir_t *dd; - for (dd = ds2; dd; dd = dd->dd_parent) { - if (ds1 == dd) - return (dd); - } - } - return (NULL); -} - -/* - * If delta is applied to dd, how much of that delta would be applied to - * ancestor? Syncing context only. 
- */ -static int64_t -would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) -{ - if (dd == ancestor) - return (delta); - - mutex_enter(&dd->dd_lock); - delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); - mutex_exit(&dd->dd_lock); - return (would_change(dd->dd_parent, delta, ancestor)); -} - -typedef struct dsl_dir_rename_arg { - const char *ddra_oldname; - const char *ddra_newname; - cred_t *ddra_cred; -} dsl_dir_rename_arg_t; - -typedef struct dsl_valid_rename_arg { - int char_delta; - int nest_delta; -} dsl_valid_rename_arg_t; - -/* ARGSUSED */ -static int -dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - dsl_valid_rename_arg_t *dvra = arg; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - - dsl_dataset_name(ds, namebuf); - - ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - int namelen = strlen(namebuf) + dvra->char_delta; - int depth = get_dataset_depth(namebuf) + dvra->nest_delta; - - if (namelen >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) - return (SET_ERROR(ENAMETOOLONG)); - return (0); -} - -static int -dsl_dir_rename_check(void *arg, dmu_tx_t *tx) -{ - dsl_dir_rename_arg_t *ddra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *dd, *newparent; - dsl_valid_rename_arg_t dvra; - dsl_dataset_t *parentds; - objset_t *parentos; - const char *mynewname; - int error; - - /* target dir should exist */ - error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); - if (error != 0) - return (error); - - /* new parent should exist */ - error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, - &newparent, &mynewname); - if (error != 0) { - dsl_dir_rele(dd, FTAG); - return (error); - } - - /* can't rename to different pool */ - if (dd->dd_pool != newparent->dd_pool) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (SET_ERROR(EXDEV)); - } - - /* new name should not already exist */ - if (mynewname == NULL) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (SET_ERROR(EEXIST)); - } - - /* can't rename below anything but filesystems (eg. 
no ZVOLs) */ - error = dsl_dataset_hold_obj(newparent->dd_pool, - dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds); - if (error != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - error = dmu_objset_from_ds(parentds, &parentos); - if (error != 0) { - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - if (dmu_objset_type(parentos) != DMU_OST_ZFS) { - dsl_dataset_rele(parentds, FTAG); - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - dsl_dataset_rele(parentds, FTAG); - - ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), - <, ZFS_MAX_DATASET_NAME_LEN); - dvra.char_delta = strlen(ddra->ddra_newname) - - strlen(ddra->ddra_oldname); - dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) - - get_dataset_depth(ddra->ddra_oldname); - - /* if the name length is growing, validate child name lengths */ - if (dvra.char_delta > 0 || dvra.nest_delta > 0) { - error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, - &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); - if (error != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - } - - if (dmu_tx_is_syncing(tx)) { - if (spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_FS_SS_LIMIT)) { - /* - * Although this is the check function and we don't - * normally make on-disk changes in check functions, - * we need to do that here. - * - * Ensure this portion of the tree's counts have been - * initialized in case the new parent has limits set. - */ - dsl_dir_init_fs_ss_count(dd, tx); - } - } - - if (newparent != dd->dd_parent) { - /* is there enough space? */ - uint64_t myspace = - MAX(dsl_dir_phys(dd)->dd_used_bytes, - dsl_dir_phys(dd)->dd_reserved); - objset_t *os = dd->dd_pool->dp_meta_objset; - uint64_t fs_cnt = 0; - uint64_t ss_cnt = 0; - - if (dsl_dir_is_zapified(dd)) { - int err; - - err = zap_lookup(os, dd->dd_object, - DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, - &fs_cnt); - if (err != ENOENT && err != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (err); - } - - /* - * have to add 1 for the filesystem itself that we're - * moving - */ - fs_cnt++; - - err = zap_lookup(os, dd->dd_object, - DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, - &ss_cnt); - if (err != ENOENT && err != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (err); - } - } - - /* no rename into our descendant */ - if (closest_common_ancestor(dd, newparent) == dd) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_dir_transfer_possible(dd->dd_parent, - newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); - if (error != 0) { - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (error); - } - } - - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); - return (0); -} - -static void -dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dir_rename_arg_t *ddra = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *dd, *newparent; - const char *mynewname; - int error; - objset_t *mos = dp->dp_meta_objset; - - VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); - VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, - &mynewname)); - - /* Log this before we change the name. 
*/ - spa_history_log_internal_dd(dd, "rename", tx, - "-> %s", ddra->ddra_newname); - - if (newparent != dd->dd_parent) { - objset_t *os = dd->dd_pool->dp_meta_objset; - uint64_t fs_cnt = 0; - uint64_t ss_cnt = 0; - - /* - * We already made sure the dd counts were initialized in the - * check function. - */ - if (spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_FS_SS_LIMIT)) { - VERIFY0(zap_lookup(os, dd->dd_object, - DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, - &fs_cnt)); - /* add 1 for the filesystem itself that we're moving */ - fs_cnt++; - - VERIFY0(zap_lookup(os, dd->dd_object, - DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, - &ss_cnt)); - } - - dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, - DD_FIELD_FILESYSTEM_COUNT, tx); - dsl_fs_ss_count_adjust(newparent, fs_cnt, - DD_FIELD_FILESYSTEM_COUNT, tx); - - dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, - DD_FIELD_SNAPSHOT_COUNT, tx); - dsl_fs_ss_count_adjust(newparent, ss_cnt, - DD_FIELD_SNAPSHOT_COUNT, tx); - - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, - -dsl_dir_phys(dd)->dd_used_bytes, - -dsl_dir_phys(dd)->dd_compressed_bytes, - -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(newparent, DD_USED_CHILD, - dsl_dir_phys(dd)->dd_used_bytes, - dsl_dir_phys(dd)->dd_compressed_bytes, - dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); - - if (dsl_dir_phys(dd)->dd_reserved > - dsl_dir_phys(dd)->dd_used_bytes) { - uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - - dsl_dir_phys(dd)->dd_used_bytes; - - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, - -unused_rsrv, 0, 0, tx); - dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, - unused_rsrv, 0, 0, tx); - } - } - - dmu_buf_will_dirty(dd->dd_dbuf, tx); - - /* remove from old parent zapobj */ - error = zap_remove(mos, - dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, - dd->dd_myname, tx); - ASSERT0(error); - - (void) strcpy(dd->dd_myname, mynewname); - dsl_dir_rele(dd->dd_parent, dd); - dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; - VERIFY0(dsl_dir_hold_obj(dp, - newparent->dd_object, NULL, dd, &dd->dd_parent)); - - /* add to new parent zapobj */ - VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, - dd->dd_myname, 8, 1, &dd->dd_object, tx)); - -#ifdef __FreeBSD__ -#ifdef _KERNEL - zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); - zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname); -#endif -#endif - - dsl_prop_notify_all(dd); - - dsl_dir_rele(newparent, FTAG); - dsl_dir_rele(dd, FTAG); -} - -int -dsl_dir_rename(const char *oldname, const char *newname) -{ - dsl_dir_rename_arg_t ddra; - - ddra.ddra_oldname = oldname; - ddra.ddra_newname = newname; - ddra.ddra_cred = CRED(); - - return (dsl_sync_task(oldname, - dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, - 3, ZFS_SPACE_CHECK_RESERVED)); -} - -int -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, - uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) -{ - dsl_dir_t *ancestor; - int64_t adelta; - uint64_t avail; - int err; - - ancestor = closest_common_ancestor(sdd, tdd); - adelta = would_change(sdd, -space, ancestor); - avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); - if (avail < space) - return (SET_ERROR(ENOSPC)); - - err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, - ancestor, cr); - if (err != 0) - return (err); - err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, - ancestor, cr); - if (err != 0) - return (err); - - return (0); -} - -timestruc_t 
-dsl_dir_snap_cmtime(dsl_dir_t *dd) -{ - timestruc_t t; - - mutex_enter(&dd->dd_lock); - t = dd->dd_snap_cmtime; - mutex_exit(&dd->dd_lock); - - return (t); -} - -void -dsl_dir_snap_cmtime_update(dsl_dir_t *dd) -{ - timestruc_t t; - - gethrestime(&t); - mutex_enter(&dd->dd_lock); - dd->dd_snap_cmtime = t; - mutex_exit(&dd->dd_lock); -} - -void -dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) -{ - objset_t *mos = dd->dd_pool->dp_meta_objset; - dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); -} - -boolean_t -dsl_dir_is_zapified(dsl_dir_t *dd) -{ - dmu_object_info_t doi; - - dmu_object_info_from_db(dd->dd_dbuf, &doi); - return (doi.doi_type == DMU_OTN_ZAP_METADATA); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ /dev/null @@ -1,1372 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__FreeBSD__) && defined(_KERNEL) -#include -#include -#endif - -/* - * ZFS Write Throttle - * ------------------ - * - * ZFS must limit the rate of incoming writes to the rate at which it is able - * to sync data modifications to the backend storage. Throttling by too much - * creates an artificial limit; throttling by too little can only be sustained - * for short periods and would lead to highly lumpy performance. On a per-pool - * basis, ZFS tracks the amount of modified (dirty) data. As operations change - * data, the amount of dirty data increases; as ZFS syncs out data, the amount - * of dirty data decreases. When the amount of dirty data exceeds a - * predetermined threshold further modifications are blocked until the amount - * of dirty data decreases (as data is synced out). - * - * The limit on dirty data is tunable, and should be adjusted according to - * both the IO capacity and available memory of the system. The larger the - * window, the more ZFS is able to aggregate and amortize metadata (and data) - * changes. 
However, memory is a limited resource, and allowing for more dirty - * data comes at the cost of keeping other useful data in memory (for example - * ZFS data cached by the ARC). - * - * Implementation - * - * As buffers are modified dsl_pool_willuse_space() increments both the per- - * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of - * dirty space used; dsl_pool_dirty_space() decrements those values as data - * is synced out from dsl_pool_sync(). While only the poolwide value is - * relevant, the per-txg value is useful for debugging. The tunable - * zfs_dirty_data_max determines the dirty space limit. Once that value is - * exceeded, new writes are halted until space frees up. - * - * The zfs_dirty_data_sync tunable dictates the threshold at which we - * ensure that there is a txg syncing (see the comment in txg.c for a full - * description of transaction group stages). - * - * The IO scheduler uses both the dirty space limit and current amount of - * dirty data as inputs. Those values affect the number of concurrent IOs ZFS - * issues. See the comment in vdev_queue.c for details of the IO scheduler. - * - * The delay is also calculated based on the amount of dirty data. See the - * comment above dmu_tx_delay() for details. - */ - -/* - * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, - * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system. - */ -uint64_t zfs_dirty_data_max; -uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024; -int zfs_dirty_data_max_percent = 10; - -/* - * If there's at least this much dirty data (as a percentage of - * zfs_dirty_data_max), push out a txg. This should be less than - * zfs_vdev_async_write_active_min_dirty_percent. - */ -uint64_t zfs_dirty_data_sync_pct = 20; - -/* - * Once there is this amount of dirty data, the dmu_tx_delay() will kick in - * and delay each transaction. - * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. - */ -int zfs_delay_min_dirty_percent = 60; - -/* - * This controls how quickly the delay approaches infinity. - * Larger values cause it to delay more for a given amount of dirty data. - * Therefore larger values will cause there to be less dirty data for a - * given throughput. - * - * For the smoothest delay, this value should be about 1 billion divided - * by the maximum number of operations per second. This will smoothly - * handle between 10x and 1/10th this number. - * - * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the - * multiply in dmu_tx_delay(). - */ -uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; - -/* - * This determines the number of threads used by the dp_sync_taskq. - */ -int zfs_sync_taskq_batch_pct = 75; - -/* - * These tunables determine the behavior of how zil_itxg_clean() is - * called via zil_clean() in the context of spa_sync(). When an itxg - * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. - * If the dispatch fails, the call to zil_itxg_clean() will occur - * synchronously in the context of spa_sync(), which can negatively - * impact the performance of spa_sync() (e.g. in the case of the itxg - * list having a large number of itxs that needs to be cleaned). 
- * - * Thus, these tunables can be used to manipulate the behavior of the - * taskq used by zil_clean(); they determine the number of taskq entries - * that are pre-populated when the taskq is first created (via the - * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of - * taskq entries that are cached after an on-demand allocation (via the - * "zfs_zil_clean_taskq_maxalloc"). - * - * The idea being, we want to try reasonably hard to ensure there will - * already be a taskq entry pre-allocated by the time that it is needed - * by zil_clean(). This way, we can avoid the possibility of an - * on-demand allocation of a new taskq entry from failing, which would - * result in zil_itxg_clean() being called synchronously from zil_clean() - * (which can adversely affect performance of spa_sync()). - * - * Additionally, the number of threads used by the taskq can be - * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. - */ -int zfs_zil_clean_taskq_nthr_pct = 100; -int zfs_zil_clean_taskq_minalloc = 1024; -int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; - -#if defined(__FreeBSD__) && defined(_KERNEL) - -extern int zfs_vdev_async_write_active_max_dirty_percent; - -SYSCTL_DECL(_vfs_zfs); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN, - &zfs_dirty_data_max, 0, - "The maximum amount of dirty data in bytes after which new writes are " - "halted until space becomes available"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN, - &zfs_dirty_data_max_max, 0, - "The absolute cap on dirty_data_max when auto calculating"); - -static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS); -SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent, - CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), - sysctl_zfs_dirty_data_max_percent, "I", - "The percent of physical memory used to auto calculate dirty_data_max"); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync_pct, CTLFLAG_RWTUN, - &zfs_dirty_data_sync_pct, 0, - "Force a txg if the percent of dirty buffer bytes exceed this value"); - -static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS); -/* No zfs_delay_min_dirty_percent tunable due to limit requirements */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent, - CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int), - sysctl_zfs_delay_min_dirty_percent, "I", - "The limit of outstanding dirty data before transactions are delayed"); - -static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS); -/* No zfs_delay_scale tunable due to limit requirements */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), - sysctl_zfs_delay_scale, "QU", - "Controls how quickly the delay approaches infinity"); - -static int -sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS) -{ - int val, err; - - val = zfs_dirty_data_max_percent; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < 0 || val > 100) - return (EINVAL); - - zfs_dirty_data_max_percent = val; - - return (0); -} - -static int -sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS) -{ - int val, err; - - val = zfs_delay_min_dirty_percent; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < zfs_vdev_async_write_active_max_dirty_percent) - return (EINVAL); - - zfs_delay_min_dirty_percent = val; - - return (0); -} - -static int -sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS) -{ - 
uint64_t val; - int err; - - val = zfs_delay_scale; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val > UINT64_MAX / zfs_dirty_data_max) - return (EINVAL); - - zfs_delay_scale = val; - - return (0); -} -#endif - -int -dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) -{ - uint64_t obj; - int err; - - err = zap_lookup(dp->dp_meta_objset, - dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, - name, sizeof (obj), 1, &obj); - if (err) - return (err); - - return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); -} - -static dsl_pool_t * -dsl_pool_open_impl(spa_t *spa, uint64_t txg) -{ - dsl_pool_t *dp; - blkptr_t *bp = spa_get_rootblkptr(spa); - - dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); - dp->dp_spa = spa; - dp->dp_meta_rootbp = *bp; - rrw_init(&dp->dp_config_rwlock, B_TRUE); - txg_init(dp, txg); - mmp_init(spa); - - txg_list_create(&dp->dp_dirty_datasets, spa, - offsetof(dsl_dataset_t, ds_dirty_link)); - txg_list_create(&dp->dp_dirty_zilogs, spa, - offsetof(zilog_t, zl_dirty_link)); - txg_list_create(&dp->dp_dirty_dirs, spa, - offsetof(dsl_dir_t, dd_dirty_link)); - txg_list_create(&dp->dp_sync_tasks, spa, - offsetof(dsl_sync_task_t, dst_node)); - txg_list_create(&dp->dp_early_sync_tasks, spa, - offsetof(dsl_sync_task_t, dst_node)); - - dp->dp_sync_taskq = taskq_create("dp_sync_taskq", - zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, - TASKQ_THREADS_CPU_PCT); - - dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", - zfs_zil_clean_taskq_nthr_pct, minclsyspri, - zfs_zil_clean_taskq_minalloc, - zfs_zil_clean_taskq_maxalloc, - TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); - - mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); - - dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, - 1, 4, 0); - - return (dp); -} - -int -dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) -{ - int err; - dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); - - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, - &dp->dp_meta_objset); - if (err != 0) - dsl_pool_close(dp); - else - *dpp = dp; - - return (err); -} - -int -dsl_pool_open(dsl_pool_t *dp) -{ - int err; - dsl_dir_t *dd; - dsl_dataset_t *ds; - uint64_t obj; - - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, - &dp->dp_root_dir_obj); - if (err) - goto out; - - err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir); - if (err) - goto out; - - err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); - if (err) - goto out; - - if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { - err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); - if (err) - goto out; - err = dsl_dataset_hold_obj(dp, - dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); - if (err == 0) { - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, - &dp->dp_origin_snap); - dsl_dataset_rele(ds, FTAG); - } - dsl_dir_rele(dd, dp); - if (err) - goto out; - } - - if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { - err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, - &dp->dp_free_dir); - if (err) - goto out; - - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); - if (err) - goto out; - VERIFY0(bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); - } - - if 
(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj); - if (err == 0) { - VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, - dp->dp_meta_objset, obj)); - } else if (err == ENOENT) { - /* - * We might not have created the remap bpobj yet. - */ - err = 0; - } else { - goto out; - } - } - - /* - * Note: errors ignored, because the these special dirs, used for - * space accounting, are only created on demand. - */ - (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, - &dp->dp_leak_dir); - - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, - &dp->dp_bptree_obj); - if (err != 0) - goto out; - } - - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, - &dp->dp_empty_bpobj); - if (err != 0) - goto out; - } - - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, - &dp->dp_tmp_userrefs_obj); - if (err == ENOENT) - err = 0; - if (err) - goto out; - - err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); - -out: - rrw_exit(&dp->dp_config_rwlock, FTAG); - return (err); -} - -void -dsl_pool_close(dsl_pool_t *dp) -{ - /* - * Drop our references from dsl_pool_open(). - * - * Since we held the origin_snap from "syncing" context (which - * includes pool-opening context), it actually only got a "ref" - * and not a hold, so just drop that here. - */ - if (dp->dp_origin_snap != NULL) - dsl_dataset_rele(dp->dp_origin_snap, dp); - if (dp->dp_mos_dir != NULL) - dsl_dir_rele(dp->dp_mos_dir, dp); - if (dp->dp_free_dir != NULL) - dsl_dir_rele(dp->dp_free_dir, dp); - if (dp->dp_leak_dir != NULL) - dsl_dir_rele(dp->dp_leak_dir, dp); - if (dp->dp_root_dir != NULL) - dsl_dir_rele(dp->dp_root_dir, dp); - - bpobj_close(&dp->dp_free_bpobj); - bpobj_close(&dp->dp_obsolete_bpobj); - - /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ - if (dp->dp_meta_objset != NULL) - dmu_objset_evict(dp->dp_meta_objset); - - txg_list_destroy(&dp->dp_dirty_datasets); - txg_list_destroy(&dp->dp_dirty_zilogs); - txg_list_destroy(&dp->dp_sync_tasks); - txg_list_destroy(&dp->dp_early_sync_tasks); - txg_list_destroy(&dp->dp_dirty_dirs); - - taskq_destroy(dp->dp_zil_clean_taskq); - taskq_destroy(dp->dp_sync_taskq); - - /* - * We can't set retry to TRUE since we're explicitly specifying - * a spa to flush. This is good enough; any missed buffers for - * this spa won't cause trouble, and they'll eventually fall - * out of the ARC just like any other unused buffer. - */ - arc_flush(dp->dp_spa, FALSE); - - mmp_fini(dp->dp_spa); - txg_fini(dp); - dsl_scan_fini(dp); - dmu_buf_user_evict_wait(); - - rrw_destroy(&dp->dp_config_rwlock); - mutex_destroy(&dp->dp_lock); - taskq_destroy(dp->dp_vnrele_taskq); - if (dp->dp_blkstats != NULL) { - mutex_destroy(&dp->dp_blkstats->zab_lock); - kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); - } - kmem_free(dp, sizeof (dsl_pool_t)); -} - -void -dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) -{ - uint64_t obj; - /* - * Currently, we only create the obsolete_bpobj where there are - * indirect vdevs with referenced mappings. 
- */ - ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL)); - /* create and open the obsolete_bpobj */ - obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); - VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); - spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); -} - -void -dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) -{ - spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - VERIFY0(zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_OBSOLETE_BPOBJ, tx)); - bpobj_free(dp->dp_meta_objset, - dp->dp_obsolete_bpobj.bpo_object, tx); - bpobj_close(&dp->dp_obsolete_bpobj); -} - -dsl_pool_t * -dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) -{ - int err; - dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); - dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); - dsl_dataset_t *ds; - uint64_t obj; - - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - - /* create and open the MOS (meta-objset) */ - dp->dp_meta_objset = dmu_objset_create_impl(spa, - NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); - - /* create the pool directory */ - err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); - ASSERT0(err); - - /* Initialize scan structures */ - VERIFY0(dsl_scan_init(dp, txg)); - - /* create and open the root dir */ - dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); - VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir)); - - /* create and open the meta-objset dir */ - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); - VERIFY0(dsl_pool_open_special_dir(dp, - MOS_DIR_NAME, &dp->dp_mos_dir)); - - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { - /* create and open the free dir */ - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, - FREE_DIR_NAME, tx); - VERIFY0(dsl_pool_open_special_dir(dp, - FREE_DIR_NAME, &dp->dp_free_dir)); - - /* create and open the free_bplist */ - obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); - VERIFY0(bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); - } - - if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) - dsl_pool_create_origin(dp, tx); - - /* create the root dataset */ - obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); - - /* create the root objset */ - VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); -#ifdef _KERNEL - { - objset_t *os; - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - os = dmu_objset_create_impl(dp->dp_spa, ds, - dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - zfs_create_fs(os, kcred, zplprops, tx); - } -#endif - dsl_dataset_rele(ds, FTAG); - - dmu_tx_commit(tx); - - rrw_exit(&dp->dp_config_rwlock, FTAG); - - return (dp); -} - -/* - * Account for the meta-objset space in its placeholder dsl_dir. 
- */ -void -dsl_pool_mos_diduse_space(dsl_pool_t *dp, - int64_t used, int64_t comp, int64_t uncomp) -{ - ASSERT3U(comp, ==, uncomp); /* it's all metadata */ - mutex_enter(&dp->dp_lock); - dp->dp_mos_used_delta += used; - dp->dp_mos_compressed_delta += comp; - dp->dp_mos_uncompressed_delta += uncomp; - mutex_exit(&dp->dp_lock); -} - -static void -dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) -{ - zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dmu_objset_sync(dp->dp_meta_objset, zio, tx); - VERIFY0(zio_wait(zio)); - dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); -} - -static void -dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) -{ - ASSERT(MUTEX_HELD(&dp->dp_lock)); - - if (delta < 0) - ASSERT3U(-delta, <=, dp->dp_dirty_total); - - dp->dp_dirty_total += delta; - - /* - * Note: we signal even when increasing dp_dirty_total. - * This ensures forward progress -- each thread wakes the next waiter. - */ - if (dp->dp_dirty_total < zfs_dirty_data_max) - cv_signal(&dp->dp_spaceavail_cv); -} - -static boolean_t -dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) -{ - spa_t *spa = dp->dp_spa; - vdev_t *rvd = spa->spa_root_vdev; - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - txg_list_t *tl = &vd->vdev_ms_list; - metaslab_t *ms; - - for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; - ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { - VERIFY(range_tree_is_empty(ms->ms_freeing)); - VERIFY(range_tree_is_empty(ms->ms_checkpointing)); - } - } - - return (B_TRUE); -} - -void -dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) -{ - zio_t *zio; - dmu_tx_t *tx; - dsl_dir_t *dd; - dsl_dataset_t *ds; - objset_t *mos = dp->dp_meta_objset; - list_t synced_datasets; - - list_create(&synced_datasets, sizeof (dsl_dataset_t), - offsetof(dsl_dataset_t, ds_synced_link)); - - tx = dmu_tx_create_assigned(dp, txg); - - /* - * Run all early sync tasks before writing out any dirty blocks. - * For more info on early sync tasks see block comment in - * dsl_early_sync_task(). - */ - if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { - dsl_sync_task_t *dst; - - ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); - while ((dst = - txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { - ASSERT(dsl_early_sync_task_verify(dp, txg)); - dsl_sync_task_sync(dst, tx); - } - ASSERT(dsl_early_sync_task_verify(dp, txg)); - } - - /* - * Write out all dirty blocks of dirty datasets. - */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { - /* - * We must not sync any non-MOS datasets twice, because - * we may have taken a snapshot of them. However, we - * may sync newly-created datasets on pass 2. - */ - ASSERT(!list_link_active(&ds->ds_synced_link)); - list_insert_tail(&synced_datasets, ds); - dsl_dataset_sync(ds, zio, tx); - } - VERIFY0(zio_wait(zio)); - - /* - * We have written all of the accounted dirty data, so our - * dp_space_towrite should now be zero. However, some seldom-used - * code paths do not adhere to this (e.g. dbuf_undirty(), also - * rounding error in dbuf_write_physdone). - * Shore up the accounting of any dirtied space now. 
- */ - dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); - - /* - * Update the long range free counter after - * we're done syncing user data - */ - mutex_enter(&dp->dp_lock); - ASSERT(spa_sync_pass(dp->dp_spa) == 1 || - dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); - dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; - mutex_exit(&dp->dp_lock); - - /* - * After the data blocks have been written (ensured by the zio_wait() - * above), update the user/group space accounting. This happens - * in tasks dispatched to dp_sync_taskq, so wait for them before - * continuing. - */ - for (ds = list_head(&synced_datasets); ds != NULL; - ds = list_next(&synced_datasets, ds)) { - dmu_objset_do_userquota_updates(ds->ds_objset, tx); - } - taskq_wait(dp->dp_sync_taskq); - - /* - * Sync the datasets again to push out the changes due to - * userspace updates. This must be done before we process the - * sync tasks, so that any snapshots will have the correct - * user accounting information (and we won't get confused - * about which blocks are part of the snapshot). - */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { - ASSERT(list_link_active(&ds->ds_synced_link)); - dmu_buf_rele(ds->ds_dbuf, ds); - dsl_dataset_sync(ds, zio, tx); - } - VERIFY0(zio_wait(zio)); - - /* - * Now that the datasets have been completely synced, we can - * clean up our in-memory structures accumulated while syncing: - * - * - move dead blocks from the pending deadlist to the on-disk deadlist - * - release hold from dsl_dataset_dirty() - */ - while ((ds = list_remove_head(&synced_datasets)) != NULL) { - dsl_dataset_sync_done(ds, tx); - } - while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { - dsl_dir_sync(dd, tx); - } - - /* - * The MOS's space is accounted for in the pool/$MOS - * (dp_mos_dir). We can't modify the mos while we're syncing - * it, so we remember the deltas and apply them here. - */ - if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || - dp->dp_mos_uncompressed_delta != 0) { - dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, - dp->dp_mos_used_delta, - dp->dp_mos_compressed_delta, - dp->dp_mos_uncompressed_delta, tx); - dp->dp_mos_used_delta = 0; - dp->dp_mos_compressed_delta = 0; - dp->dp_mos_uncompressed_delta = 0; - } - - if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) { - dsl_pool_sync_mos(dp, tx); - } - - /* - * If we modify a dataset in the same txg that we want to destroy it, - * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. - * dsl_dir_destroy_check() will fail if there are unexpected holds. - * Therefore, we want to sync the MOS (thus syncing the dd_dbuf - * and clearing the hold on it) before we process the sync_tasks. - * The MOS data dirtied by the sync_tasks will be synced on the next - * pass. - */ - if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { - dsl_sync_task_t *dst; - /* - * No more sync tasks should have been added while we - * were syncing. 
- */ - ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); - while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) - dsl_sync_task_sync(dst, tx); - } - - dmu_tx_commit(tx); - - DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); -} - -void -dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) -{ - zilog_t *zilog; - - while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) { - dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - /* - * We don't remove the zilog from the dp_dirty_zilogs - * list until after we've cleaned it. This ensures that - * callers of zilog_is_dirty() receive an accurate - * answer when they are racing with the spa sync thread. - */ - zil_clean(zilog, txg); - (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg); - ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); - dmu_buf_rele(ds->ds_dbuf, zilog); - } - ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); -} - -/* - * TRUE if the current thread is the tx_sync_thread or if we - * are being called from SPA context during pool initialization. - */ -int -dsl_pool_sync_context(dsl_pool_t *dp) -{ - return (curthread == dp->dp_tx.tx_sync_thread || - spa_is_initializing(dp->dp_spa) || - taskq_member(dp->dp_sync_taskq, curthread)); -} - -/* - * This function returns the amount of allocatable space in the pool - * minus whatever space is currently reserved by ZFS for specific - * purposes. Specifically: - * - * 1] Any reserved SLOP space - * 2] Any space used by the checkpoint - * 3] Any space used for deferred frees - * - * The latter 2 are especially important because they are needed to - * rectify the SPA's and DMU's different understanding of how much space - * is used. Now the DMU is aware of that extra space tracked by the SPA - * without having to maintain a separate special dir (e.g similar to - * $MOS, $FREEING, and $LEAKED). - * - * Note: By deferred frees here, we mean the frees that were deferred - * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the - * segments placed in ms_defer trees during metaslab_sync_done(). - */ -uint64_t -dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy) -{ - spa_t *spa = dp->dp_spa; - uint64_t space, resv, adjustedsize; - uint64_t spa_deferred_frees = - spa->spa_deferred_bpobj.bpo_phys->bpo_bytes; - - space = spa_get_dspace(spa) - - spa_get_checkpoint_space(spa) - spa_deferred_frees; - resv = spa_get_slop_space(spa); - - switch (slop_policy) { - case ZFS_SPACE_CHECK_NORMAL: - break; - case ZFS_SPACE_CHECK_RESERVED: - resv >>= 1; - break; - case ZFS_SPACE_CHECK_EXTRA_RESERVED: - resv >>= 2; - break; - case ZFS_SPACE_CHECK_NONE: - resv = 0; - break; - default: - panic("invalid slop policy value: %d", slop_policy); - break; - } - adjustedsize = (space >= resv) ? (space - resv) : 0; - - return (adjustedsize); -} - -uint64_t -dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) -{ - uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy); - uint64_t deferred = - metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); - uint64_t quota = (poolsize >= deferred) ? 
(poolsize - deferred) : 0; - return (quota); -} - -boolean_t -dsl_pool_need_dirty_delay(dsl_pool_t *dp) -{ - uint64_t delay_min_bytes = - zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100; - boolean_t rv; - - mutex_enter(&dp->dp_lock); - if (dp->dp_dirty_total > dirty_min_bytes) - txg_kick(dp); - rv = (dp->dp_dirty_total > delay_min_bytes); - mutex_exit(&dp->dp_lock); - return (rv); -} - -void -dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) -{ - if (space > 0) { - mutex_enter(&dp->dp_lock); - dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; - dsl_pool_dirty_delta(dp, space); - mutex_exit(&dp->dp_lock); - } -} - -void -dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) -{ - ASSERT3S(space, >=, 0); - if (space == 0) - return; - mutex_enter(&dp->dp_lock); - if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { - /* XXX writing something we didn't dirty? */ - space = dp->dp_dirty_pertxg[txg & TXG_MASK]; - } - ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); - dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; - ASSERT3U(dp->dp_dirty_total, >=, space); - dsl_pool_dirty_delta(dp, -space); - mutex_exit(&dp->dp_lock); -} - -/* ARGSUSED */ -static int -upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) -{ - dmu_tx_t *tx = arg; - dsl_dataset_t *ds, *prev = NULL; - int err; - - err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) - return (err); - - while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) - break; - dsl_dataset_rele(ds, FTAG); - ds = prev; - prev = NULL; - } - - if (prev == NULL) { - prev = dp->dp_origin_snap; - - /* - * The $ORIGIN can't have any data, or the accounting - * will be wrong. 
- */ - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - /* The origin doesn't get attached to itself */ - if (ds->ds_object == prev->ds_object) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; - dsl_dataset_phys(ds)->ds_prev_snap_txg = - dsl_dataset_phys(prev)->ds_creation_txg; - - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; - - dmu_buf_will_dirty(prev->ds_dbuf, tx); - dsl_dataset_phys(prev)->ds_num_children++; - - if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { - ASSERT(ds->ds_prev == NULL); - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, - ds, &ds->ds_prev)); - } - } - - ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); - ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); - - if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { - dmu_buf_will_dirty(prev->ds_dbuf, tx); - dsl_dataset_phys(prev)->ds_next_clones_obj = - zap_create(dp->dp_meta_objset, - DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); - } - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); - - dsl_dataset_rele(ds, FTAG); - if (prev != dp->dp_origin_snap) - dsl_dataset_rele(prev, FTAG); - return (0); -} - -void -dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dp->dp_origin_snap != NULL); - - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, - tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); -} - -/* ARGSUSED */ -static int -upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - dmu_tx_t *tx = arg; - objset_t *mos = dp->dp_meta_objset; - - if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { - dsl_dataset_t *origin; - - VERIFY0(dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); - - if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { - dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); - dsl_dir_phys(origin->ds_dir)->dd_clones = - zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, - 0, tx); - } - - VERIFY0(zap_add_int(dp->dp_meta_objset, - dsl_dir_phys(origin->ds_dir)->dd_clones, - ds->ds_object, tx)); - - dsl_dataset_rele(origin, FTAG); - } - return (0); -} - -void -dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - uint64_t obj; - - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); - VERIFY0(dsl_pool_open_special_dir(dp, - FREE_DIR_NAME, &dp->dp_free_dir)); - - /* - * We can't use bpobj_alloc(), because spa_version() still - * returns the old version, and we need a new-version bpobj with - * subobj support. So call dmu_object_alloc() directly. 
- */ - obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, - SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); - VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); - VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); - - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); -} - -void -dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) -{ - uint64_t dsobj; - dsl_dataset_t *ds; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dp->dp_origin_snap == NULL); - ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); - - /* create the origin dir, ds, & snap-ds */ - dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, - NULL, 0, kcred, tx); - VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); - VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, - dp, &dp->dp_origin_snap)); - dsl_dataset_rele(ds, FTAG); -} - -taskq_t * -dsl_pool_vnrele_taskq(dsl_pool_t *dp) -{ - return (dp->dp_vnrele_taskq); -} - -/* - * Walk through the pool-wide zap object of temporary snapshot user holds - * and release them. - */ -void -dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) -{ - zap_attribute_t za; - zap_cursor_t zc; - objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj = dp->dp_tmp_userrefs_obj; - nvlist_t *holds; - - if (zapobj == 0) - return; - ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - - holds = fnvlist_alloc(); - - for (zap_cursor_init(&zc, mos, zapobj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - char *htag; - nvlist_t *tags; - - htag = strchr(za.za_name, '-'); - *htag = '\0'; - ++htag; - if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { - tags = fnvlist_alloc(); - fnvlist_add_boolean(tags, htag); - fnvlist_add_nvlist(holds, za.za_name, tags); - fnvlist_free(tags); - } else { - fnvlist_add_boolean(tags, htag); - } - } - dsl_dataset_user_release_tmp(dp, holds); - fnvlist_free(holds); - zap_cursor_fini(&zc); -} - -/* - * Create the pool-wide zap object for storing temporary snapshot holds. - */ -void -dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) -{ - objset_t *mos = dp->dp_meta_objset; - - ASSERT(dp->dp_tmp_userrefs_obj == 0); - ASSERT(dmu_tx_is_syncing(tx)); - - dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); -} - -static int -dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) -{ - objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj = dp->dp_tmp_userrefs_obj; - char *name; - int error; - - ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - ASSERT(dmu_tx_is_syncing(tx)); - - /* - * If the pool was created prior to SPA_VERSION_USERREFS, the - * zap object for temporary holds might not exist yet. - */ - if (zapobj == 0) { - if (holding) { - dsl_pool_user_hold_create_obj(dp, tx); - zapobj = dp->dp_tmp_userrefs_obj; - } else { - return (SET_ERROR(ENOENT)); - } - } - - name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); - if (holding) - error = zap_add(mos, zapobj, name, 8, 1, &now, tx); - else - error = zap_remove(mos, zapobj, name, tx); - strfree(name); - - return (error); -} - -/* - * Add a temporary hold for the given dataset object and tag. 
- */ -int -dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - uint64_t now, dmu_tx_t *tx) -{ - return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); -} - -/* - * Release a temporary hold for the given dataset object and tag. - */ -int -dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - dmu_tx_t *tx) -{ - return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE)); -} - -/* - * DSL Pool Configuration Lock - * - * The dp_config_rwlock protects against changes to DSL state (e.g. dataset - * creation / destruction / rename / property setting). It must be held for - * read to hold a dataset or dsl_dir. I.e. you must call - * dsl_pool_config_enter() or dsl_pool_hold() before calling - * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock - * must be held continuously until all datasets and dsl_dirs are released. - * - * The only exception to this rule is that if a "long hold" is placed on - * a dataset, then the dp_config_rwlock may be dropped while the dataset - * is still held. The long hold will prevent the dataset from being - * destroyed -- the destroy will fail with EBUSY. A long hold can be - * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset - * (by calling dsl_{dataset,objset}_{try}own{_obj}). - * - * Legitimate long-holders (including owners) should be long-running, cancelable - * tasks that should cause "zfs destroy" to fail. This includes DMU - * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), - * "zfs send", and "zfs diff". There are several other long-holders whose - * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). - * - * The usual formula for long-holding would be: - * dsl_pool_hold() - * dsl_dataset_hold() - * ... perform checks ... - * dsl_dataset_long_hold() - * dsl_pool_rele() - * ... perform long-running task ... - * dsl_dataset_long_rele() - * dsl_dataset_rele() - * - * Note that when the long hold is released, the dataset is still held but - * the pool is not held. The dataset may change arbitrarily during this time - * (e.g. it could be destroyed). Therefore you shouldn't do anything to the - * dataset except release it. - * - * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only - * or modifying operations. - * - * Modifying operations should generally use dsl_sync_task(). The synctask - * infrastructure enforces proper locking strategy with respect to the - * dp_config_rwlock. See the comment above dsl_sync_task() for details. - * - * Read-only operations will manually hold the pool, then the dataset, obtain - * information from the dataset, then release the pool and dataset. - * dmu_objset_{hold,rele}() are convenience routines that also do the pool - * hold/rele. - */ - -int -dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) -{ - spa_t *spa; - int error; - - error = spa_open(name, &spa, tag); - if (error == 0) { - *dp = spa_get_dsl(spa); - dsl_pool_config_enter(*dp, tag); - } - return (error); -} - -void -dsl_pool_rele(dsl_pool_t *dp, void *tag) -{ - dsl_pool_config_exit(dp, tag); - spa_close(dp->dp_spa, tag); -} - -void -dsl_pool_config_enter(dsl_pool_t *dp, void *tag) -{ - /* - * We use a "reentrant" reader-writer lock, but not reentrantly. - * - * The rrwlock can (with the track_all flag) track all reading threads, - * which is very useful for debugging which code path failed to release - * the lock, and for verifying that the *current* thread does hold - * the lock. 
- * - * (Unlike a rwlock, which knows that N threads hold it for - * read, but not *which* threads, so rw_held(RW_READER) returns TRUE - * if any thread holds it for read, even if this thread doesn't). - */ - ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); - rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); -} - -void -dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag) -{ - ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); - rrw_enter_read_prio(&dp->dp_config_rwlock, tag); -} - -void -dsl_pool_config_exit(dsl_pool_t *dp, void *tag) -{ - rrw_exit(&dp->dp_config_rwlock, tag); -} - -boolean_t -dsl_pool_config_held(dsl_pool_t *dp) -{ - return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); -} - -boolean_t -dsl_pool_config_held_writer(dsl_pool_t *dp) -{ - return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c +++ /dev/null @@ -1,1211 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright 2015, Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_prop.h" - -#define ZPROP_INHERIT_SUFFIX "$inherit" -#define ZPROP_RECVD_SUFFIX "$recvd" - -static int -dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) -{ - /* - * The setonce properties are read-only, BUT they still - * have a default value that can be used as the initial - * value. 
- */ - if (prop == ZPROP_INVAL || - (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) - return (SET_ERROR(ENOENT)); - - if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { - if (zfs_prop_default_string(prop) == NULL) - return (SET_ERROR(ENOENT)); - if (intsz != 1) - return (SET_ERROR(EOVERFLOW)); - (void) strncpy(buf, zfs_prop_default_string(prop), - numints); - } else { - if (intsz != 8 || numints < 1) - return (SET_ERROR(EOVERFLOW)); - - *(uint64_t *)buf = zfs_prop_default_numeric(prop); - } - - return (0); -} - -int -dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) -{ - int err = ENOENT; - dsl_dir_t *target = dd; - objset_t *mos = dd->dd_pool->dp_meta_objset; - zfs_prop_t prop; - boolean_t inheritable; - boolean_t inheriting = B_FALSE; - char *inheritstr; - char *recvdstr; - - ASSERT(dsl_pool_config_held(dd->dd_pool)); - - if (setpoint) - setpoint[0] = '\0'; - - prop = zfs_name_to_prop(propname); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); - inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); - recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); - - /* - * Note: dd may become NULL, therefore we shouldn't dereference it - * after this loop. - */ - for (; dd != NULL; dd = dd->dd_parent) { - if (dd != target || snapshot) { - if (!inheritable) - break; - inheriting = B_TRUE; - } - - /* Check for a local value. */ - err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, - propname, intsz, numints, buf); - if (err != ENOENT) { - if (setpoint != NULL && err == 0) - dsl_dir_name(dd, setpoint); - break; - } - - /* - * Skip the check for a received value if there is an explicit - * inheritance entry. - */ - err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, - inheritstr); - if (err != 0 && err != ENOENT) - break; - - if (err == ENOENT) { - /* Check for a received value. */ - err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, - recvdstr, intsz, numints, buf); - if (err != ENOENT) { - if (setpoint != NULL && err == 0) { - if (inheriting) { - dsl_dir_name(dd, setpoint); - } else { - (void) strcpy(setpoint, - ZPROP_SOURCE_VAL_RECVD); - } - } - break; - } - } - - /* - * If we found an explicit inheritance entry, err is zero even - * though we haven't yet found the value, so reinitializing err - * at the end of the loop (instead of at the beginning) ensures - * that err has a valid post-loop value. - */ - err = SET_ERROR(ENOENT); - } - - if (err == ENOENT) - err = dodefault(prop, intsz, numints, buf); - - strfree(inheritstr); - strfree(recvdstr); - - return (err); -} - -int -dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, - int intsz, int numints, void *buf, char *setpoint) -{ - zfs_prop_t prop = zfs_name_to_prop(propname); - boolean_t inheritable; - uint64_t zapobj; - - ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); - zapobj = dsl_dataset_phys(ds)->ds_props_obj; - - if (zapobj != 0) { - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - int err; - - ASSERT(ds->ds_is_snapshot); - - /* Check for a local value. */ - err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); - if (err != ENOENT) { - if (setpoint != NULL && err == 0) - dsl_dataset_name(ds, setpoint); - return (err); - } - - /* - * Skip the check for a received value if there is an explicit - * inheritance entry. 
- */ - if (inheritable) { - char *inheritstr = kmem_asprintf("%s%s", propname, - ZPROP_INHERIT_SUFFIX); - err = zap_contains(mos, zapobj, inheritstr); - strfree(inheritstr); - if (err != 0 && err != ENOENT) - return (err); - } - - if (err == ENOENT) { - /* Check for a received value. */ - char *recvdstr = kmem_asprintf("%s%s", propname, - ZPROP_RECVD_SUFFIX); - err = zap_lookup(mos, zapobj, recvdstr, - intsz, numints, buf); - strfree(recvdstr); - if (err != ENOENT) { - if (setpoint != NULL && err == 0) - (void) strcpy(setpoint, - ZPROP_SOURCE_VAL_RECVD); - return (err); - } - } - } - - return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numints, buf, setpoint, ds->ds_is_snapshot)); -} - -static dsl_prop_record_t * -dsl_prop_record_find(dsl_dir_t *dd, const char *propname) -{ - dsl_prop_record_t *pr = NULL; - - ASSERT(MUTEX_HELD(&dd->dd_lock)); - - for (pr = list_head(&dd->dd_props); - pr != NULL; pr = list_next(&dd->dd_props, pr)) { - if (strcmp(pr->pr_propname, propname) == 0) - break; - } - - return (pr); -} - -static dsl_prop_record_t * -dsl_prop_record_create(dsl_dir_t *dd, const char *propname) -{ - dsl_prop_record_t *pr; - - ASSERT(MUTEX_HELD(&dd->dd_lock)); - - pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP); - pr->pr_propname = spa_strdup(propname); - list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t), - offsetof(dsl_prop_cb_record_t, cbr_pr_node)); - list_insert_head(&dd->dd_props, pr); - - return (pr); -} - -void -dsl_prop_init(dsl_dir_t *dd) -{ - list_create(&dd->dd_props, sizeof (dsl_prop_record_t), - offsetof(dsl_prop_record_t, pr_node)); -} - -void -dsl_prop_fini(dsl_dir_t *dd) -{ - dsl_prop_record_t *pr; - - while ((pr = list_remove_head(&dd->dd_props)) != NULL) { - list_destroy(&pr->pr_cbs); - strfree((char *)pr->pr_propname); - kmem_free(pr, sizeof (dsl_prop_record_t)); - } - list_destroy(&dd->dd_props); -} - -/* - * Register interest in the named property. We'll call the callback - * once to notify it of the current property value, and again each time - * the property changes, until this callback is unregistered. - * - * Return 0 on success, errno if the prop is not an integer value. - */ -int -dsl_prop_register(dsl_dataset_t *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg) -{ - dsl_dir_t *dd = ds->ds_dir; - dsl_pool_t *dp = dd->dd_pool; - uint64_t value; - dsl_prop_record_t *pr; - dsl_prop_cb_record_t *cbr; - int err; - - ASSERT(dsl_pool_config_held(dp)); - - err = dsl_prop_get_int_ds(ds, propname, &value); - if (err != 0) - return (err); - - cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); - cbr->cbr_ds = ds; - cbr->cbr_func = callback; - cbr->cbr_arg = cbarg; - - mutex_enter(&dd->dd_lock); - pr = dsl_prop_record_find(dd, propname); - if (pr == NULL) - pr = dsl_prop_record_create(dd, propname); - cbr->cbr_pr = pr; - list_insert_head(&pr->pr_cbs, cbr); - list_insert_head(&ds->ds_prop_cbs, cbr); - mutex_exit(&dd->dd_lock); - - cbr->cbr_func(cbr->cbr_arg, value); - return (0); -} - -int -dsl_prop_get(const char *dsname, const char *propname, - int intsz, int numints, void *buf, char *setpoint) -{ - objset_t *os; - int error; - - error = dmu_objset_hold(dsname, FTAG, &os); - if (error != 0) - return (error); - - error = dsl_prop_get_ds(dmu_objset_ds(os), propname, - intsz, numints, buf, setpoint); - - dmu_objset_rele(os, FTAG); - return (error); -} - -/* - * Get the current property value. 
It may have changed by the time this - * function returns, so it is NOT safe to follow up with - * dsl_prop_register() and assume that the value has not changed in - * between. - * - * Return 0 on success, ENOENT if ddname is invalid. - */ -int -dsl_prop_get_integer(const char *ddname, const char *propname, - uint64_t *valuep, char *setpoint) -{ - return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); -} - -int -dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, - uint64_t *valuep) -{ - return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); -} - -/* - * Predict the effective value of the given special property if it were set with - * the given value and source. This is not a general purpose function. It exists - * only to handle the special requirements of the quota and reservation - * properties. The fact that these properties are non-inheritable greatly - * simplifies the prediction logic. - * - * Returns 0 on success, a positive error code on failure, or -1 if called with - * a property not handled by this function. - */ -int -dsl_prop_predict(dsl_dir_t *dd, const char *propname, - zprop_source_t source, uint64_t value, uint64_t *newvalp) -{ - zfs_prop_t prop = zfs_name_to_prop(propname); - objset_t *mos; - uint64_t zapobj; - uint64_t version; - char *recvdstr; - int err = 0; - - switch (prop) { - case ZFS_PROP_QUOTA: - case ZFS_PROP_RESERVATION: - case ZFS_PROP_REFQUOTA: - case ZFS_PROP_REFRESERVATION: - break; - default: - return (-1); - } - - mos = dd->dd_pool->dp_meta_objset; - zapobj = dsl_dir_phys(dd)->dd_props_zapobj; - recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); - - version = spa_version(dd->dd_pool->dp_spa); - if (version < SPA_VERSION_RECVD_PROPS) { - if (source & ZPROP_SRC_NONE) - source = ZPROP_SRC_NONE; - else if (source & ZPROP_SRC_RECEIVED) - source = ZPROP_SRC_LOCAL; - } - - switch (source) { - case ZPROP_SRC_NONE: - /* Revert to the received value, if any. */ - err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); - if (err == ENOENT) - *newvalp = 0; - break; - case ZPROP_SRC_LOCAL: - *newvalp = value; - break; - case ZPROP_SRC_RECEIVED: - /* - * If there's no local setting, then the new received value will - * be the effective value. - */ - err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); - if (err == ENOENT) - *newvalp = value; - break; - case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): - /* - * We're clearing the received value, so the local setting (if - * it exists) remains the effective value. - */ - err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); - if (err == ENOENT) - *newvalp = 0; - break; - default: - panic("unexpected property source: %d", source); - } - - strfree(recvdstr); - - if (err == ENOENT) - return (0); - - return (err); -} - -/* - * Unregister all callbacks that are registered with the - * given callback argument. 
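/*
 * Editor's illustrative sketch (not part of the patch): the decision table
 * applied by dsl_prop_predict() above for the quota-like properties, written
 * as a standalone function.  has_local/has_recvd say whether an explicit
 * local or received entry already exists, "value" is the value being set,
 * and plain ints stand in for the kernel's zprop_source_t flags.
 */
#include <stdint.h>

#define	TOY_SRC_NONE		0x1	/* "zfs inherit -S": drop the local value */
#define	TOY_SRC_LOCAL		0x2	/* "zfs set" */
#define	TOY_SRC_RECEIVED	0x4	/* "zfs receive" */

static uint64_t
toy_predict(int source, uint64_t value,
    int has_local, uint64_t local, int has_recvd, uint64_t recvd)
{
	switch (source) {
	case TOY_SRC_NONE:			/* revert to received, if any */
		return (has_recvd ? recvd : 0);
	case TOY_SRC_LOCAL:			/* new local setting wins */
		return (value);
	case TOY_SRC_RECEIVED:			/* a local setting still masks it */
		return (has_local ? local : value);
	case TOY_SRC_NONE | TOY_SRC_RECEIVED:	/* clearing the received value */
		return (has_local ? local : 0);
	default:
		return (0);			/* not handled by this model */
	}
}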
- */ -void -dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg) -{ - dsl_prop_cb_record_t *cbr, *next_cbr; - - dsl_dir_t *dd = ds->ds_dir; - - mutex_enter(&dd->dd_lock); - next_cbr = list_head(&ds->ds_prop_cbs); - while (next_cbr != NULL) { - cbr = next_cbr; - next_cbr = list_next(&ds->ds_prop_cbs, cbr); - if (cbr->cbr_arg == cbarg) { - list_remove(&ds->ds_prop_cbs, cbr); - list_remove(&cbr->cbr_pr->pr_cbs, cbr); - kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); - } - } - mutex_exit(&dd->dd_lock); -} - -boolean_t -dsl_prop_hascb(dsl_dataset_t *ds) -{ - return (!list_is_empty(&ds->ds_prop_cbs)); -} - -/* ARGSUSED */ -static int -dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_record_t *pr; - dsl_prop_cb_record_t *cbr; - - mutex_enter(&dd->dd_lock); - for (pr = list_head(&dd->dd_props); - pr; pr = list_next(&dd->dd_props, pr)) { - for (cbr = list_head(&pr->pr_cbs); cbr; - cbr = list_next(&pr->pr_cbs, cbr)) { - uint64_t value; - - /* - * Callback entries do not have holds on their - * datasets so that datasets with registered - * callbacks are still eligible for eviction. - * Unlike operations to update properties on a - * single dataset, we are performing a recursive - * descent of related head datasets. The caller - * of this function only has a dataset hold on - * the passed in head dataset, not the snapshots - * associated with this dataset. Without a hold, - * the dataset pointer within callback records - * for snapshots can be invalidated by eviction - * at any time. - * - * Use dsl_dataset_try_add_ref() to verify - * that the dataset for a snapshot has not - * begun eviction processing and to prevent - * eviction from occurring for the duration of - * the callback. If the hold attempt fails, - * this object is already being evicted and the - * callback can be safely ignored. - */ - if (ds != cbr->cbr_ds && - !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) - continue; - - if (dsl_prop_get_ds(cbr->cbr_ds, - cbr->cbr_pr->pr_propname, sizeof (value), 1, - &value, NULL) == 0) - cbr->cbr_func(cbr->cbr_arg, value); - - if (ds != cbr->cbr_ds) - dsl_dataset_rele(cbr->cbr_ds, FTAG); - } - } - mutex_exit(&dd->dd_lock); - - return (0); -} - -/* - * Update all property values for ddobj & its descendants. This is used - * when renaming the dir. - */ -void -dsl_prop_notify_all(dsl_dir_t *dd) -{ - dsl_pool_t *dp = dd->dd_pool; - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, - NULL, DS_FIND_CHILDREN); -} - -static void -dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - const char *propname, uint64_t value, int first) -{ - dsl_dir_t *dd; - dsl_prop_record_t *pr; - dsl_prop_cb_record_t *cbr; - objset_t *mos = dp->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t *za; - int err; - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); - if (err) - return; - - if (!first) { - /* - * If the prop is set here, then this change is not - * being inherited here or below; stop the recursion. 
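/*
 * Editor's illustrative sketch (not part of the patch): the recursion shape
 * of dsl_prop_changed_notify(), begun above and continued below - walk the
 * dsl_dir tree top-down, fire the registered callbacks at each level, and
 * prune the descent wherever the property has an explicit local setting,
 * since the change is not inherited there or anywhere underneath.  The tree
 * node and callback here are hypothetical stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_node {
	const char *name;
	int has_local_setting;		/* property set directly on this dir */
	struct toy_node *children[8];	/* NULL-terminated list */
};

static void
toy_changed_notify(const struct toy_node *n, uint64_t newval, int first)
{
	if (n == NULL)
		return;
	/* below the dir that changed, a local setting stops the recursion */
	if (!first && n->has_local_setting)
		return;
	(void) printf("notify %s: effective value is now %llu\n", n->name,
	    (unsigned long long)newval);
	for (int i = 0; n->children[i] != NULL; i++)
		toy_changed_notify(n->children[i], newval, 0);
}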
- */ - err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, - propname); - if (err == 0) { - dsl_dir_rele(dd, FTAG); - return; - } - ASSERT3U(err, ==, ENOENT); - } - - mutex_enter(&dd->dd_lock); - pr = dsl_prop_record_find(dd, propname); - if (pr != NULL) { - for (cbr = list_head(&pr->pr_cbs); cbr; - cbr = list_next(&pr->pr_cbs, cbr)) { - uint64_t propobj; - - /* - * cbr->cbr_ds may be invalidated due to eviction, - * requiring the use of dsl_dataset_try_add_ref(). - * See comment block in dsl_prop_notify_all_cb() - * for details. - */ - if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) - continue; - - propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj; - - /* - * If the property is not set on this ds, then it is - * inherited here; call the callback. - */ - if (propobj == 0 || - zap_contains(mos, propobj, propname) != 0) - cbr->cbr_func(cbr->cbr_arg, value); - - dsl_dataset_rele(cbr->cbr_ds, FTAG); - } - } - mutex_exit(&dd->dd_lock); - - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (zap_cursor_init(&zc, mos, - dsl_dir_phys(dd)->dd_child_dir_zapobj); - zap_cursor_retrieve(&zc, za) == 0; - zap_cursor_advance(&zc)) { - dsl_prop_changed_notify(dp, za->za_first_integer, - propname, value, FALSE); - } - kmem_free(za, sizeof (zap_attribute_t)); - zap_cursor_fini(&zc); - dsl_dir_rele(dd, FTAG); -} - -void -dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, - zprop_source_t source, int intsz, int numints, const void *value, - dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t zapobj, intval, dummy; - int isint; - char valbuf[32]; - const char *valstr = NULL; - char *inheritstr; - char *recvdstr; - char *tbuf = NULL; - int err; - uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); - - isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0); - - if (ds->ds_is_snapshot) { - ASSERT(version >= SPA_VERSION_SNAP_PROPS); - if (dsl_dataset_phys(ds)->ds_props_obj == 0) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - dsl_dataset_phys(ds)->ds_props_obj = - zap_create(mos, - DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - } - zapobj = dsl_dataset_phys(ds)->ds_props_obj; - } else { - zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj; - } - - if (version < SPA_VERSION_RECVD_PROPS) { - if (source & ZPROP_SRC_NONE) - source = ZPROP_SRC_NONE; - else if (source & ZPROP_SRC_RECEIVED) - source = ZPROP_SRC_LOCAL; - } - - inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); - recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); - - switch (source) { - case ZPROP_SRC_NONE: - /* - * revert to received value, if any (inherit -S) - * - remove propname - * - remove propname$inherit - */ - err = zap_remove(mos, zapobj, propname, tx); - ASSERT(err == 0 || err == ENOENT); - err = zap_remove(mos, zapobj, inheritstr, tx); - ASSERT(err == 0 || err == ENOENT); - break; - case ZPROP_SRC_LOCAL: - /* - * remove propname$inherit - * set propname -> value - */ - err = zap_remove(mos, zapobj, inheritstr, tx); - ASSERT(err == 0 || err == ENOENT); - VERIFY0(zap_update(mos, zapobj, propname, - intsz, numints, value, tx)); - break; - case ZPROP_SRC_INHERITED: - /* - * explicitly inherit - * - remove propname - * - set propname$inherit - */ - err = zap_remove(mos, zapobj, propname, tx); - ASSERT(err == 0 || err == ENOENT); - if (version >= SPA_VERSION_RECVD_PROPS && - dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { - dummy = 0; - VERIFY0(zap_update(mos, zapobj, inheritstr, - 8, 1, &dummy, tx)); - } - break; - case 
ZPROP_SRC_RECEIVED: - /* - * set propname$recvd -> value - */ - err = zap_update(mos, zapobj, recvdstr, - intsz, numints, value, tx); - ASSERT(err == 0); - break; - case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): - /* - * clear local and received settings - * - remove propname - * - remove propname$inherit - * - remove propname$recvd - */ - err = zap_remove(mos, zapobj, propname, tx); - ASSERT(err == 0 || err == ENOENT); - err = zap_remove(mos, zapobj, inheritstr, tx); - ASSERT(err == 0 || err == ENOENT); - /* FALLTHRU */ - case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): - /* - * remove propname$recvd - */ - err = zap_remove(mos, zapobj, recvdstr, tx); - ASSERT(err == 0 || err == ENOENT); - break; - default: - cmn_err(CE_PANIC, "unexpected property source: %d", source); - } - - strfree(inheritstr); - strfree(recvdstr); - - if (isint) { - VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); - - if (ds->ds_is_snapshot) { - dsl_prop_cb_record_t *cbr; - /* - * It's a snapshot; nothing can inherit this - * property, so just look for callbacks on this - * ds here. - */ - mutex_enter(&ds->ds_dir->dd_lock); - for (cbr = list_head(&ds->ds_prop_cbs); cbr; - cbr = list_next(&ds->ds_prop_cbs, cbr)) { - if (strcmp(cbr->cbr_pr->pr_propname, - propname) == 0) - cbr->cbr_func(cbr->cbr_arg, intval); - } - mutex_exit(&ds->ds_dir->dd_lock); - } else { - dsl_prop_changed_notify(ds->ds_dir->dd_pool, - ds->ds_dir->dd_object, propname, intval, TRUE); - } - - (void) snprintf(valbuf, sizeof (valbuf), - "%lld", (longlong_t)intval); - valstr = valbuf; - } else { - if (source == ZPROP_SRC_LOCAL) { - valstr = value; - } else { - tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - if (dsl_prop_get_ds(ds, propname, 1, - ZAP_MAXVALUELEN, tbuf, NULL) == 0) - valstr = tbuf; - } - } - - spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE || - source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx, - "%s=%s", propname, (valstr == NULL ? 
"" : valstr)); - - if (tbuf != NULL) - kmem_free(tbuf, ZAP_MAXVALUELEN); -} - -int -dsl_prop_set_int(const char *dsname, const char *propname, - zprop_source_t source, uint64_t value) -{ - nvlist_t *nvl = fnvlist_alloc(); - int error; - - fnvlist_add_uint64(nvl, propname, value); - error = dsl_props_set(dsname, source, nvl); - fnvlist_free(nvl); - return (error); -} - -int -dsl_prop_set_string(const char *dsname, const char *propname, - zprop_source_t source, const char *value) -{ - nvlist_t *nvl = fnvlist_alloc(); - int error; - - fnvlist_add_string(nvl, propname, value); - error = dsl_props_set(dsname, source, nvl); - fnvlist_free(nvl); - return (error); -} - -int -dsl_prop_inherit(const char *dsname, const char *propname, - zprop_source_t source) -{ - nvlist_t *nvl = fnvlist_alloc(); - int error; - - fnvlist_add_boolean(nvl, propname); - error = dsl_props_set(dsname, source, nvl); - fnvlist_free(nvl); - return (error); -} - -typedef struct dsl_props_set_arg { - const char *dpsa_dsname; - zprop_source_t dpsa_source; - nvlist_t *dpsa_props; -} dsl_props_set_arg_t; - -static int -dsl_props_set_check(void *arg, dmu_tx_t *tx) -{ - dsl_props_set_arg_t *dpsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - uint64_t version; - nvpair_t *elem = NULL; - int err; - - err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); - if (err != 0) - return (err); - - version = spa_version(ds->ds_dir->dd_pool->dp_spa); - while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { - if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENAMETOOLONG)); - } - if (nvpair_type(elem) == DATA_TYPE_STRING) { - char *valstr = fnvpair_value_string(elem); - if (strlen(valstr) >= (version < - SPA_VERSION_STMF_PROP ? - ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { - dsl_dataset_rele(ds, FTAG); - return (E2BIG); - } - } - } - - if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(ENOTSUP)); - } - dsl_dataset_rele(ds, FTAG); - return (0); -} - -void -dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, - nvlist_t *props, dmu_tx_t *tx) -{ - nvpair_t *elem = NULL; - - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - nvpair_t *pair = elem; - const char *name = nvpair_name(pair); - - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - /* - * This usually happens when we reuse the nvlist_t data - * returned by the counterpart dsl_prop_get_all_impl(). - * For instance we do this to restore the original - * received properties when an error occurs in the - * zfs_ioc_recv() codepath. 
- */ - nvlist_t *attrs = fnvpair_value_nvlist(pair); - pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); - } - - if (nvpair_type(pair) == DATA_TYPE_STRING) { - const char *value = fnvpair_value_string(pair); - dsl_prop_set_sync_impl(ds, name, - source, 1, strlen(value) + 1, value, tx); - } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { - uint64_t intval = fnvpair_value_uint64(pair); - dsl_prop_set_sync_impl(ds, name, - source, sizeof (intval), 1, &intval, tx); - } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { - dsl_prop_set_sync_impl(ds, name, - source, 0, 0, NULL, tx); - } else { - panic("invalid nvpair type"); - } - } -} - -static void -dsl_props_set_sync(void *arg, dmu_tx_t *tx) -{ - dsl_props_set_arg_t *dpsa = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); - dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); - dsl_dataset_rele(ds, FTAG); -} - -/* - * All-or-nothing; if any prop can't be set, nothing will be modified. - */ -int -dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) -{ - dsl_props_set_arg_t dpsa; - int nblks = 0; - - dpsa.dpsa_dsname = dsname; - dpsa.dpsa_source = source; - dpsa.dpsa_props = props; - - /* - * If the source includes NONE, then we will only be removing entries - * from the ZAP object. In that case don't check for ENOSPC. - */ - if ((source & ZPROP_SRC_NONE) == 0) - nblks = 2 * fnvlist_num_pairs(props); - - return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, - &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED)); -} - -typedef enum dsl_prop_getflags { - DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ - DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ - DSL_PROP_GET_LOCAL = 0x4, /* local properties */ - DSL_PROP_GET_RECEIVED = 0x8 /* received properties */ -} dsl_prop_getflags_t; - -static int -dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, - const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) -{ - zap_cursor_t zc; - zap_attribute_t za; - int err = 0; - - for (zap_cursor_init(&zc, mos, propobj); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - nvlist_t *propval; - zfs_prop_t prop; - char buf[ZAP_MAXNAMELEN]; - char *valstr; - const char *suffix; - const char *propname; - const char *source; - - suffix = strchr(za.za_name, '$'); - - if (suffix == NULL) { - /* - * Skip local properties if we only want received - * properties. - */ - if (flags & DSL_PROP_GET_RECEIVED) - continue; - - propname = za.za_name; - source = setpoint; - } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { - /* Skip explicitly inherited entries. */ - continue; - } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { - if (flags & DSL_PROP_GET_LOCAL) - continue; - - (void) strncpy(buf, za.za_name, (suffix - za.za_name)); - buf[suffix - za.za_name] = '\0'; - propname = buf; - - if (!(flags & DSL_PROP_GET_RECEIVED)) { - /* Skip if locally overridden. */ - err = zap_contains(mos, propobj, propname); - if (err == 0) - continue; - if (err != ENOENT) - break; - - /* Skip if explicitly inherited. */ - valstr = kmem_asprintf("%s%s", propname, - ZPROP_INHERIT_SUFFIX); - err = zap_contains(mos, propobj, valstr); - strfree(valstr); - if (err == 0) - continue; - if (err != ENOENT) - break; - } - - source = ((flags & DSL_PROP_GET_INHERITING) ? - setpoint : ZPROP_SOURCE_VAL_RECVD); - } else { - /* - * For backward compatibility, skip suffixes we don't - * recognize. 
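/*
 * Editor's illustrative sketch (not part of the patch): how raw ZAP names are
 * classified by dsl_prop_get_all_impl(), begun above and continued below.  A
 * bare name is a local value, "name$inherit" (ZPROP_INHERIT_SUFFIX) is only a
 * marker and carries no value, "name$recvd" (ZPROP_RECVD_SUFFIX) is a
 * received value, and any other "$" suffix is skipped for compatibility with
 * future suffixes.
 */
#include <stdio.h>
#include <string.h>

enum toy_kind { TOY_LOCAL, TOY_INHERIT_MARKER, TOY_RECEIVED, TOY_UNKNOWN };

static enum toy_kind
toy_classify(const char *zap_name, char *propname, size_t len)
{
	const char *suffix = strchr(zap_name, '$');

	if (suffix == NULL) {
		(void) snprintf(propname, len, "%s", zap_name);
		return (TOY_LOCAL);
	}
	(void) snprintf(propname, len, "%.*s",
	    (int)(suffix - zap_name), zap_name);
	if (strcmp(suffix, "$inherit") == 0)
		return (TOY_INHERIT_MARKER);
	if (strcmp(suffix, "$recvd") == 0)
		return (TOY_RECEIVED);
	return (TOY_UNKNOWN);		/* unrecognized suffix: skip it */
}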
- */ - continue; - } - - prop = zfs_name_to_prop(propname); - - /* Skip non-inheritable properties. */ - if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && - !zfs_prop_inheritable(prop)) - continue; - - /* Skip properties not valid for this type. */ - if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && - !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) - continue; - - /* Skip properties already defined. */ - if (nvlist_exists(nv, propname)) - continue; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (za.za_integer_length == 1) { - /* - * String property - */ - char *tmp = kmem_alloc(za.za_num_integers, - KM_SLEEP); - err = zap_lookup(mos, propobj, - za.za_name, 1, za.za_num_integers, tmp); - if (err != 0) { - kmem_free(tmp, za.za_num_integers); - break; - } - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, - tmp) == 0); - kmem_free(tmp, za.za_num_integers); - } else { - /* - * Integer property - */ - ASSERT(za.za_integer_length == 8); - (void) nvlist_add_uint64(propval, ZPROP_VALUE, - za.za_first_integer); - } - - VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); - VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); - nvlist_free(propval); - } - zap_cursor_fini(&zc); - if (err == ENOENT) - err = 0; - return (err); -} - -/* - * Iterate over all properties for this dataset and return them in an nvlist. - */ -static int -dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, - dsl_prop_getflags_t flags) -{ - dsl_dir_t *dd = ds->ds_dir; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err = 0; - char setpoint[ZFS_MAX_DATASET_NAME_LEN]; - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - if (ds->ds_is_snapshot) - flags |= DSL_PROP_GET_SNAPSHOT; - - ASSERT(dsl_pool_config_held(dp)); - - if (dsl_dataset_phys(ds)->ds_props_obj != 0) { - ASSERT(flags & DSL_PROP_GET_SNAPSHOT); - dsl_dataset_name(ds, setpoint); - err = dsl_prop_get_all_impl(mos, - dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp); - if (err) - goto out; - } - - for (; dd != NULL; dd = dd->dd_parent) { - if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { - if (flags & (DSL_PROP_GET_LOCAL | - DSL_PROP_GET_RECEIVED)) - break; - flags |= DSL_PROP_GET_INHERITING; - } - dsl_dir_name(dd, setpoint); - err = dsl_prop_get_all_impl(mos, - dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp); - if (err) - break; - } -out: - return (err); -} - -boolean_t -dsl_prop_get_hasrecvd(const char *dsname) -{ - uint64_t dummy; - - return (0 == - dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); -} - -static int -dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) -{ - uint64_t version; - spa_t *spa; - int error = 0; - - VERIFY0(spa_open(dsname, &spa, FTAG)); - version = spa_version(spa); - spa_close(spa, FTAG); - - if (version >= SPA_VERSION_RECVD_PROPS) - error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); - return (error); -} - -/* - * Call after successfully receiving properties to ensure that only the first - * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. 
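/*
 * Editor's illustrative sketch (not part of the patch): the shape of the
 * nvlist assembled by the get_all path above - one nested nvlist per property
 * holding ZPROP_VALUE plus a ZPROP_SOURCE string (empty string for a default,
 * a dataset/dir name for a local or inherited value, "$recvd" for a received
 * one).  A userland consumer could walk it with libnvpair roughly like this;
 * the literal "value"/"source" strings match the ZPROP_VALUE/ZPROP_SOURCE
 * definitions.  Build with -lnvpair.
 */
#include <libnvpair.h>
#include <stdio.h>

static void
toy_dump_props(nvlist_t *props)
{
	nvpair_t *elem = NULL;

	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		nvlist_t *propval;
		uint64_t ival;
		char *sval;
		char *source = "";

		if (nvpair_value_nvlist(elem, &propval) != 0)
			continue;
		(void) nvlist_lookup_string(propval, "source", &source);
		if (nvlist_lookup_uint64(propval, "value", &ival) == 0)
			(void) printf("%s=%llu (%s)\n", nvpair_name(elem),
			    (unsigned long long)ival, source);
		else if (nvlist_lookup_string(propval, "value", &sval) == 0)
			(void) printf("%s=%s (%s)\n", nvpair_name(elem),
			    sval, source);
	}
}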
- */ -int -dsl_prop_set_hasrecvd(const char *dsname) -{ - int error = 0; - if (!dsl_prop_get_hasrecvd(dsname)) - error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); - return (error); -} - -void -dsl_prop_unset_hasrecvd(const char *dsname) -{ - VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); -} - -int -dsl_prop_get_all(objset_t *os, nvlist_t **nvp) -{ - return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); -} - -int -dsl_prop_get_received(const char *dsname, nvlist_t **nvp) -{ - objset_t *os; - int error; - - /* - * Received properties are not distinguishable from local properties - * until the dataset has received properties on or after - * SPA_VERSION_RECVD_PROPS. - */ - dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? - DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); - - error = dmu_objset_hold(dsname, FTAG, &os); - if (error != 0) - return (error); - error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); - dmu_objset_rele(os, FTAG); - return (error); -} - -void -dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) -{ - nvlist_t *propval; - const char *propname = zfs_prop_to_name(prop); - uint64_t default_value; - - if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { - VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); - return; - } - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); - /* Indicate the default source if we can. */ - if (dodefault(prop, 8, 1, &default_value) == 0 && - value == default_value) { - VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); - } - VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); - nvlist_free(propval); -} - -void -dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) -{ - nvlist_t *propval; - const char *propname = zfs_prop_to_name(prop); - - if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); - return; - } - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); - nvlist_free(propval); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ /dev/null @@ -1,4001 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2016 Gary Mills - * Copyright (c) 2011, 2017 by Delphix. 
All rights reserved. - * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#endif - -/* - * Grand theory statement on scan queue sorting - * - * Scanning is implemented by recursively traversing all indirection levels - * in an object and reading all blocks referenced from said objects. This - * results in us approximately traversing the object from lowest logical - * offset to the highest. For best performance, we would want the logical - * blocks to be physically contiguous. However, this is frequently not the - * case with pools given the allocation patterns of copy-on-write filesystems. - * So instead, we put the I/Os into a reordering queue and issue them in a - * way that will most benefit physical disks (LBA-order). - * - * Queue management: - * - * Ideally, we would want to scan all metadata and queue up all block I/O - * prior to starting to issue it, because that allows us to do an optimal - * sorting job. This can however consume large amounts of memory. Therefore - * we continuously monitor the size of the queues and constrain them to 5% - * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this - * limit, we clear out a few of the largest extents at the head of the queues - * to make room for more scanning. Hopefully, these extents will be fairly - * large and contiguous, allowing us to approach sequential I/O throughput - * even without a fully sorted tree. - * - * Metadata scanning takes place in dsl_scan_visit(), which is called from - * dsl_scan_sync() every spa_sync(). If we have either fully scanned all - * metadata on the pool, or we need to make room in memory because our - * queues are too large, dsl_scan_visit() is postponed and - * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies - * that metadata scanning and queued I/O issuing are mutually exclusive. This - * allows us to provide maximum sequential I/O throughput for the majority of - * I/O's issued since sequential I/O performance is significantly negatively - * impacted if it is interleaved with random I/O. - * - * Implementation Notes - * - * One side effect of the queued scanning algorithm is that the scanning code - * needs to be notified whenever a block is freed. This is needed to allow - * the scanning code to remove these I/Os from the issuing queue. Additionally, - * we do not attempt to queue gang blocks to be issued sequentially since this - * is very hard to do and would have an extremely limitted performance benefit. - * Instead, we simply issue gang I/Os as soon as we find them using the legacy - * algorithm. - * - * Backwards compatibility - * - * This new algorithm is backwards compatible with the legacy on-disk data - * structures (and therefore does not require a new feature flag). - * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan - * will stop scanning metadata (in logical order) and wait for all outstanding - * sorted I/O to complete. Once this is done, we write out a checkpoint - * bookmark, indicating that we have scanned everything logically before it. - * If the pool is imported on a machine without the new sorting algorithm, - * the scan simply resumes from the last checkpoint using the legacy algorithm. 
- */ - -typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, - const zbookmark_phys_t *); - -static scan_cb_t dsl_scan_scrub_cb; - -static int scan_ds_queue_compare(const void *a, const void *b); -static int scan_prefetch_queue_compare(const void *a, const void *b); -static void scan_ds_queue_clear(dsl_scan_t *scn); -static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, - uint64_t *txg); -static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); -static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); -static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); -static uint64_t dsl_scan_count_leaves(vdev_t *vd); - -extern int zfs_vdev_async_write_active_min_dirty_percent; - -/* - * By default zfs will check to ensure it is not over the hard memory - * limit before each txg. If finer-grained control of this is needed - * this value can be set to 1 to enable checking before scanning each - * block. - */ -int zfs_scan_strict_mem_lim = B_FALSE; - -unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver -- 2 is a good number */ -unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub -- 4 is a good number */ -unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ - -/* - * Maximum number of parallelly executed bytes per leaf vdev. We attempt - * to strike a balance here between keeping the vdev queues full of I/Os - * at all times and not overflowing the queues to cause long latency, - * which would cause long txg sync times. No matter what, we will not - * overload the drives with I/O, since that is protected by - * zfs_vdev_scrub_max_active. - */ -unsigned long zfs_scan_vdev_limit = 4 << 20; - -int zfs_scan_issue_strategy = 0; -int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ -uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ - -unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */ -#define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval) - -/* - * fill_weight is non-tunable at runtime, so we copy it at module init from - * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would - * break queue sorting. 
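/*
 * Editor's illustrative sketch (not part of the patch): the core idea of the
 * sorted scan described in the theory statement above - collect block I/Os in
 * the order the metadata traversal discovers them, then issue them in LBA
 * (offset) order so rotational vdevs see mostly sequential reads.  The real
 * code keeps per-vdev AVL and range trees; a plain qsort() is enough to show
 * the reordering, with hypothetical offsets.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_sio {
	uint64_t offset;	/* on-disk offset of the block */
	uint64_t asize;		/* allocated size */
};

static int
toy_offset_cmp(const void *a, const void *b)
{
	const struct toy_sio *sa = a, *sb = b;

	if (sa->offset < sb->offset)
		return (-1);
	return (sa->offset > sb->offset);
}

int
main(void)
{
	/* blocks in the order a logical traversal happened to find them */
	struct toy_sio q[] = {
		{ 9ULL << 20, 131072 }, { 1ULL << 20, 16384 },
		{ 5ULL << 20, 131072 }, { 2ULL << 20, 4096 },
	};
	size_t n = sizeof (q) / sizeof (q[0]);

	qsort(q, n, sizeof (q[0]), toy_offset_cmp);	/* LBA order */
	for (size_t i = 0; i < n; i++)
		(void) printf("issue %llu bytes at offset %llu\n",
		    (unsigned long long)q[i].asize,
		    (unsigned long long)q[i].offset);
	return (0);
}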
- */ -uint64_t zfs_scan_fill_weight = 3; -static uint64_t fill_weight; - -/* See dsl_scan_should_clear() for details on the memory limit tunables */ -uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ -uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ -int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ -int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ - -unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ -unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ -unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ -unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ -boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ -boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, - &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, - &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, - &zfs_scan_idle, 0, "Idle scan window in clock ticks"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, - &zfs_scrub_min_time_ms, 0, "Min millisecs to scrub per txg"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, - &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, - &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, - &zfs_no_scrub_io, 0, "Disable scrub I/O"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, - &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_legacy, CTLFLAG_RWTUN, - &zfs_scan_legacy, 0, "Scrub using legacy non-sequential method"); -SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_checkpoint_interval, CTLFLAG_RWTUN, - &zfs_scan_checkpoint_intval, 0, "Scan progress on-disk checkpointing interval"); - -enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; -/* max number of blocks to free in a single TXG */ -uint64_t zfs_async_block_max_blocks = UINT64_MAX; -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, - &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG"); - -/* - * We wait a few txgs after importing a pool to begin scanning so that - * the import / mounting code isn't held up by scrub / resilver IO. - * Unfortunately, it is a bit difficult to determine exactly how long - * this will take since userspace will trigger fs mounts asynchronously - * and the kernel will create zvol minors asynchronously. As a result, - * the value provided here is a bit arbitrary, but represents a - * reasonable estimate of how many txgs it will take to finish fully - * importing a pool - */ -#define SCAN_IMPORT_WAIT_TXGS 5 - - -#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ - ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ - (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) - -extern int zfs_txg_timeout; - -/* - * Enable/disable the processing of the free_bpobj object. 
- */ -boolean_t zfs_free_bpobj_enabled = B_TRUE; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN, - &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing"); - -/* the order has to match pool_scan_type */ -static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { - NULL, - dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ - dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ -}; - -/* In core node for the scn->scn_queue. Represents a dataset to be scanned */ -typedef struct { - uint64_t sds_dsobj; - uint64_t sds_txg; - avl_node_t sds_node; -} scan_ds_t; - -/* - * This controls what conditions are placed on dsl_scan_sync_state(): - * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 - * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. - * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise - * write out the scn_phys_cached version. - * See dsl_scan_sync_state for details. - */ -typedef enum { - SYNC_OPTIONAL, - SYNC_MANDATORY, - SYNC_CACHED -} state_sync_type_t; - -/* - * This struct represents the minimum information needed to reconstruct a - * zio for sequential scanning. This is useful because many of these will - * accumulate in the sequential IO queues before being issued, so saving - * memory matters here. - */ -typedef struct scan_io { - /* fields from blkptr_t */ - uint64_t sio_offset; - uint64_t sio_blk_prop; - uint64_t sio_phys_birth; - uint64_t sio_birth; - zio_cksum_t sio_cksum; - uint32_t sio_asize; - - /* fields from zio_t */ - int sio_flags; - zbookmark_phys_t sio_zb; - - /* members for queue sorting */ - union { - avl_node_t sio_addr_node; /* link into issueing queue */ - list_node_t sio_list_node; /* link for issuing to disk */ - } sio_nodes; -} scan_io_t; - -struct dsl_scan_io_queue { - dsl_scan_t *q_scn; /* associated dsl_scan_t */ - vdev_t *q_vd; /* top-level vdev that this queue represents */ - - /* trees used for sorting I/Os and extents of I/Os */ - range_tree_t *q_exts_by_addr; - avl_tree_t q_exts_by_size; - avl_tree_t q_sios_by_addr; - - /* members for zio rate limiting */ - uint64_t q_maxinflight_bytes; - uint64_t q_inflight_bytes; - kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ - - /* per txg statistics */ - uint64_t q_total_seg_size_this_txg; - uint64_t q_segs_this_txg; - uint64_t q_total_zio_size_this_txg; - uint64_t q_zios_this_txg; -}; - -/* private data for dsl_scan_prefetch_cb() */ -typedef struct scan_prefetch_ctx { - zfs_refcount_t spc_refcnt; /* refcount for memory management */ - dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ - boolean_t spc_root; /* is this prefetch for an objset? 
*/ - uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ - uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ -} scan_prefetch_ctx_t; - -/* private data for dsl_scan_prefetch() */ -typedef struct scan_prefetch_issue_ctx { - avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ - scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ - blkptr_t spic_bp; /* bp to prefetch */ - zbookmark_phys_t spic_zb; /* bookmark to prefetch */ -} scan_prefetch_issue_ctx_t; - -static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, - const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); -static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, - scan_io_t *sio); - -static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); -static void scan_io_queues_destroy(dsl_scan_t *scn); - -static kmem_cache_t *sio_cache; - -void -scan_init(void) -{ - /* - * This is used in ext_size_compare() to weight segments - * based on how sparse they are. This cannot be changed - * mid-scan and the tree comparison functions don't currently - * have a mechansim for passing additional context to the - * compare functions. Thus we store this value globally and - * we only allow it to be set at module intiailization time - */ - fill_weight = zfs_scan_fill_weight; - - sio_cache = kmem_cache_create("sio_cache", - sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -scan_fini(void) -{ - kmem_cache_destroy(sio_cache); -} - -static inline boolean_t -dsl_scan_is_running(const dsl_scan_t *scn) -{ - return (scn->scn_phys.scn_state == DSS_SCANNING); -} - -boolean_t -dsl_scan_resilvering(dsl_pool_t *dp) -{ - return (dsl_scan_is_running(dp->dp_scan) && - dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); -} - -static inline void -sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) -{ - bzero(bp, sizeof (*bp)); - DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); - DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); - DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); - bp->blk_prop = sio->sio_blk_prop; - bp->blk_phys_birth = sio->sio_phys_birth; - bp->blk_birth = sio->sio_birth; - bp->blk_fill = 1; /* we always only work with data pointers */ - bp->blk_cksum = sio->sio_cksum; -} - -static inline void -bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) -{ - /* we discard the vdev id, since we can deduce it from the queue */ - sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); - sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); - sio->sio_blk_prop = bp->blk_prop; - sio->sio_phys_birth = bp->blk_phys_birth; - sio->sio_birth = bp->blk_birth; - sio->sio_cksum = bp->blk_cksum; -} - -void -dsl_scan_global_init(void) -{ - /* - * This is used in ext_size_compare() to weight segments - * based on how sparse they are. This cannot be changed - * mid-scan and the tree comparison functions don't currently - * have a mechansim for passing additional context to the - * compare functions. Thus we store this value globally and - * we only allow it to be set at module intiailization time - */ - fill_weight = zfs_scan_fill_weight; -} - -int -dsl_scan_init(dsl_pool_t *dp, uint64_t txg) -{ - int err; - dsl_scan_t *scn; - spa_t *spa = dp->dp_spa; - uint64_t f; - - scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); - scn->scn_dp = dp; - - /* - * It's possible that we're resuming a scan after a reboot so - * make sure that the scan_async_destroying flag is initialized - * appropriately. 
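/*
 * Editor's illustrative sketch (not part of the patch): why bp2sio()/sio2bp()
 * above can drop the vdev id - each sorting queue belongs to exactly one
 * top-level vdev, so a queued I/O only needs to remember offset, size, birth
 * and checksum, and the vdev id is re-supplied by the owning queue when the
 * zio is finally issued.  Toy types stand in for blkptr_t and scan_io_t.
 */
#include <stdint.h>

struct toy_bp { uint64_t vdev, offset, asize, birth; };
struct toy_sio { uint64_t offset, asize, birth; };	/* no vdev: implied */

static void
toy_bp2sio(const struct toy_bp *bp, struct toy_sio *sio)
{
	sio->offset = bp->offset;
	sio->asize = bp->asize;
	sio->birth = bp->birth;
}

static void
toy_sio2bp(const struct toy_sio *sio, struct toy_bp *bp, uint64_t queue_vdev)
{
	bp->vdev = queue_vdev;		/* supplied by the owning queue */
	bp->offset = sio->offset;
	bp->asize = sio->asize;
	bp->birth = sio->birth;
}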
- */ - ASSERT(!scn->scn_async_destroying); - scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_ASYNC_DESTROY); - - /* - * Calculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. - */ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20); - - avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), - offsetof(scan_ds_t, sds_node)); - avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, - sizeof (scan_prefetch_issue_ctx_t), - offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); - - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - "scrub_func", sizeof (uint64_t), 1, &f); - if (err == 0) { - /* - * There was an old-style scrub in progress. Restart a - * new-style scrub from the beginning. - */ - scn->scn_restart_txg = txg; - zfs_dbgmsg("old-style scrub was in progress; " - "restarting new-style scrub in txg %llu", - (longlong_t)scn->scn_restart_txg); - - /* - * Load the queue obj from the old location so that it - * can be freed by dsl_scan_done(). - */ - (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - "scrub_queue", sizeof (uint64_t), 1, - &scn->scn_phys.scn_queue_obj); - } else { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, - &scn->scn_phys); - if (err == ENOENT) - return (0); - else if (err) - return (err); - - /* - * We might be restarting after a reboot, so jump the issued - * counter to how far we've scanned. We know we're consistent - * up to here. - */ - scn->scn_issued_before_pass = scn->scn_phys.scn_examined; - - if (dsl_scan_is_running(scn) && - spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { - /* - * A new-type scrub was in progress on an old - * pool, and the pool was accessed by old - * software. Restart from the beginning, since - * the old software may have changed the pool in - * the meantime. 
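/*
 * Editor's illustrative sketch (not part of the patch): the pool-wide
 * in-flight limit computed in dsl_scan_init() above is just
 * zfs_scan_vdev_limit scaled by the number of leaf vdevs, with a 1 MiB floor;
 * with the 4 MiB default and 12 leaves the cap works out to 48 MiB.
 */
#include <stdint.h>

static uint64_t
toy_maxinflight_bytes(uint64_t per_vdev_limit, uint64_t leaf_vdevs)
{
	uint64_t limit = per_vdev_limit * leaf_vdevs;

	return (limit > (1ULL << 20) ? limit : (1ULL << 20));
}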
- */ - scn->scn_restart_txg = txg; - zfs_dbgmsg("new-style scrub was modified " - "by old software; restarting in txg %llu", - (longlong_t)scn->scn_restart_txg); - } - } - - bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); - - /* reload the queue into the in-core state */ - if (scn->scn_phys.scn_queue_obj != 0) { - zap_cursor_t zc; - zap_attribute_t za; - - for (zap_cursor_init(&zc, dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj); - zap_cursor_retrieve(&zc, &za) == 0; - (void) zap_cursor_advance(&zc)) { - scan_ds_queue_insert(scn, - zfs_strtonum(za.za_name, NULL), - za.za_first_integer); - } - zap_cursor_fini(&zc); - } - - spa_scan_stat_init(spa); - return (0); -} - -void -dsl_scan_fini(dsl_pool_t *dp) -{ - if (dp->dp_scan != NULL) { - dsl_scan_t *scn = dp->dp_scan; - - if (scn->scn_taskq != NULL) - taskq_destroy(scn->scn_taskq); - scan_ds_queue_clear(scn); - avl_destroy(&scn->scn_queue); - avl_destroy(&scn->scn_prefetch_queue); - - kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); - dp->dp_scan = NULL; - } -} - -static boolean_t -dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) -{ - return (scn->scn_restart_txg != 0 && - scn->scn_restart_txg <= tx->tx_txg); -} - -boolean_t -dsl_scan_scrubbing(const dsl_pool_t *dp) -{ - dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; - - return (scn_phys->scn_state == DSS_SCANNING && - scn_phys->scn_func == POOL_SCAN_SCRUB); -} - -boolean_t -dsl_scan_is_paused_scrub(const dsl_scan_t *scn) -{ - return (dsl_scan_scrubbing(scn->scn_dp) && - scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); -} - -/* - * Writes out a persistent dsl_scan_phys_t record to the pool directory. - * Because we can be running in the block sorting algorithm, we do not always - * want to write out the record, only when it is "safe" to do so. This safety - * condition is achieved by making sure that the sorting queues are empty - * (scn_bytes_pending == 0). When this condition is not true, the sync'd state - * is inconsistent with how much actual scanning progress has been made. The - * kind of sync to be performed is specified by the sync_type argument. If the - * sync is optional, we only sync if the queues are empty. If the sync is - * mandatory, we do a hard ASSERT to make sure that the queues are empty. The - * third possible state is a "cached" sync. This is done in response to: - * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been - * destroyed, so we wouldn't be able to restart scanning from it. - * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been - * superseded by a newer snapshot. - * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been - * swapped with its clone. - * In all cases, a cached sync simply rewrites the last record we've written, - * just slightly modified. For the modifications that are performed to the - * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, - * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. 
- */ -static void -dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) -{ - int i; - spa_t *spa = scn->scn_dp->dp_spa; - - ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); - if (scn->scn_bytes_pending == 0) { - for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; - dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; - - if (q == NULL) - continue; - - mutex_enter(&vd->vdev_scan_io_queue_lock); - ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); - ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); - ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); - mutex_exit(&vd->vdev_scan_io_queue_lock); - } - - if (scn->scn_phys.scn_queue_obj != 0) - scan_ds_queue_sync(scn, tx); - VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, - &scn->scn_phys, tx)); - bcopy(&scn->scn_phys, &scn->scn_phys_cached, - sizeof (scn->scn_phys)); - - if (scn->scn_checkpointing) - zfs_dbgmsg("finish scan checkpoint"); - - scn->scn_checkpointing = B_FALSE; - scn->scn_last_checkpoint = ddi_get_lbolt(); - } else if (sync_type == SYNC_CACHED) { - VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, - &scn->scn_phys_cached, tx)); - } -} - -/* ARGSUSED */ -static int -dsl_scan_setup_check(void *arg, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - if (dsl_scan_is_running(scn)) - return (SET_ERROR(EBUSY)); - - return (0); -} - -static void -dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - pool_scan_func_t *funcp = arg; - dmu_object_type_t ot = 0; - dsl_pool_t *dp = scn->scn_dp; - spa_t *spa = dp->dp_spa; - - ASSERT(!dsl_scan_is_running(scn)); - ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); - bzero(&scn->scn_phys, sizeof (scn->scn_phys)); - scn->scn_phys.scn_func = *funcp; - scn->scn_phys.scn_state = DSS_SCANNING; - scn->scn_phys.scn_min_txg = 0; - scn->scn_phys.scn_max_txg = tx->tx_txg; - scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ - scn->scn_phys.scn_start_time = gethrestime_sec(); - scn->scn_phys.scn_errors = 0; - scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; - scn->scn_issued_before_pass = 0; - scn->scn_restart_txg = 0; - scn->scn_done_txg = 0; - scn->scn_last_checkpoint = 0; - scn->scn_checkpointing = B_FALSE; - spa_scan_stat_init(spa); - - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; - - /* rewrite all disk labels */ - vdev_config_dirty(spa->spa_root_vdev); - - if (vdev_resilver_needed(spa->spa_root_vdev, - &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { - spa_event_notify(spa, NULL, NULL, - ESC_ZFS_RESILVER_START); - } else { - spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); - } - - spa->spa_scrub_started = B_TRUE; - /* - * If this is an incremental scrub, limit the DDT scrub phase - * to just the auto-ditto class (for correctness); the rest - * of the scrub should go faster using top-down pruning. 
- */ - if (scn->scn_phys.scn_min_txg > TXG_INITIAL) - scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; - - } - - /* back to the generic stuff */ - - if (dp->dp_blkstats == NULL) { - dp->dp_blkstats = - kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); - mutex_init(&dp->dp_blkstats->zab_lock, NULL, - MUTEX_DEFAULT, NULL); - } - bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); - - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) - ot = DMU_OT_ZAP_OTHER; - - scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, - ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); - - bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); - - dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); - - spa_history_log_internal(spa, "scan setup", tx, - "func=%u mintxg=%llu maxtxg=%llu", - *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); -} - -/* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. - */ -int -dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) -{ - spa_t *spa = dp->dp_spa; - dsl_scan_t *scn = dp->dp_scan; - - /* - * Purge all vdev caches and probe all devices. We do this here - * rather than in sync context because this requires a writer lock - * on the spa_config lock, which we can't do from sync context. The - * spa_scrub_reopen flag indicates that vdev_open() should not - * attempt to start another scrub. - */ - spa_vdev_state_enter(spa, SCL_NONE); - spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - (void) spa_vdev_state_exit(spa, NULL, 0); - - if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { - /* got scrub start cmd, resume paused scrub */ - int err = dsl_scrub_set_pause_resume(scn->scn_dp, - POOL_SCRUB_NORMAL); - if (err == 0) { - spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); - return (ECANCELED); - } - return (SET_ERROR(err)); - } - - return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, - dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -/* ARGSUSED */ -static void -dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) -{ - static const char *old_names[] = { - "scrub_bookmark", - "scrub_ddt_bookmark", - "scrub_ddt_class_max", - "scrub_queue", - "scrub_min_txg", - "scrub_max_txg", - "scrub_func", - "scrub_errors", - NULL - }; - - dsl_pool_t *dp = scn->scn_dp; - spa_t *spa = dp->dp_spa; - int i; - - /* Remove any remnants of an old-style scrub. */ - for (i = 0; old_names[i]; i++) { - (void) zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); - } - - if (scn->scn_phys.scn_queue_obj != 0) { - VERIFY0(dmu_object_free(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, tx)); - scn->scn_phys.scn_queue_obj = 0; - } - scan_ds_queue_clear(scn); - - scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; - - /* - * If we were "restarted" from a stopped state, don't bother - * with anything else. - */ - if (!dsl_scan_is_running(scn)) { - ASSERT(!scn->scn_is_sorted); - return; - } - - if (scn->scn_is_sorted) { - scan_io_queues_destroy(scn); - scn->scn_is_sorted = B_FALSE; - - if (scn->scn_taskq != NULL) { - taskq_destroy(scn->scn_taskq); - scn->scn_taskq = NULL; - } - } - - scn->scn_phys.scn_state = complete ? 
DSS_FINISHED : DSS_CANCELED; - - if (dsl_scan_restarting(scn, tx)) - spa_history_log_internal(spa, "scan aborted, restarting", tx, - "errors=%llu", spa_get_errlog_size(spa)); - else if (!complete) - spa_history_log_internal(spa, "scan cancelled", tx, - "errors=%llu", spa_get_errlog_size(spa)); - else - spa_history_log_internal(spa, "scan done", tx, - "errors=%llu", spa_get_errlog_size(spa)); - - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - spa->spa_scrub_started = B_FALSE; - spa->spa_scrub_active = B_FALSE; - - /* - * If the scrub/resilver completed, update all DTLs to - * reflect this. Whether it succeeded or not, vacate - * all temporary scrub DTLs. - * - * As the scrub does not currently support traversing - * data that have been freed but are part of a checkpoint, - * we don't mark the scrub as done in the DTLs as faults - * may still exist in those vdevs. - */ - if (complete && - !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - scn->scn_phys.scn_max_txg, B_TRUE); - - spa_event_notify(spa, NULL, NULL, - scn->scn_phys.scn_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); - } else { - vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, - 0, B_TRUE); - } - spa_errlog_rotate(spa); - - /* - * We may have finished replacing a device. - * Let the async thread assess this and handle the detach. - */ - spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); - } - - scn->scn_phys.scn_end_time = gethrestime_sec(); - - ASSERT(!dsl_scan_is_running(scn)); -} - -/* ARGSUSED */ -static int -dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - if (!dsl_scan_is_running(scn)) - return (SET_ERROR(ENOENT)); - return (0); -} - -/* ARGSUSED */ -static void -dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - dsl_scan_done(scn, B_FALSE, tx); - dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); - spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); -} - -int -dsl_scan_cancel(dsl_pool_t *dp) -{ - return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, - dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); -} - -static int -dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) -{ - pool_scrub_cmd_t *cmd = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_scan_t *scn = dp->dp_scan; - - if (*cmd == POOL_SCRUB_PAUSE) { - /* can't pause a scrub when there is no in-progress scrub */ - if (!dsl_scan_scrubbing(dp)) - return (SET_ERROR(ENOENT)); - - /* can't pause a paused scrub */ - if (dsl_scan_is_paused_scrub(scn)) - return (SET_ERROR(EBUSY)); - } else if (*cmd != POOL_SCRUB_NORMAL) { - return (SET_ERROR(ENOTSUP)); - } - - return (0); -} - -static void -dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) -{ - pool_scrub_cmd_t *cmd = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - spa_t *spa = dp->dp_spa; - dsl_scan_t *scn = dp->dp_scan; - - if (*cmd == POOL_SCRUB_PAUSE) { - /* can't pause a scrub when there is no in-progress scrub */ - spa->spa_scan_pass_scrub_pause = gethrestime_sec(); - scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; - scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx, SYNC_CACHED); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); - } else { - ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); - if (dsl_scan_is_paused_scrub(scn)) { - /* - * We need to keep track of how much time we spend - * paused per pass so that we can adjust the scrub rate - * shown in the output of 'zpool status' - */ - 
spa->spa_scan_pass_scrub_spent_paused += - gethrestime_sec() - spa->spa_scan_pass_scrub_pause; - spa->spa_scan_pass_scrub_pause = 0; - scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; - scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx, SYNC_CACHED); - } - } -} - -/* - * Set scrub pause/resume state if it makes sense to do so - */ -int -dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) -{ - return (dsl_sync_task(spa_name(dp->dp_spa), - dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, - ZFS_SPACE_CHECK_RESERVED)); -} - - -/* start a new scan, or restart an existing one. */ -void -dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) -{ - if (txg == 0) { - dmu_tx_t *tx; - tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); - - txg = dmu_tx_get_txg(tx); - dp->dp_scan->scn_restart_txg = txg; - dmu_tx_commit(tx); - } else { - dp->dp_scan->scn_restart_txg = txg; - } - zfs_dbgmsg("restarting resilver txg=%llu", txg); -} - -void -dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) -{ - zio_free(dp->dp_spa, txg, bp); -} - -void -dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) -{ - ASSERT(dsl_pool_sync_context(dp)); - zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), - pio->io_flags)); -} - -static int -scan_ds_queue_compare(const void *a, const void *b) -{ - const scan_ds_t *sds_a = a, *sds_b = b; - - if (sds_a->sds_dsobj < sds_b->sds_dsobj) - return (-1); - if (sds_a->sds_dsobj == sds_b->sds_dsobj) - return (0); - return (1); -} - -static void -scan_ds_queue_clear(dsl_scan_t *scn) -{ - void *cookie = NULL; - scan_ds_t *sds; - while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { - kmem_free(sds, sizeof (*sds)); - } -} - -static boolean_t -scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) -{ - scan_ds_t srch, *sds; - - srch.sds_dsobj = dsobj; - sds = avl_find(&scn->scn_queue, &srch, NULL); - if (sds != NULL && txg != NULL) - *txg = sds->sds_txg; - return (sds != NULL); -} - -static void -scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) -{ - scan_ds_t *sds; - avl_index_t where; - - sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); - sds->sds_dsobj = dsobj; - sds->sds_txg = txg; - - VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); - avl_insert(&scn->scn_queue, sds, where); -} - -static void -scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) -{ - scan_ds_t srch, *sds; - - srch.sds_dsobj = dsobj; - - sds = avl_find(&scn->scn_queue, &srch, NULL); - VERIFY(sds != NULL); - avl_remove(&scn->scn_queue, sds); - kmem_free(sds, sizeof (*sds)); -} - -static void -scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) -{ - dsl_pool_t *dp = scn->scn_dp; - spa_t *spa = dp->dp_spa; - dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? - DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; - - ASSERT0(scn->scn_bytes_pending); - ASSERT(scn->scn_phys.scn_queue_obj != 0); - - VERIFY0(dmu_object_free(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, tx)); - scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, - DMU_OT_NONE, 0, tx); - for (scan_ds_t *sds = avl_first(&scn->scn_queue); - sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { - VERIFY0(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, sds->sds_dsobj, - sds->sds_txg, tx)); - } -} - -/* - * Computes the memory limit state that we're currently in. 
A sorted scan - * needs quite a bit of memory to hold the sorting queue, so we need to - * reasonably constrain the size so it doesn't impact overall system - * performance. We compute two limits: - * 1) Hard memory limit: if the amount of memory used by the sorting - * queues on a pool gets above this value, we stop the metadata - * scanning portion and start issuing the queued up and sorted - * I/Os to reduce memory usage. - * This limit is calculated as a fraction of physmem (by default 5%). - * We constrain the lower bound of the hard limit to an absolute - * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain - * the upper bound to 5% of the total pool size - no chance we'll - * ever need that much memory, but just to keep the value in check. - * 2) Soft memory limit: once we hit the hard memory limit, we start - * issuing I/O to reduce queue memory usage, but we don't want to - * completely empty out the queues, since we might be able to find I/Os - * that will fill in the gaps of our non-sequential IOs at some point - * in the future. So we stop the issuing of I/Os once the amount of - * memory used drops below the soft limit (at which point we stop issuing - * I/O and start scanning metadata again). - * - * This limit is calculated by subtracting a fraction of the hard - * limit from the hard limit. By default this fraction is 5%, so - * the soft limit is 95% of the hard limit. We cap the size of the - * difference between the hard and soft limits at an absolute - * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is - * sufficient to not cause too frequent switching between the - * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's - * worth of queues is about 1.2 GiB of on-pool data, so scanning - * that should take at least a decent fraction of a second). - */ -static boolean_t -dsl_scan_should_clear(dsl_scan_t *scn) -{ - spa_t *spa = scn->scn_dp->dp_spa; - vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - uint64_t alloc, mlim_hard, mlim_soft, mused; - - alloc = metaslab_class_get_alloc(spa_normal_class(spa)); - alloc += metaslab_class_get_alloc(spa_special_class(spa)); - alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); - - mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, - zfs_scan_mem_lim_min); - mlim_hard = MIN(mlim_hard, alloc / 20); - mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, - zfs_scan_mem_lim_soft_max); - mused = 0; - for (uint64_t i = 0; i < rvd->vdev_children; i++) { - vdev_t *tvd = rvd->vdev_child[i]; - dsl_scan_io_queue_t *queue; - - mutex_enter(&tvd->vdev_scan_io_queue_lock); - queue = tvd->vdev_scan_io_queue; - if (queue != NULL) { - /* #extents in exts_by_size = # in exts_by_addr */ - mused += avl_numnodes(&queue->q_exts_by_size) * - sizeof (range_seg_t) + - avl_numnodes(&queue->q_sios_by_addr) * - sizeof (scan_io_t); - } - mutex_exit(&tvd->vdev_scan_io_queue_lock); - } - - dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); - - if (mused == 0) - ASSERT0(scn->scn_bytes_pending); - - /* - * If we are above our hard limit, we need to clear out memory. - * If we are below our soft limit, we need to accumulate sequential IOs. - * Otherwise, we should keep doing whatever we are currently doing. 
- */ - if (mused >= mlim_hard) - return (B_TRUE); - else if (mused < mlim_soft) - return (B_FALSE); - else - return (scn->scn_clearing); -} - -static boolean_t -dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) -{ - /* we never skip user/group accounting objects */ - if (zb && (int64_t)zb->zb_object < 0) - return (B_FALSE); - - if (scn->scn_suspending) - return (B_TRUE); /* we're already suspending */ - - if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) - return (B_FALSE); /* we're resuming */ - - /* We only know how to resume from level-0 blocks. */ - if (zb && zb->zb_level != 0) - return (B_FALSE); - - /* - * We suspend if: - * - we have scanned for at least the minimum time (default 1 sec - * for scrub, 3 sec for resilver), and either we have sufficient - * dirty data that we are starting to write more quickly - * (default 30%), or someone is explicitly waiting for this txg - * to complete. - * or - * - the spa is shutting down because this pool is being exported - * or the machine is rebooting. - * or - * - the scan queue has reached its memory use limit - */ - uint64_t elapsed_nanosecs = gethrtime(); - uint64_t curr_time_ns = gethrtime(); - uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; - uint64_t sync_time_ns = curr_time_ns - - scn->scn_dp->dp_spa->spa_sync_starttime; - - int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? - zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; - - if ((NSEC2MSEC(scan_time_ns) > mintime && - (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || - txg_sync_waiting(scn->scn_dp) || - NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || - spa_shutting_down(scn->scn_dp->dp_spa) || - (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { - if (zb) { - dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - scn->scn_phys.scn_bookmark = *zb; - } else { - dsl_scan_phys_t *scnp = &scn->scn_phys; - - dprintf("suspending at at DDT bookmark " - "%llx/%llx/%llx/%llx\n", - (longlong_t)scnp->scn_ddt_bookmark.ddb_class, - (longlong_t)scnp->scn_ddt_bookmark.ddb_type, - (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, - (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); - } - scn->scn_suspending = B_TRUE; - return (B_TRUE); - } - return (B_FALSE); -} - -typedef struct zil_scan_arg { - dsl_pool_t *zsa_dp; - zil_header_t *zsa_zh; -} zil_scan_arg_t; - -/* ARGSUSED */ -static int -dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - zil_scan_arg_t *zsa = arg; - dsl_pool_t *dp = zsa->zsa_dp; - dsl_scan_t *scn = dp->dp_scan; - zil_header_t *zh = zsa->zsa_zh; - zbookmark_phys_t zb; - - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) - return (0); - - /* - * One block ("stubby") can be allocated a long time ago; we - * want to visit that one because it has been allocated - * (on-disk) even if it hasn't been claimed (even though for - * scrub there's nothing to do to it). 
- */ - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - - VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); - return (0); -} - -/* ARGSUSED */ -static int -dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - if (lrc->lrc_txtype == TX_WRITE) { - zil_scan_arg_t *zsa = arg; - dsl_pool_t *dp = zsa->zsa_dp; - dsl_scan_t *scn = dp->dp_scan; - zil_header_t *zh = zsa->zsa_zh; - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - zbookmark_phys_t zb; - - if (BP_IS_HOLE(bp) || - bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) - return (0); - - /* - * birth can be < claim_txg if this record's txg is - * already txg sync'ed (but this log block contains - * other records that are not synced) - */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - lr->lr_foid, ZB_ZIL_LEVEL, - lr->lr_offset / BP_GET_LSIZE(bp)); - - VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); - } - return (0); -} - -static void -dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) -{ - uint64_t claim_txg = zh->zh_claim_txg; - zil_scan_arg_t zsa = { dp, zh }; - zilog_t *zilog; - - ASSERT(spa_writeable(dp->dp_spa)); - - /* - * We only want to visit blocks that have been claimed - * but not yet replayed. - */ - if (claim_txg == 0) - return; - - zilog = zil_alloc(dp->dp_meta_objset, zh); - - (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, - claim_txg); - - zil_free(zilog); -} - -/* - * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea - * here is to sort the AVL tree by the order each block will be needed. - */ -static int -scan_prefetch_queue_compare(const void *a, const void *b) -{ - const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; - const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; - const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; - - return (zbookmark_compare(spc_a->spc_datablkszsec, - spc_a->spc_indblkshift, spc_b->spc_datablkszsec, - spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); -} - -static void -scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) -{ - if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { - zfs_refcount_destroy(&spc->spc_refcnt); - kmem_free(spc, sizeof (scan_prefetch_ctx_t)); - } -} - -static scan_prefetch_ctx_t * -scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) -{ - scan_prefetch_ctx_t *spc; - - spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); - zfs_refcount_create(&spc->spc_refcnt); - zfs_refcount_add(&spc->spc_refcnt, tag); - spc->spc_scn = scn; - if (dnp != NULL) { - spc->spc_datablkszsec = dnp->dn_datablkszsec; - spc->spc_indblkshift = dnp->dn_indblkshift; - spc->spc_root = B_FALSE; - } else { - spc->spc_datablkszsec = 0; - spc->spc_indblkshift = 0; - spc->spc_root = B_TRUE; - } - - return (spc); -} - -static void -scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) -{ - zfs_refcount_add(&spc->spc_refcnt, tag); -} - -static boolean_t -dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, - const zbookmark_phys_t *zb) -{ - zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; - dnode_phys_t tmp_dnp; - dnode_phys_t *dnp = (spc->spc_root) ? 
NULL : &tmp_dnp; - - if (zb->zb_objset != last_zb->zb_objset) - return (B_TRUE); - if ((int64_t)zb->zb_object < 0) - return (B_FALSE); - - tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; - tmp_dnp.dn_indblkshift = spc->spc_indblkshift; - - if (zbookmark_subtree_completed(dnp, zb, last_zb)) - return (B_TRUE); - - return (B_FALSE); -} - -static void -dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) -{ - avl_index_t idx; - dsl_scan_t *scn = spc->spc_scn; - spa_t *spa = scn->scn_dp->dp_spa; - scan_prefetch_issue_ctx_t *spic; - - if (zfs_no_scrub_prefetch) - return; - - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && - BP_GET_TYPE(bp) != DMU_OT_OBJSET)) - return; - - if (dsl_scan_check_prefetch_resume(spc, zb)) - return; - - scan_prefetch_ctx_add_ref(spc, scn); - spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); - spic->spic_spc = spc; - spic->spic_bp = *bp; - spic->spic_zb = *zb; - - /* - * Add the IO to the queue of blocks to prefetch. This allows us to - * prioritize blocks that we will need first for the main traversal - * thread. - */ - mutex_enter(&spa->spa_scrub_lock); - if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { - /* this block is already queued for prefetch */ - kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); - scan_prefetch_ctx_rele(spc, scn); - mutex_exit(&spa->spa_scrub_lock); - return; - } - - avl_insert(&scn->scn_prefetch_queue, spic, idx); - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); -} - -static void -dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, - uint64_t objset, uint64_t object) -{ - int i; - zbookmark_phys_t zb; - scan_prefetch_ctx_t *spc; - - if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) - return; - - SET_BOOKMARK(&zb, objset, object, 0, 0); - - spc = scan_prefetch_ctx_create(scn, dnp, FTAG); - - for (i = 0; i < dnp->dn_nblkptr; i++) { - zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); - zb.zb_blkid = i; - dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); - } - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - zb.zb_level = 0; - zb.zb_blkid = DMU_SPILL_BLKID; - dsl_scan_prefetch(spc, &dnp->dn_spill, &zb); - } - - scan_prefetch_ctx_rele(spc, FTAG); -} - -void -dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, - arc_buf_t *buf, void *private) -{ - scan_prefetch_ctx_t *spc = private; - dsl_scan_t *scn = spc->spc_scn; - spa_t *spa = scn->scn_dp->dp_spa; - - /* broadcast that the IO has completed for rate limitting purposes */ - mutex_enter(&spa->spa_scrub_lock); - ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); - spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); - - /* if there was an error or we are done prefetching, just cleanup */ - if (buf == NULL || scn->scn_suspending) - goto out; - - if (BP_GET_LEVEL(bp) > 0) { - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - zbookmark_phys_t czb; - - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, zb->zb_blkid * epb + i); - dsl_scan_prefetch(spc, cbp, &czb); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - dnode_phys_t *cdnp = buf->b_data; - int i; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - - for (i = 0, cdnp = buf->b_data; i < epb; - i += cdnp->dn_extra_slots + 1, - cdnp += 
cdnp->dn_extra_slots + 1) { - dsl_scan_prefetch_dnode(scn, cdnp, - zb->zb_objset, zb->zb_blkid * epb + i); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - objset_phys_t *osp = buf->b_data; - - dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, - zb->zb_objset, DMU_META_DNODE_OBJECT); - - if (OBJSET_BUF_HAS_USERUSED(buf)) { - dsl_scan_prefetch_dnode(scn, - &osp->os_groupused_dnode, zb->zb_objset, - DMU_GROUPUSED_OBJECT); - dsl_scan_prefetch_dnode(scn, - &osp->os_userused_dnode, zb->zb_objset, - DMU_USERUSED_OBJECT); - } - } - -out: - if (buf != NULL) - arc_buf_destroy(buf, private); - scan_prefetch_ctx_rele(spc, scn); -} - -/* ARGSUSED */ -static void -dsl_scan_prefetch_thread(void *arg) -{ - dsl_scan_t *scn = arg; - spa_t *spa = scn->scn_dp->dp_spa; - vdev_t *rvd = spa->spa_root_vdev; - scan_prefetch_issue_ctx_t *spic; - - /* loop until we are told to stop */ - while (!scn->scn_prefetch_stop) { - arc_flags_t flags = ARC_FLAG_NOWAIT | - ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; - int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; - - mutex_enter(&spa->spa_scrub_lock); - - /* - * Wait until we have an IO to issue and are not above our - * maximum in flight limit. - */ - while (!scn->scn_prefetch_stop && - (avl_numnodes(&scn->scn_prefetch_queue) == 0 || - spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - } - - /* recheck if we should stop since we waited for the cv */ - if (scn->scn_prefetch_stop) { - mutex_exit(&spa->spa_scrub_lock); - break; - } - - /* remove the prefetch IO from the tree */ - spic = avl_first(&scn->scn_prefetch_queue); - spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); - avl_remove(&scn->scn_prefetch_queue, spic); - - mutex_exit(&spa->spa_scrub_lock); - - /* issue the prefetch asynchronously */ - (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, - &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); - - kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); - } - - ASSERT(scn->scn_prefetch_stop); - - /* free any prefetches we didn't get to complete */ - mutex_enter(&spa->spa_scrub_lock); - while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { - avl_remove(&scn->scn_prefetch_queue, spic); - scan_prefetch_ctx_rele(spic->spic_spc, scn); - kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); - } - ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); - mutex_exit(&spa->spa_scrub_lock); -} - -static boolean_t -dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, - const zbookmark_phys_t *zb) -{ - /* - * We never skip over user/group accounting objects (obj<0) - */ - if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && - (int64_t)zb->zb_object >= 0) { - /* - * If we already visited this bp & everything below (in - * a prior txg sync), don't bother doing it again. - */ - if (zbookmark_subtree_completed(dnp, zb, - &scn->scn_phys.scn_bookmark)) - return (B_TRUE); - - /* - * If we found the block we're trying to resume from, or - * we went past it to a different object, zero it out to - * indicate that it's OK to start checking for suspending - * again. 
- */ - if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || - zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { - dprintf("resuming at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); - } - } - return (B_FALSE); -} - -static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, - dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, - dmu_objset_type_t ostype, dmu_tx_t *tx); -static void dsl_scan_visitdnode( - dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, - dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); - -/* - * Return nonzero on i/o error. - * Return new buf to write out in *bufp. - */ -static int -dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, - dnode_phys_t *dnp, const blkptr_t *bp, - const zbookmark_phys_t *zb, dmu_tx_t *tx) -{ - dsl_pool_t *dp = scn->scn_dp; - int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; - int err; - - if (BP_GET_LEVEL(bp) > 0) { - arc_flags_t flags = ARC_FLAG_WAIT; - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - arc_buf_t *buf; - - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); - if (err) { - scn->scn_phys.scn_errors++; - return (err); - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - zbookmark_phys_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - dsl_scan_visitbp(cbp, &czb, dnp, - ds, scn, ostype, tx); - } - arc_buf_destroy(buf, &buf); - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - arc_flags_t flags = ARC_FLAG_WAIT; - dnode_phys_t *cdnp; - int i; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - arc_buf_t *buf; - - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); - if (err) { - scn->scn_phys.scn_errors++; - return (err); - } - for (i = 0, cdnp = buf->b_data; i < epb; - i += cdnp->dn_extra_slots + 1, - cdnp += cdnp->dn_extra_slots + 1) { - dsl_scan_visitdnode(scn, ds, ostype, - cdnp, zb->zb_blkid * epb + i, tx); - } - - arc_buf_destroy(buf, &buf); - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - arc_flags_t flags = ARC_FLAG_WAIT; - objset_phys_t *osp; - arc_buf_t *buf; - - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); - if (err) { - scn->scn_phys.scn_errors++; - return (err); - } - - osp = buf->b_data; - - dsl_scan_visitdnode(scn, ds, osp->os_type, - &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); - - if (OBJSET_BUF_HAS_USERUSED(buf)) { - /* - * We also always visit user/group accounting - * objects, and never skip them, even if we are - * suspending. This is necessary so that the space - * deltas from this txg get integrated. - */ - dsl_scan_visitdnode(scn, ds, osp->os_type, - &osp->os_groupused_dnode, - DMU_GROUPUSED_OBJECT, tx); - dsl_scan_visitdnode(scn, ds, osp->os_type, - &osp->os_userused_dnode, - DMU_USERUSED_OBJECT, tx); - } - arc_buf_destroy(buf, &buf); - } - - return (0); -} - -static void -dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, - dmu_objset_type_t ostype, dnode_phys_t *dnp, - uint64_t object, dmu_tx_t *tx) -{ - int j; - - for (j = 0; j < dnp->dn_nblkptr; j++) { - zbookmark_phys_t czb; - - SET_BOOKMARK(&czb, ds ? 
ds->ds_object : 0, object, - dnp->dn_nlevels - 1, j); - dsl_scan_visitbp(&dnp->dn_blkptr[j], - &czb, dnp, ds, scn, ostype, tx); - } - - if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - zbookmark_phys_t czb; - SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, - 0, DMU_SPILL_BLKID); - dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), - &czb, dnp, ds, scn, ostype, tx); - } -} - -/* - * The arguments are in this order because mdb can only print the - * first 5; we want them to be useful. - */ -static void -dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, - dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, - dmu_objset_type_t ostype, dmu_tx_t *tx) -{ - dsl_pool_t *dp = scn->scn_dp; - blkptr_t *bp_toread = NULL; - - if (dsl_scan_check_suspend(scn, zb)) - return; - - if (dsl_scan_check_resume(scn, dnp, zb)) - return; - - scn->scn_visited_this_txg++; - - dprintf_bp(bp, - "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", - ds, ds ? ds->ds_object : 0, - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, - bp); - - if (BP_IS_HOLE(bp)) { - scn->scn_holes_this_txg++; - return; - } - - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { - scn->scn_lt_min_this_txg++; - return; - } - - bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - *bp_toread = *bp; - - if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) - goto out; - - /* - * If dsl_scan_ddt() has already visited this block, it will have - * already done any translations or scrubbing, so don't call the - * callback again. - */ - if (ddt_class_contains(dp->dp_spa, - scn->scn_phys.scn_ddt_class_max, bp)) { - scn->scn_ddt_contained_this_txg++; - goto out; - } - - /* - * If this block is from the future (after cur_max_txg), then we - * are doing this on behalf of a deleted snapshot, and we will - * revisit the future block on the next pass of this dataset. - * Don't scan it now unless we need to because something - * under it was modified. - */ - if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { - scn->scn_gt_max_this_txg++; - goto out; - } - - scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); -out: - kmem_free(bp_toread, sizeof (blkptr_t)); -} - -static void -dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, - dmu_tx_t *tx) -{ - zbookmark_phys_t zb; - scan_prefetch_ctx_t *spc; - - SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - - if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { - SET_BOOKMARK(&scn->scn_prefetch_bookmark, - zb.zb_objset, 0, 0, 0); - } else { - scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; - } - - scn->scn_objsets_visited_this_txg++; - - spc = scan_prefetch_ctx_create(scn, NULL, FTAG); - dsl_scan_prefetch(spc, bp, &zb); - scan_prefetch_ctx_rele(spc, FTAG); - - dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); - - dprintf_ds(ds, "finished scan%s", ""); -} - -static void -ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) -{ - if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { - if (ds->ds_is_snapshot) { - /* - * Note: - * - scn_cur_{min,max}_txg stays the same. - * - Setting the flag is not really necessary if - * scn_cur_max_txg == scn_max_txg, because there - * is nothing after this snapshot that we care - * about. However, we set it anyway and then - * ignore it when we retraverse it in - * dsl_scan_visitds(). 
- */ - scn_phys->scn_bookmark.zb_objset = - dsl_dataset_phys(ds)->ds_next_snap_obj; - zfs_dbgmsg("destroying ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)-> - ds_next_snap_obj); - scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; - } else { - SET_BOOKMARK(&scn_phys->scn_bookmark, - ZB_DESTROYED_OBJSET, 0, 0, 0); - zfs_dbgmsg("destroying ds %llu; currently traversing; " - "reset bookmark to -1,0,0,0", - (u_longlong_t)ds->ds_object); - } - } -} - -/* - * Invoked when a dataset is destroyed. We need to make sure that: - * - * 1) If it is the dataset that was currently being scanned, we write - * a new dsl_scan_phys_t and marking the objset reference in it - * as destroyed. - * 2) Remove it from the work queue, if it was present. - * - * If the dataset was actually a snapshot, instead of marking the dataset - * as destroyed, we instead substitute the next snapshot in line. - */ -void -dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (!dsl_scan_is_running(scn)) - return; - - ds_destroyed_scn_phys(ds, &scn->scn_phys); - ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); - - if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { - scan_ds_queue_remove(scn, ds->ds_object); - if (ds->ds_is_snapshot) - scan_ds_queue_insert(scn, - dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); - } - - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_object, &mintxg) == 0) { - ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); - if (ds->ds_is_snapshot) { - /* - * We keep the same mintxg; it could be > - * ds_creation_txg if the previous snapshot was - * deleted too. - */ - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, - dsl_dataset_phys(ds)->ds_next_snap_obj, - mintxg, tx) == 0); - zfs_dbgmsg("destroying ds %llu; in queue; " - "replacing with %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)-> - ds_next_snap_obj); - } else { - zfs_dbgmsg("destroying ds %llu; in queue; removing", - (u_longlong_t)ds->ds_object); - } - } - - /* - * dsl_scan_sync() should be called after this, and should sync - * out our changed state, but just to be safe, do it here. - */ - dsl_scan_sync_state(scn, tx, SYNC_CACHED); -} - -static void -ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) -{ - if (scn_bookmark->zb_objset == ds->ds_object) { - scn_bookmark->zb_objset = - dsl_dataset_phys(ds)->ds_prev_snap_obj; - zfs_dbgmsg("snapshotting ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); - } -} - -/* - * Called when a dataset is snapshotted. If we were currently traversing - * this snapshot, we reset our bookmark to point at the newly created - * snapshot. We also modify our work queue to remove the old snapshot and - * replace with the new one. 
- */ -void -dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (!dsl_scan_is_running(scn)) - return; - - ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); - - ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); - ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); - - if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { - scan_ds_queue_remove(scn, ds->ds_object); - scan_ds_queue_insert(scn, - dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); - } - - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_object, &mintxg) == 0) { - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, - dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); - zfs_dbgmsg("snapshotting ds %llu; in queue; " - "replacing with %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); - } - - dsl_scan_sync_state(scn, tx, SYNC_CACHED); -} - -static void -ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, - zbookmark_phys_t *scn_bookmark) -{ - if (scn_bookmark->zb_objset == ds1->ds_object) { - scn_bookmark->zb_objset = ds2->ds_object; - zfs_dbgmsg("clone_swap ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds1->ds_object, - (u_longlong_t)ds2->ds_object); - } else if (scn_bookmark->zb_objset == ds2->ds_object) { - scn_bookmark->zb_objset = ds1->ds_object; - zfs_dbgmsg("clone_swap ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds2->ds_object, - (u_longlong_t)ds1->ds_object); - } -} - -/* - * Called when an origin dataset and its clone are swapped. If we were - * currently traversing the dataset, we need to switch to traversing the - * newly promoted clone. - */ -void -dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds1->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg1, mintxg2; - boolean_t ds1_queued, ds2_queued; - - if (!dsl_scan_is_running(scn)) - return; - - ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); - ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); - - /* - * Handle the in-memory scan queue. - */ - ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); - ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); - - /* Sanity checking. */ - if (ds1_queued) { - ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - } - if (ds2_queued) { - ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - } - - if (ds1_queued && ds2_queued) { - /* - * If both are queued, we don't need to do anything. - * The swapping code below would not handle this case correctly, - * since we can't insert ds2 if it is already there. That's - * because scan_ds_queue_insert() prohibits a duplicate insert - * and panics. - */ - } else if (ds1_queued) { - scan_ds_queue_remove(scn, ds1->ds_object); - scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); - } else if (ds2_queued) { - scan_ds_queue_remove(scn, ds2->ds_object); - scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); - } - - /* - * Handle the on-disk scan queue. 
- * The on-disk state is an out-of-date version of the in-memory state, - * so the in-memory and on-disk values for ds1_queued and ds2_queued may - * be different. Therefore we need to apply the swap logic to the - * on-disk state independently of the in-memory state. - */ - ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; - ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; - - /* Sanity checking. */ - if (ds1_queued) { - ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - } - if (ds2_queued) { - ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - } - - if (ds1_queued && ds2_queued) { - /* - * If both are queued, we don't need to do anything. - * Alternatively, we could check for EEXIST from - * zap_add_int_key() and back out to the original state, but - * that would be more work than checking for this case upfront. - */ - } else if (ds1_queued) { - VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); - VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); - zfs_dbgmsg("clone_swap ds %llu; in queue; " - "replacing with %llu", - (u_longlong_t)ds1->ds_object, - (u_longlong_t)ds2->ds_object); - } else if (ds2_queued) { - VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); - VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); - zfs_dbgmsg("clone_swap ds %llu; in queue; " - "replacing with %llu", - (u_longlong_t)ds2->ds_object, - (u_longlong_t)ds1->ds_object); - } - - dsl_scan_sync_state(scn, tx, SYNC_CACHED); -} - -/* ARGSUSED */ -static int -enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) -{ - uint64_t originobj = *(uint64_t *)arg; - dsl_dataset_t *ds; - int err; - dsl_scan_t *scn = dp->dp_scan; - - if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) - return (0); - - err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) - return (err); - - while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); - - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - ds = prev; - } - scan_ds_queue_insert(scn, ds->ds_object, - dsl_dataset_phys(ds)->ds_prev_snap_txg); - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) -{ - dsl_pool_t *dp = scn->scn_dp; - dsl_dataset_t *ds; - - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - - if (scn->scn_phys.scn_cur_min_txg >= - scn->scn_phys.scn_max_txg) { - /* - * This can happen if this snapshot was created after the - * scan started, and we already completed a previous snapshot - * that was created after the scan started. This snapshot - * only references blocks with: - * - * birth < our ds_creation_txg - * cur_min_txg is no less than ds_creation_txg. - * We have already visited these blocks. - * or - * birth > scn_max_txg - * The scan requested not to visit these blocks. - * - * Subsequent snapshots (and clones) can reference our - * blocks, or blocks with even higher birth times. 
- * Therefore we do not need to visit them either, - * so we do not add them to the work queue. - * - * Note that checking for cur_min_txg >= cur_max_txg - * is not sufficient, because in that case we may need to - * visit subsequent snapshots. This happens when min_txg > 0, - * which raises cur_min_txg. In this case we will visit - * this dataset but skip all of its blocks, because the - * rootbp's birth time is < cur_min_txg. Then we will - * add the next snapshots/clones to the work queue. - */ - char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - dsl_dataset_name(ds, dsname); - zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " - "cur_min_txg (%llu) >= max_txg (%llu)", - (longlong_t)dsobj, dsname, - (longlong_t)scn->scn_phys.scn_cur_min_txg, - (longlong_t)scn->scn_phys.scn_max_txg); - kmem_free(dsname, MAXNAMELEN); - - goto out; - } - - /* - * Only the ZIL in the head (non-snapshot) is valid. Even though - * snapshots can have ZIL block pointers (which may be the same - * BP as in the head), they must be ignored. In addition, $ORIGIN - * doesn't have a objset (i.e. its ds_bp is a hole) so we don't - * need to look for a ZIL in it either. So we traverse the ZIL here, - * rather than in scan_recurse(), because the regular snapshot - * block-sharing rules don't apply to it. - */ - if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) && - (dp->dp_origin_snap == NULL || - ds->ds_dir != dp->dp_origin_snap->ds_dir)) { - objset_t *os; - if (dmu_objset_from_ds(ds, &os) != 0) { - goto out; - } - dsl_scan_zil(dp, &os->os_zil_header); - } - - /* - * Iterate over the bps in this ds. - */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); - rrw_exit(&ds->ds_bp_rwlock, FTAG); - - char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); - dsl_dataset_name(ds, dsname); - zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " - "suspending=%u", - (longlong_t)dsobj, dsname, - (longlong_t)scn->scn_phys.scn_cur_min_txg, - (longlong_t)scn->scn_phys.scn_cur_max_txg, - (int)scn->scn_suspending); - kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); - - if (scn->scn_suspending) - goto out; - - /* - * We've finished this pass over this dataset. - */ - - /* - * If we did not completely visit this dataset, do another pass. - */ - if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { - zfs_dbgmsg("incomplete pass; visiting again"); - scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; - scan_ds_queue_insert(scn, ds->ds_object, - scn->scn_phys.scn_cur_max_txg); - goto out; - } - - /* - * Add descendent datasets to work queue. - */ - if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { - scan_ds_queue_insert(scn, - dsl_dataset_phys(ds)->ds_next_snap_obj, - dsl_dataset_phys(ds)->ds_creation_txg); - } - if (dsl_dataset_phys(ds)->ds_num_children > 1) { - boolean_t usenext = B_FALSE; - if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { - uint64_t count; - /* - * A bug in a previous version of the code could - * cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a - * missing entry. Therefore we can only use the - * next_clones_obj when its count is correct. 
- */ - int err = zap_count(dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj, &count); - if (err == 0 && - count == dsl_dataset_phys(ds)->ds_num_children - 1) - usenext = B_TRUE; - } - - if (usenext) { - zap_cursor_t zc; - zap_attribute_t za; - for (zap_cursor_init(&zc, dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj); - zap_cursor_retrieve(&zc, &za) == 0; - (void) zap_cursor_advance(&zc)) { - scan_ds_queue_insert(scn, - zfs_strtonum(za.za_name, NULL), - dsl_dataset_phys(ds)->ds_creation_txg); - } - zap_cursor_fini(&zc); - } else { - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_clones_cb, &ds->ds_object, - DS_FIND_CHILDREN)); - } - } - -out: - dsl_dataset_rele(ds, FTAG); -} - -/* ARGSUSED */ -static int -enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) -{ - dsl_dataset_t *ds; - int err; - dsl_scan_t *scn = dp->dp_scan; - - err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) - return (err); - - while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - /* - * If this is a clone, we don't need to worry about it for now. - */ - if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_rele(ds, FTAG); - dsl_dataset_rele(prev, FTAG); - return (0); - } - dsl_dataset_rele(ds, FTAG); - ds = prev; - } - - scan_ds_queue_insert(scn, ds->ds_object, - dsl_dataset_phys(ds)->ds_prev_snap_txg); - dsl_dataset_rele(ds, FTAG); - return (0); -} - -/* ARGSUSED */ -void -dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx) -{ - const ddt_key_t *ddk = &dde->dde_key; - ddt_phys_t *ddp = dde->dde_phys; - blkptr_t bp; - zbookmark_phys_t zb = { 0 }; - int p; - - if (!dsl_scan_is_running(scn)) - return; - - /* - * This function is special because it is the only thing - * that can add scan_io_t's to the vdev scan queues from - * outside dsl_scan_sync(). For the most part this is ok - * as long as it is called from within syncing context. - * However, dsl_scan_sync() expects that no new sio's will - * be added between when all the work for a scan is done - * and the next txg when the scan is actually marked as - * completed. This check ensures we do not issue new sio's - * during this period. - */ - if (scn->scn_done_txg != 0) - return; - - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) - continue; - ddt_bp_create(checksum, ddk, ddp, &bp); - - scn->scn_visited_this_txg++; - scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); - } -} - -/* - * Scrub/dedup interaction. - * - * If there are N references to a deduped block, we don't want to scrub it - * N times -- ideally, we should scrub it exactly once. - * - * We leverage the fact that the dde's replication class (enum ddt_class) - * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest - * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. - * - * To prevent excess scrubbing, the scrub begins by walking the DDT - * to find all blocks with refcnt > 1, and scrubs each of these once. - * Since there are two replication classes which contain blocks with - * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. - * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. 
- * - * There would be nothing more to say if a block's refcnt couldn't change - * during a scrub, but of course it can so we must account for changes - * in a block's replication class. - * - * Here's an example of what can occur: - * - * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 - * when visited during the top-down scrub phase, it will be scrubbed twice. - * This negates our scrub optimization, but is otherwise harmless. - * - * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 - * on each visit during the top-down scrub phase, it will never be scrubbed. - * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's - * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to - * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 - * while a scrub is in progress, it scrubs the block right then. - */ -static void -dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) -{ - ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; - ddt_entry_t dde = { 0 }; - int error; - uint64_t n = 0; - - while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { - ddt_t *ddt; - - if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) - break; - dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", - (longlong_t)ddb->ddb_class, - (longlong_t)ddb->ddb_type, - (longlong_t)ddb->ddb_checksum, - (longlong_t)ddb->ddb_cursor); - - /* There should be no pending changes to the dedup table */ - ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; - ASSERT(avl_first(&ddt->ddt_tree) == NULL); - - dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); - n++; - - if (dsl_scan_check_suspend(scn, NULL)) - break; - } - - zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; " - "suspending=%u", (longlong_t)n, - (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); - - ASSERT(error == 0 || error == ENOENT); - ASSERT(error != ENOENT || - ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); -} - -static uint64_t -dsl_scan_ds_maxtxg(dsl_dataset_t *ds) -{ - uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; - if (ds->ds_is_snapshot) - return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); - return (smt); -} - -static void -dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) -{ - scan_ds_t *sds; - dsl_pool_t *dp = scn->scn_dp; - - if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= - scn->scn_phys.scn_ddt_class_max) { - scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; - scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; - dsl_scan_ddt(scn, tx); - if (scn->scn_suspending) - return; - } - - if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { - /* First do the MOS & ORIGIN */ - - scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; - scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; - dsl_scan_visit_rootbp(scn, NULL, - &dp->dp_meta_rootbp, tx); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); - if (scn->scn_suspending) - return; - - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_cb, NULL, DS_FIND_CHILDREN)); - } else { - dsl_scan_visitds(scn, - dp->dp_origin_snap->ds_object, tx); - } - ASSERT(!scn->scn_suspending); - } else if (scn->scn_phys.scn_bookmark.zb_objset != - ZB_DESTROYED_OBJSET) { - uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset; - /* - * If we were suspended, continue from here. 
Note if the - * ds we were suspended on was deleted, the zb_objset may - * be -1, so we will skip this and find a new objset - * below. - */ - dsl_scan_visitds(scn, dsobj, tx); - if (scn->scn_suspending) - return; - } - - /* - * In case we suspended right at the end of the ds, zero the - * bookmark so we don't think that we're still trying to resume. - */ - bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); - - /* - * Keep pulling things out of the dataset avl queue. Updates to the - * persistent zap-object-as-queue happen only at checkpoints. - */ - while ((sds = avl_first(&scn->scn_queue)) != NULL) { - dsl_dataset_t *ds; - uint64_t dsobj = sds->sds_dsobj; - uint64_t txg = sds->sds_txg; - - /* dequeue and free the ds from the queue */ - scan_ds_queue_remove(scn, dsobj); - sds = NULL; /* must not be touched after removal */ - - /* Set up min / max txg */ - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - if (txg != 0) { - scn->scn_phys.scn_cur_min_txg = - MAX(scn->scn_phys.scn_min_txg, txg); - } else { - scn->scn_phys.scn_cur_min_txg = - MAX(scn->scn_phys.scn_min_txg, - dsl_dataset_phys(ds)->ds_prev_snap_txg); - } - scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); - dsl_dataset_rele(ds, FTAG); - - dsl_scan_visitds(scn, dsobj, tx); - if (scn->scn_suspending) - return; - } - /* No more objsets to fetch, we're done */ - scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; - ASSERT0(scn->scn_suspending); -} - -static uint64_t -dsl_scan_count_leaves(vdev_t *vd) -{ - uint64_t i, leaves = 0; - - /* we only count leaves that belong to the main pool and are readable */ - if (vd->vdev_islog || vd->vdev_isspare || - vd->vdev_isl2cache || !vdev_readable(vd)) - return (0); - - if (vd->vdev_ops->vdev_op_leaf) - return (1); - - for (i = 0; i < vd->vdev_children; i++) { - leaves += dsl_scan_count_leaves(vd->vdev_child[i]); - } - - return (leaves); -} - - -static void -scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) -{ - int i; - uint64_t cur_size = 0; - - for (i = 0; i < BP_GET_NDVAS(bp); i++) { - cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); - } - - q->q_total_zio_size_this_txg += cur_size; - q->q_zios_this_txg++; -} - -static void -scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, - uint64_t end) -{ - q->q_total_seg_size_this_txg += end - start; - q->q_segs_this_txg++; -} - -static boolean_t -scan_io_queue_check_suspend(dsl_scan_t *scn) -{ - /* See comment in dsl_scan_check_suspend() */ - uint64_t curr_time_ns = gethrtime(); - uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; - uint64_t sync_time_ns = curr_time_ns - - scn->scn_dp->dp_spa->spa_sync_starttime; - int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? - zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; - - return ((NSEC2MSEC(scan_time_ns) > mintime && - (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || - txg_sync_waiting(scn->scn_dp) || - NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || - spa_shutting_down(scn->scn_dp->dp_spa)); -} - -/* - * Given a list of scan_io_t's in io_list, this issues the io's out to - * disk. This consumes the io_list and frees the scan_io_t's. This is - * called when emptying queues, either when we're up against the memory - * limit or when we have finished scanning. Returns B_TRUE if we stopped - * processing the list before we finished. Any zios that were not issued - * will remain in the io_list. 
- */ -static boolean_t -scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) -{ - dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio; - int64_t bytes_issued = 0; - boolean_t suspended = B_FALSE; - - while ((sio = list_head(io_list)) != NULL) { - blkptr_t bp; - - if (scan_io_queue_check_suspend(scn)) { - suspended = B_TRUE; - break; - } - - sio2bp(sio, &bp, queue->q_vd->vdev_id); - bytes_issued += sio->sio_asize; - scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, - &sio->sio_zb, queue); - (void) list_remove_head(io_list); - scan_io_queues_update_zio_stats(queue, &bp); - kmem_free(sio, sizeof (*sio)); - } - - atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); - - return (suspended); -} - -/* - * Given a range_seg_t (extent) and a list, this function passes over a - * scan queue and gathers up the appropriate ios which fit into that - * scan seg (starting from lowest LBA). At the end, we remove the segment - * from the q_exts_by_addr range tree. - */ -static boolean_t -scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) -{ - scan_io_t srch_sio, *sio, *next_sio; - avl_index_t idx; - uint_t num_sios = 0; - int64_t bytes_issued = 0; - - ASSERT(rs != NULL); - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - - srch_sio.sio_offset = rs->rs_start; - - /* - * The exact start of the extent might not contain any matching zios, - * so if that's the case, examine the next one in the tree. - */ - sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); - if (sio == NULL) - sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - - while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { - ASSERT3U(sio->sio_offset, >=, rs->rs_start); - ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); - - next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); - avl_remove(&queue->q_sios_by_addr, sio); - - bytes_issued += sio->sio_asize; - num_sios++; - list_insert_tail(list, sio); - sio = next_sio; - } - - /* - * We limit the number of sios we process at once to 32 to avoid - * biting off more than we can chew. If we didn't take everything - * in the segment we update it to reflect the work we were able to - * complete. Otherwise, we remove it from the range tree entirely. - */ - if (sio != NULL && sio->sio_offset < rs->rs_end) { - range_tree_adjust_fill(queue->q_exts_by_addr, rs, - -bytes_issued); - range_tree_resize_segment(queue->q_exts_by_addr, rs, - sio->sio_offset, rs->rs_end - sio->sio_offset); - - return (B_TRUE); - } else { - range_tree_remove(queue->q_exts_by_addr, rs->rs_start, - rs->rs_end - rs->rs_start); - return (B_FALSE); - } -} - - -/* - * This is called from the queue emptying thread and selects the next - * extent from which we are to issue io's. The behavior of this function - * depends on the state of the scan, the current memory consumption and - * whether or not we are performing a scan shutdown. - * 1) We select extents in an elevator algorithm (LBA-order) if the scan - * needs to perform a checkpoint - * 2) We select the largest available extent if we are up against the - * memory limit. - * 3) Otherwise we don't select any extents. 
- */ -static const range_seg_t * -scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) -{ - dsl_scan_t *scn = queue->q_scn; - - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - ASSERT(scn->scn_is_sorted); - - /* handle tunable overrides */ - if (scn->scn_checkpointing || scn->scn_clearing) { - if (zfs_scan_issue_strategy == 1) { - return (range_tree_first(queue->q_exts_by_addr)); - } else if (zfs_scan_issue_strategy == 2) { - return (avl_first(&queue->q_exts_by_size)); - } - } - - /* - * During normal clearing, we want to issue our largest segments - * first, keeping IO as sequential as possible, and leaving the - * smaller extents for later with the hope that they might eventually - * grow to larger sequential segments. However, when the scan is - * checkpointing, no new extents will be added to the sorting queue, - * so the way we are sorted now is as good as it will ever get. - * In this case, we instead switch to issuing extents in LBA order. - */ - if (scn->scn_checkpointing) { - return (range_tree_first(queue->q_exts_by_addr)); - } else if (scn->scn_clearing) { - return (avl_first(&queue->q_exts_by_size)); - } else { - return (NULL); - } -} - -static void -scan_io_queues_run_one(void *arg) -{ - dsl_scan_io_queue_t *queue = arg; - kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; - boolean_t suspended = B_FALSE; - range_seg_t *rs = NULL; - scan_io_t *sio = NULL; - list_t sio_list; - uint64_t bytes_per_leaf = zfs_scan_vdev_limit; - uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); - - ASSERT(queue->q_scn->scn_is_sorted); - - list_create(&sio_list, sizeof (scan_io_t), - offsetof(scan_io_t, sio_nodes.sio_list_node)); - mutex_enter(q_lock); - - /* calculate maximum in-flight bytes for this txg (min 1MB) */ - queue->q_maxinflight_bytes = - MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); - - /* reset per-queue scan statistics for this txg */ - queue->q_total_seg_size_this_txg = 0; - queue->q_segs_this_txg = 0; - queue->q_total_zio_size_this_txg = 0; - queue->q_zios_this_txg = 0; - - /* loop until we have run out of time or sios */ - while ((rs = (range_seg_t*)scan_io_queue_fetch_ext(queue)) != NULL) { - uint64_t seg_start = 0, seg_end = 0; - boolean_t more_left = B_TRUE; - - ASSERT(list_is_empty(&sio_list)); - - /* loop while we still have sios left to process in this rs */ - while (more_left) { - scan_io_t *first_sio, *last_sio; - - /* - * We have selected which extent needs to be - * processed next. Gather up the corresponding sios. - */ - more_left = scan_io_queue_gather(queue, rs, &sio_list); - ASSERT(!list_is_empty(&sio_list)); - first_sio = list_head(&sio_list); - last_sio = list_tail(&sio_list); - - seg_end = last_sio->sio_offset + last_sio->sio_asize; - if (seg_start == 0) - seg_start = first_sio->sio_offset; - - /* - * Issuing sios can take a long time so drop the - * queue lock. The sio queue won't be updated by - * other threads since we're in syncing context so - * we can be sure that our trees will remain exactly - * as we left them. - */ - mutex_exit(q_lock); - suspended = scan_io_queue_issue(queue, &sio_list); - mutex_enter(q_lock); - - if (suspended) - break; - } - /* update statistics for debugging purposes */ - scan_io_queues_update_seg_stats(queue, seg_start, seg_end); - - if (suspended) - break; - } - - - /* If we were suspended in the middle of processing, - * requeue any unfinished sios and exit. 
- */ - while ((sio = list_head(&sio_list)) != NULL) { - list_remove(&sio_list, sio); - scan_io_queue_insert_impl(queue, sio); - } - - mutex_exit(q_lock); - list_destroy(&sio_list); -} - -/* - * Performs an emptying run on all scan queues in the pool. This just - * punches out one thread per top-level vdev, each of which processes - * only that vdev's scan queue. We can parallelize the I/O here because - * we know that each queue's io's only affect its own top-level vdev. - * - * This function waits for the queue runs to complete, and must be - * called from dsl_scan_sync (or in general, syncing context). - */ -static void -scan_io_queues_run(dsl_scan_t *scn) -{ - spa_t *spa = scn->scn_dp->dp_spa; - - ASSERT(scn->scn_is_sorted); - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - - if (scn->scn_bytes_pending == 0) - return; - - if (scn->scn_taskq == NULL) { - char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16, - KM_SLEEP); - int nthreads = spa->spa_root_vdev->vdev_children; - - /* - * We need to make this taskq *always* execute as many - * threads in parallel as we have top-level vdevs and no - * less, otherwise strange serialization of the calls to - * scan_io_queues_run_one can occur during spa_sync runs - * and that significantly impacts performance. - */ - (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16, - "dsl_scan_tq_%s", spa->spa_name); - scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri, - nthreads, nthreads, TASKQ_PREPOPULATE); - kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16); - } - - for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; - - mutex_enter(&vd->vdev_scan_io_queue_lock); - if (vd->vdev_scan_io_queue != NULL) { - VERIFY(taskq_dispatch(scn->scn_taskq, - scan_io_queues_run_one, vd->vdev_scan_io_queue, - TQ_SLEEP) != TASKQID_INVALID); - } - mutex_exit(&vd->vdev_scan_io_queue_lock); - } - - /* - * Wait for the queues to finish issuing thir IOs for this run - * before we return. There may still be IOs in flight at this - * point. 
- */ - taskq_wait(scn->scn_taskq); -} - -static boolean_t -dsl_scan_async_block_should_pause(dsl_scan_t *scn) -{ - uint64_t elapsed_nanosecs; - - if (zfs_recover) - return (B_FALSE); - - if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks) - return (B_TRUE); - - elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; - return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || - (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && - txg_sync_waiting(scn->scn_dp)) || - spa_shutting_down(scn->scn_dp->dp_spa)); -} - -static int -dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_scan_t *scn = arg; - - if (!scn->scn_is_bptree || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { - if (dsl_scan_async_block_should_pause(scn)) - return (SET_ERROR(ERESTART)); - } - - zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, - dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); - dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, - -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), - -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); - scn->scn_visited_this_txg++; - return (0); -} - -static void -dsl_scan_update_stats(dsl_scan_t *scn) -{ - spa_t *spa = scn->scn_dp->dp_spa; - uint64_t i; - uint64_t seg_size_total = 0, zio_size_total = 0; - uint64_t seg_count_total = 0, zio_count_total = 0; - - for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; - dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; - - if (queue == NULL) - continue; - - seg_size_total += queue->q_total_seg_size_this_txg; - zio_size_total += queue->q_total_zio_size_this_txg; - seg_count_total += queue->q_segs_this_txg; - zio_count_total += queue->q_zios_this_txg; - } - - if (seg_count_total == 0 || zio_count_total == 0) { - scn->scn_avg_seg_size_this_txg = 0; - scn->scn_avg_zio_size_this_txg = 0; - scn->scn_segs_this_txg = 0; - scn->scn_zios_this_txg = 0; - return; - } - - scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; - scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; - scn->scn_segs_this_txg = seg_count_total; - scn->scn_zios_this_txg = zio_count_total; -} - -static int -dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - dsl_scan_t *scn = arg; - const dva_t *dva = &bp->blk_dva[0]; - - if (dsl_scan_async_block_should_pause(scn)) - return (SET_ERROR(ERESTART)); - - spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, - DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), - DVA_GET_ASIZE(dva), tx); - scn->scn_visited_this_txg++; - return (0); -} - -boolean_t -dsl_scan_active(dsl_scan_t *scn) -{ - spa_t *spa = scn->scn_dp->dp_spa; - uint64_t used = 0, comp, uncomp; - - if (spa->spa_load_state != SPA_LOAD_NONE) - return (B_FALSE); - if (spa_shutting_down(spa)) - return (B_FALSE); - if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || - (scn->scn_async_destroying && !scn->scn_async_stalled)) - return (B_TRUE); - - if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { - (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, - &used, &comp, &uncomp); - } - return (used != 0); -} - -static boolean_t -dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, - uint64_t phys_birth) -{ - vdev_t *vd; - - vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); - - if (vd->vdev_ops == &vdev_indirect_ops) { - /* - * The indirect vdev can point to multiple - * vdevs. For simplicity, always create - * the resilver zio_t. 
zio_vdev_io_start() - * will bypass the child resilver i/o's if - * they are on vdevs that don't have DTL's. - */ - return (B_TRUE); - } - - if (DVA_GET_GANG(dva)) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. - */ - return (B_TRUE); - } - - /* - * Check if the txg falls within the range which must be - * resilvered. DVAs outside this range can always be skipped. - */ - if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) - return (B_FALSE); - - /* - * Check if the top-level vdev must resilver this offset. - * When the offset does not intersect with a dirty leaf DTL - * then it may be possible to skip the resilver IO. The psize - * is provided instead of asize to simplify the check for RAIDZ. - */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) - return (B_FALSE); - - return (B_TRUE); -} - -static int -dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) -{ - int err = 0; - dsl_scan_t *scn = dp->dp_scan; - spa_t *spa = dp->dp_spa; - - if (spa_suspend_async_destroy(spa)) - return (0); - - if (zfs_free_bpobj_enabled && - spa_version(spa) >= SPA_VERSION_DEADLISTS) { - scn->scn_is_bptree = B_FALSE; - scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; - scn->scn_zio_root = zio_root(spa, NULL, - NULL, ZIO_FLAG_MUSTSUCCEED); - err = bpobj_iterate(&dp->dp_free_bpobj, - dsl_scan_free_block_cb, scn, tx); - VERIFY0(zio_wait(scn->scn_zio_root)); - scn->scn_zio_root = NULL; - - if (err != 0 && err != ERESTART) - zfs_panic_recover("error %u from bpobj_iterate()", err); - } - - if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { - ASSERT(scn->scn_async_destroying); - scn->scn_is_bptree = B_TRUE; - scn->scn_zio_root = zio_root(spa, NULL, - NULL, ZIO_FLAG_MUSTSUCCEED); - err = bptree_iterate(dp->dp_meta_objset, - dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); - VERIFY0(zio_wait(scn->scn_zio_root)); - scn->scn_zio_root = NULL; - - if (err == EIO || err == ECKSUM) { - err = 0; - } else if (err != 0 && err != ERESTART) { - zfs_panic_recover("error %u from " - "traverse_dataset_destroyed()", err); - } - - if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { - /* finished; deactivate async destroy feature */ - spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); - ASSERT(!spa_feature_is_active(spa, - SPA_FEATURE_ASYNC_DESTROY)); - VERIFY0(zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, tx)); - VERIFY0(bptree_free(dp->dp_meta_objset, - dp->dp_bptree_obj, tx)); - dp->dp_bptree_obj = 0; - scn->scn_async_destroying = B_FALSE; - scn->scn_async_stalled = B_FALSE; - } else { - /* - * If we didn't make progress, mark the async - * destroy as stalled, so that we will not initiate - * a spa_sync() on its behalf. Note that we only - * check this if we are not finished, because if the - * bptree had no blocks for us to visit, we can - * finish without "making progress". 
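The free and obsolete callbacks driven by dsl_process_async_destroys() above follow a simple suspend-and-resume contract: once dsl_scan_async_block_should_pause() fires, the callback returns ERESTART, the iteration stops for this txg, and it resumes from the saved position on the next sync. Below is a minimal stand-alone sketch of that contract, with a wall-clock budget standing in for the txg-timeout and sync-waiting checks; the item count and budget are invented for illustration.

#include <errno.h>
#include <stdio.h>
#include <time.h>

#ifndef ERESTART
#define ERESTART (-1)	/* kernel-private on FreeBSD; any sentinel works here */
#endif

static long long
now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((long long)ts.tv_sec * 1000000000LL + ts.tv_nsec);
}

/*
 * Process items starting at *cursor; stop with ERESTART when the time
 * budget is exhausted, leaving *cursor positioned for the next pass.
 */
static int
process_some(int *cursor, int nitems, long long budget_ns)
{
	long long start = now_ns();

	while (*cursor < nitems) {
		(*cursor)++;		/* stand-in for freeing one block */
		if (now_ns() - start > budget_ns)
			return (ERESTART);
	}
	return (0);
}

int
main(void)
{
	int cursor = 0, pass = 0, err;

	/* Each "txg" gets a 1ms budget; resume until the work is done. */
	do {
		err = process_some(&cursor, 5000000, 1000000LL);
		printf("pass %d: cursor=%d err=%d\n", ++pass, cursor, err);
	} while (err == ERESTART);
	return (0);
}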
- */ - scn->scn_async_stalled = - (scn->scn_visited_this_txg == 0); - } - } - if (scn->scn_visited_this_txg) { - zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj/bptree txg %llu; err=%d", - (longlong_t)scn->scn_visited_this_txg, - (longlong_t) - NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), - (longlong_t)tx->tx_txg, err); - scn->scn_visited_this_txg = 0; - - /* - * Write out changes to the DDT that may be required as a - * result of the blocks freed. This ensures that the DDT - * is clean when a scrub/resilver runs. - */ - ddt_sync(spa, tx->tx_txg); - } - if (err != 0) - return (err); - if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && - zfs_free_leak_on_eio && - (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || - dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || - dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { - /* - * We have finished background destroying, but there is still - * some space left in the dp_free_dir. Transfer this leaked - * space to the dp_leak_dir. - */ - if (dp->dp_leak_dir == NULL) { - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, - LEAK_DIR_NAME, tx); - VERIFY0(dsl_pool_open_special_dir(dp, - LEAK_DIR_NAME, &dp->dp_leak_dir)); - rrw_exit(&dp->dp_config_rwlock, FTAG); - } - dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, - dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, - dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, - dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, - -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, - -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); - } - - if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { - /* finished; verify that space accounting went to zero */ - ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); - ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); - ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); - } - - EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), - 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_OBSOLETE_BPOBJ)); - if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { - ASSERT(spa_feature_is_active(dp->dp_spa, - SPA_FEATURE_OBSOLETE_COUNTS)); - - scn->scn_is_bptree = B_FALSE; - scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; - err = bpobj_iterate(&dp->dp_obsolete_bpobj, - dsl_scan_obsolete_block_cb, scn, tx); - if (err != 0 && err != ERESTART) - zfs_panic_recover("error %u from bpobj_iterate()", err); - - if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) - dsl_pool_destroy_obsolete_bpobj(dp, tx); - } - - return (0); -} - -/* - * This is the primary entry point for scans that is called from syncing - * context. Scans must happen entirely during syncing context so that we - * cna guarantee that blocks we are currently scanning will not change out - * from under us. While a scan is active, this funciton controls how quickly - * transaction groups proceed, instead of the normal handling provided by - * txg_sync_thread(). - */ -void -dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) -{ - dsl_scan_t *scn = dp->dp_scan; - spa_t *spa = dp->dp_spa; - int err = 0; - state_sync_type_t sync_type = SYNC_OPTIONAL; - - /* - * Check for scn_restart_txg before checking spa_load_state, so - * that we can restart an old-style scan while the pool is being - * imported (see dsl_scan_init). 
- */ - if (dsl_scan_restarting(scn, tx)) { - pool_scan_func_t func = POOL_SCAN_SCRUB; - dsl_scan_done(scn, B_FALSE, tx); - if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - func = POOL_SCAN_RESILVER; - zfs_dbgmsg("restarting scan func=%u txg=%llu", - func, (longlong_t)tx->tx_txg); - dsl_scan_setup_sync(&func, tx); - } - - /* - * Only process scans in sync pass 1. - */ - if (spa_sync_pass(dp->dp_spa) > 1) - return; - - /* - * If the spa is shutting down, then stop scanning. This will - * ensure that the scan does not dirty any new data during the - * shutdown phase. - */ - if (spa_shutting_down(spa)) - return; - - /* - * If the scan is inactive due to a stalled async destroy, try again. - */ - if (!scn->scn_async_stalled && !dsl_scan_active(scn)) - return; - - /* reset scan statistics */ - scn->scn_visited_this_txg = 0; - scn->scn_holes_this_txg = 0; - scn->scn_lt_min_this_txg = 0; - scn->scn_gt_max_this_txg = 0; - scn->scn_ddt_contained_this_txg = 0; - scn->scn_objsets_visited_this_txg = 0; - scn->scn_avg_seg_size_this_txg = 0; - scn->scn_segs_this_txg = 0; - scn->scn_avg_zio_size_this_txg = 0; - scn->scn_zios_this_txg = 0; - scn->scn_suspending = B_FALSE; - scn->scn_sync_start_time = gethrtime(); - spa->spa_scrub_active = B_TRUE; - - /* - * First process the async destroys. If we pause, don't do - * any scrubbing or resilvering. This ensures that there are no - * async destroys while we are scanning, so the scan code doesn't - * have to worry about traversing it. It is also faster to free the - * blocks than to scrub them. - */ - err = dsl_process_async_destroys(dp, tx); - if (err != 0) - return; - - if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) - return; - - /* - * Wait a few txgs after importing to begin scanning so that - * we can get the pool imported quickly. - */ - if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) - return; - - /* - * It is possible to switch from unsorted to sorted at any time, - * but afterwards the scan will remain sorted unless reloaded from - * a checkpoint after a reboot. - */ - if (!zfs_scan_legacy) { - scn->scn_is_sorted = B_TRUE; - if (scn->scn_last_checkpoint == 0) - scn->scn_last_checkpoint = ddi_get_lbolt(); - } - - /* - * For sorted scans, determine what kind of work we will be doing - * this txg based on our memory limitations and whether or not we - * need to perform a checkpoint. - */ - if (scn->scn_is_sorted) { - /* - * If we are over our checkpoint interval, set scn_clearing - * so that we can begin checkpointing immediately. The - * checkpoint allows us to save a consisent bookmark - * representing how much data we have scrubbed so far. - * Otherwise, use the memory limit to determine if we should - * scan for metadata or start issue scrub IOs. We accumulate - * metadata until we hit our hard memory limit at which point - * we issue scrub IOs until we are at our soft memory limit. 
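The clearing decision described above is essentially a hysteresis loop: metadata is accumulated until memory use crosses a hard limit, then queued I/O is issued until usage drops back below a soft limit. A hedged stand-alone sketch of just that control logic follows; the limit values and the memory-usage trace are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define HARD_LIMIT 80	/* begin issuing ("clearing") above this usage */
#define SOFT_LIMIT 40	/* stop issuing once usage falls below this */

/* Mirrors the should_clear/scn_clearing hysteresis, nothing more. */
static bool
update_clearing(bool clearing, int mem_used)
{
	if (!clearing && mem_used >= HARD_LIMIT)
		return (true);	/* hit hard limit: start issuing scrub I/O */
	if (clearing && mem_used <= SOFT_LIMIT)
		return (false);	/* drained to soft limit: resume scanning */
	return (clearing);
}

int
main(void)
{
	/*
	 * Synthetic memory-usage trace: ramps up while scanning metadata,
	 * drains while issuing, then ramps up again.
	 */
	int trace[] = { 10, 30, 60, 85, 70, 50, 35, 20, 55, 90, 65, 30 };
	bool clearing = false;

	for (unsigned i = 0; i < sizeof (trace) / sizeof (trace[0]); i++) {
		clearing = update_clearing(clearing, trace[i]);
		printf("mem=%2d%% -> %s\n", trace[i],
		    clearing ? "issue queued I/O" : "scan metadata");
	}
	return (0);
}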
- */ - if (scn->scn_checkpointing || - ddi_get_lbolt() - scn->scn_last_checkpoint > - SEC_TO_TICK(zfs_scan_checkpoint_intval)) { - if (!scn->scn_checkpointing) - zfs_dbgmsg("begin scan checkpoint"); - - scn->scn_checkpointing = B_TRUE; - scn->scn_clearing = B_TRUE; - } else { - boolean_t should_clear = dsl_scan_should_clear(scn); - if (should_clear && !scn->scn_clearing) { - zfs_dbgmsg("begin scan clearing"); - scn->scn_clearing = B_TRUE; - } else if (!should_clear && scn->scn_clearing) { - zfs_dbgmsg("finish scan clearing"); - scn->scn_clearing = B_FALSE; - } - } - } else { - ASSERT0(scn->scn_checkpointing); - ASSERT0(scn->scn_clearing); - } - - if (!scn->scn_clearing && scn->scn_done_txg == 0) { - /* Need to scan metadata for more blocks to scrub */ - dsl_scan_phys_t *scnp = &scn->scn_phys; - taskqid_t prefetch_tqid; - uint64_t bytes_per_leaf = zfs_scan_vdev_limit; - uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev); - - /* - * Recalculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. - */ - scn->scn_maxinflight_bytes = - MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); - - if (scnp->scn_ddt_bookmark.ddb_class <= - scnp->scn_ddt_class_max) { - ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); - zfs_dbgmsg("doing scan sync txg %llu; " - "ddt bm=%llu/%llu/%llu/%llx", - (longlong_t)tx->tx_txg, - (longlong_t)scnp->scn_ddt_bookmark.ddb_class, - (longlong_t)scnp->scn_ddt_bookmark.ddb_type, - (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, - (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); - } else { - zfs_dbgmsg("doing scan sync txg %llu; " - "bm=%llu/%llu/%llu/%llu", - (longlong_t)tx->tx_txg, - (longlong_t)scnp->scn_bookmark.zb_objset, - (longlong_t)scnp->scn_bookmark.zb_object, - (longlong_t)scnp->scn_bookmark.zb_level, - (longlong_t)scnp->scn_bookmark.zb_blkid); - } - - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_CANFAIL); - - scn->scn_prefetch_stop = B_FALSE; - prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, - dsl_scan_prefetch_thread, scn, TQ_SLEEP); - ASSERT(prefetch_tqid != TASKQID_INVALID); - - dsl_pool_config_enter(dp, FTAG); - dsl_scan_visit(scn, tx); - dsl_pool_config_exit(dp, FTAG); - - mutex_enter(&dp->dp_spa->spa_scrub_lock); - scn->scn_prefetch_stop = B_TRUE; - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&dp->dp_spa->spa_scrub_lock); - - taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); - (void) zio_wait(scn->scn_zio_root); - scn->scn_zio_root = NULL; - - zfs_dbgmsg("scan visited %llu blocks in %llums " - "(%llu os's, %llu holes, %llu < mintxg, " - "%llu in ddt, %llu > maxtxg)", - (longlong_t)scn->scn_visited_this_txg, - (longlong_t)NSEC2MSEC(gethrtime() - - scn->scn_sync_start_time), - (longlong_t)scn->scn_objsets_visited_this_txg, - (longlong_t)scn->scn_holes_this_txg, - (longlong_t)scn->scn_lt_min_this_txg, - (longlong_t)scn->scn_ddt_contained_this_txg, - (longlong_t)scn->scn_gt_max_this_txg); - - if (!scn->scn_suspending) { - ASSERT0(avl_numnodes(&scn->scn_queue)); - scn->scn_done_txg = tx->tx_txg + 1; - if (scn->scn_is_sorted) { - scn->scn_checkpointing = B_TRUE; - scn->scn_clearing = B_TRUE; - } - zfs_dbgmsg("scan complete txg %llu", - (longlong_t)tx->tx_txg); - } - } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { - ASSERT(scn->scn_clearing); - - /* need to issue scrubbing IOs from per-vdev queues */ - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_CANFAIL); - scan_io_queues_run(scn); - 
(void) zio_wait(scn->scn_zio_root); - scn->scn_zio_root = NULL; - - /* calculate and dprintf the current memory usage */ - (void) dsl_scan_should_clear(scn); - dsl_scan_update_stats(scn); - - zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums " - "(avg_block_size = %llu, avg_seg_size = %llu)", - (longlong_t)scn->scn_zios_this_txg, - (longlong_t)scn->scn_segs_this_txg, - (longlong_t)NSEC2MSEC(gethrtime() - - scn->scn_sync_start_time), - (longlong_t)scn->scn_avg_zio_size_this_txg, - (longlong_t)scn->scn_avg_seg_size_this_txg); - } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { - /* Finished with everything. Mark the scrub as complete */ - zfs_dbgmsg("scan issuing complete txg %llu", - (longlong_t)tx->tx_txg); - ASSERT3U(scn->scn_done_txg, !=, 0); - ASSERT0(spa->spa_scrub_inflight); - ASSERT0(scn->scn_bytes_pending); - dsl_scan_done(scn, B_TRUE, tx); - sync_type = SYNC_MANDATORY; - } - - dsl_scan_sync_state(scn, tx, sync_type); -} - -static void -count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) -{ - int i; - - /* update the spa's stats on how many bytes we have issued */ - for (i = 0; i < BP_GET_NDVAS(bp); i++) { - atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[i])); - } - - /* - * If we resume after a reboot, zab will be NULL; don't record - * incomplete stats in that case. - */ - if (zab == NULL) - return; - - mutex_enter(&zab->zab_lock); - - for (i = 0; i < 4; i++) { - int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; - int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; - if (t & DMU_OT_NEWTYPE) - t = DMU_OT_OTHER; - zfs_blkstat_t *zb = &zab->zab_type[l][t]; - int equal; - - zb->zb_count++; - zb->zb_asize += BP_GET_ASIZE(bp); - zb->zb_lsize += BP_GET_LSIZE(bp); - zb->zb_psize += BP_GET_PSIZE(bp); - zb->zb_gangs += BP_COUNT_GANG(bp); - - switch (BP_GET_NDVAS(bp)) { - case 2: - if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) - zb->zb_ditto_2_of_2_samevdev++; - break; - case 3: - equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) + - (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[2])) + - (DVA_GET_VDEV(&bp->blk_dva[1]) == - DVA_GET_VDEV(&bp->blk_dva[2])); - if (equal == 1) - zb->zb_ditto_2_of_3_samevdev++; - else if (equal == 3) - zb->zb_ditto_3_of_3_samevdev++; - break; - } - } - - mutex_exit(&zab->zab_lock); -} - -static void -scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) -{ - avl_index_t idx; - int64_t asize = sio->sio_asize; - dsl_scan_t *scn = queue->q_scn; - - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - - if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { - /* block is already scheduled for reading */ - atomic_add_64(&scn->scn_bytes_pending, -asize); - kmem_free(sio, sizeof (*sio)); - return; - } - avl_insert(&queue->q_sios_by_addr, sio, idx); - range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); -} - -/* - * Given all the info we got from our metadata scanning process, we - * construct a scan_io_t and insert it into the scan sorting queue. The - * I/O must already be suitable for us to process. This is controlled - * by dsl_scan_enqueue(). 
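count_block() above classifies multi-DVA ("ditto") blocks by how many copies landed on the same vdev. The sketch below replays just that switch over plain integer vdev ids so the 2-of-2 / 2-of-3 / 3-of-3 bookkeeping is easy to verify; it is an illustration only and does not use the real blkptr or DVA types.

#include <stdio.h>

struct ditto_stats {
	int two_of_two;		/* both copies of a 2-DVA block on one vdev */
	int two_of_three;	/* exactly two of three copies share a vdev */
	int three_of_three;	/* all three copies on one vdev */
};

static void
classify(struct ditto_stats *st, const int *vd, int ndvas)
{
	switch (ndvas) {
	case 2:
		if (vd[0] == vd[1])
			st->two_of_two++;
		break;
	case 3: {
		/* Same pairwise-equality count as in count_block() above. */
		int equal = (vd[0] == vd[1]) + (vd[0] == vd[2]) +
		    (vd[1] == vd[2]);
		if (equal == 1)
			st->two_of_three++;
		else if (equal == 3)
			st->three_of_three++;
		break;
	}
	}
}

int
main(void)
{
	struct ditto_stats st = { 0 };
	int a[] = { 1, 1 };		/* 2 copies, same vdev   -> 2-of-2 */
	int b[] = { 1, 2, 2 };		/* two of three share    -> 2-of-3 */
	int c[] = { 3, 3, 3 };		/* all three share       -> 3-of-3 */
	int d[] = { 1, 2, 3 };		/* fully spread          -> none   */

	classify(&st, a, 2);
	classify(&st, b, 3);
	classify(&st, c, 3);
	classify(&st, d, 3);

	printf("2/2=%d 2/3=%d 3/3=%d\n",
	    st.two_of_two, st.two_of_three, st.three_of_three);
	return (0);
}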
- */ -static void -scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, - int zio_flags, const zbookmark_phys_t *zb) -{ - dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP); - - ASSERT0(BP_IS_GANG(bp)); - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - - bp2sio(bp, sio, dva_i); - sio->sio_flags = zio_flags; - sio->sio_zb = *zb; - - /* - * Increment the bytes pending counter now so that we can't - * get an integer underflow in case the worker processes the - * zio before we get to incrementing this counter. - */ - atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); - - scan_io_queue_insert_impl(queue, sio); -} - -/* - * Given a set of I/O parameters as discovered by the metadata traversal - * process, attempts to place the I/O into the sorted queues (if allowed), - * or immediately executes the I/O. - */ -static void -dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, - const zbookmark_phys_t *zb) -{ - spa_t *spa = dp->dp_spa; - - ASSERT(!BP_IS_EMBEDDED(bp)); - - /* - * Gang blocks are hard to issue sequentially, so we just issue them - * here immediately instead of queuing them. - */ - if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { - scan_exec_io(dp, bp, zio_flags, zb, NULL); - return; - } - for (int i = 0; i < BP_GET_NDVAS(bp); i++) { - dva_t dva; - vdev_t *vdev; - - dva = bp->blk_dva[i]; - vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); - ASSERT(vdev != NULL); - - mutex_enter(&vdev->vdev_scan_io_queue_lock); - if (vdev->vdev_scan_io_queue == NULL) - vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); - ASSERT(dp->dp_scan != NULL); - scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, - i, zio_flags, zb); - mutex_exit(&vdev->vdev_scan_io_queue_lock); - } -} - -static int -dsl_scan_scrub_cb(dsl_pool_t *dp, - const blkptr_t *bp, const zbookmark_phys_t *zb) -{ - dsl_scan_t *scn = dp->dp_scan; - spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); - size_t psize = BP_GET_PSIZE(bp); - boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - int d; - - if (phys_birth <= scn->scn_phys.scn_min_txg || - phys_birth >= scn->scn_phys.scn_max_txg) { - count_block(scn, dp->dp_blkstats, bp); - return (0); - } - - /* Embedded BP's have phys_birth==0, so we reject them above. */ - ASSERT(!BP_IS_EMBEDDED(bp)); - - ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); - if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { - zio_flags |= ZIO_FLAG_SCRUB; - needs_io = B_TRUE; - } else { - ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); - zio_flags |= ZIO_FLAG_RESILVER; - needs_io = B_FALSE; - } - - /* If it's an intent log block, failure is expected. */ - if (zb->zb_level == ZB_ZIL_LEVEL) - zio_flags |= ZIO_FLAG_SPECULATIVE; - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - const dva_t *dva = &bp->blk_dva[d]; - - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. 
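dsl_scan_enqueue() above makes one routing decision: if the scan is not sorting, or the block is a gang block, the I/O is issued immediately; otherwise one entry per DVA is queued on the owning top-level vdev. The compact sketch below shows only that routing; the block struct and the queue/exec callbacks are invented stand-ins.

#include <stdbool.h>
#include <stdio.h>

struct fake_bp {
	bool is_gang;
	int  ndvas;
	int  vdev_of_dva[3];	/* top-level vdev owning each copy */
};

static void
exec_now(const struct fake_bp *bp)
{
	printf("issue immediately (gang=%d)\n", bp->is_gang);
}

static void
queue_on_vdev(int vdev, int dva_i)
{
	printf("queue DVA %d on top-level vdev %d\n", dva_i, vdev);
}

/* Same routing decision as dsl_scan_enqueue(), minus locks and queues. */
static void
enqueue(const struct fake_bp *bp, bool scan_is_sorted)
{
	if (!scan_is_sorted || bp->is_gang) {
		exec_now(bp);
		return;
	}
	for (int i = 0; i < bp->ndvas; i++)
		queue_on_vdev(bp->vdev_of_dva[i], i);
}

int
main(void)
{
	struct fake_bp plain = { .is_gang = false, .ndvas = 2,
	    .vdev_of_dva = { 0, 1 } };
	struct fake_bp gang = { .is_gang = true, .ndvas = 1,
	    .vdev_of_dva = { 0 } };

	enqueue(&plain, true);	/* sorted scan: fan out per DVA */
	enqueue(&gang, true);	/* gang blocks bypass the queues */
	enqueue(&plain, false);	/* legacy (unsorted) scan issues directly */
	return (0);
}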
- */ - scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); - spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); - - /* if it's a resilver, this may not be in the target range */ - if (!needs_io) - needs_io = dsl_scan_need_resilver(spa, dva, psize, - phys_birth); - } - - if (needs_io && !zfs_no_scrub_io) { - dsl_scan_enqueue(dp, bp, zio_flags, zb); - } else { - count_block(scn, dp->dp_blkstats, bp); - } - - /* do not relocate this block */ - return (0); -} - -static void -dsl_scan_scrub_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - dsl_scan_io_queue_t *queue = zio->io_private; - - abd_free(zio->io_abd); - - if (queue == NULL) { - mutex_enter(&spa->spa_scrub_lock); - ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); - spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); - } else { - mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); - ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); - queue->q_inflight_bytes -= BP_GET_PSIZE(bp); - cv_broadcast(&queue->q_zio_cv); - mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); - } - - if (zio->io_error && (zio->io_error != ECKSUM || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); - } -} - -/* - * Given a scanning zio's information, executes the zio. The zio need - * not necessarily be only sortable, this function simply executes the - * zio, no matter what it is. The optional queue argument allows the - * caller to specify that they want per top level vdev IO rate limiting - * instead of the legacy global limiting. - */ -static void -scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, - const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue) -{ - spa_t *spa = dp->dp_spa; - dsl_scan_t *scn = dp->dp_scan; - size_t size = BP_GET_PSIZE(bp); - abd_t *data = abd_alloc_for_io(size, B_FALSE); - unsigned int scan_delay = 0; - - ASSERT3U(scn->scn_maxinflight_bytes, >, 0); - - if (queue == NULL) { - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight += BP_GET_PSIZE(bp); - mutex_exit(&spa->spa_scrub_lock); - } else { - kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; - - mutex_enter(q_lock); - while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) - cv_wait(&queue->q_zio_cv, q_lock); - queue->q_inflight_bytes += BP_GET_PSIZE(bp); - mutex_exit(q_lock); - } - - if (zio_flags & ZIO_FLAG_RESILVER) - scan_delay = zfs_resilver_delay; - else { - ASSERT(zio_flags & ZIO_FLAG_SCRUB); - scan_delay = zfs_scrub_delay; - } - - if (scan_delay && (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)) - delay(MAX((int)scan_delay, 0)); - - count_block(dp->dp_scan, dp->dp_blkstats, bp); - zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size, - dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); -} - -/* - * This is the primary extent sorting algorithm. We balance two parameters: - * 1) how many bytes of I/O are in an extent - * 2) how well the extent is filled with I/O (as a fraction of its total size) - * Since we allow extents to have gaps between their constituent I/Os, it's - * possible to have a fairly large extent that contains the same amount of - * I/O bytes than a much smaller extent, which just packs the I/O more tightly. 
- * The algorithm sorts based on a score calculated from the extent's size, - * the relative fill volume (in %) and a "fill weight" parameter that controls - * the split between whether we prefer larger extents or more well populated - * extents: - * - * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT) - * - * Example: - * 1) assume extsz = 64 MiB - * 2) assume fill = 32 MiB (extent is half full) - * 3) assume fill_weight = 3 - * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100 - * SCORE = 32M + (50 * 3 * 32M) / 100 - * SCORE = 32M + (4800M / 100) - * SCORE = 32M + 48M - * ^ ^ - * | +--- final total relative fill-based score - * +--------- final total fill-based score - * SCORE = 80M - * - * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards - * extents that are more completely filled (in a 3:2 ratio) vs just larger. - * Note that as an optimization, we replace multiplication and division by - * 100 with bitshifting by 7 (which effecitvely multiplies and divides by 128). - */ -static int -ext_size_compare(const void *x, const void *y) -{ - const range_seg_t *rsa = x, *rsb = y; - uint64_t sa = rsa->rs_end - rsa->rs_start, - sb = rsb->rs_end - rsb->rs_start; - uint64_t score_a, score_b; - - score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * - fill_weight * rsa->rs_fill) >> 7); - score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * - fill_weight * rsb->rs_fill) >> 7); - - if (score_a > score_b) - return (-1); - if (score_a == score_b) { - if (rsa->rs_start < rsb->rs_start) - return (-1); - if (rsa->rs_start == rsb->rs_start) - return (0); - return (1); - } - return (1); -} - -/* - * Comparator for the q_sios_by_addr tree. Sorting is simply performed - * based on LBA-order (from lowest to highest). - */ -static int -io_addr_compare(const void *x, const void *y) -{ - const scan_io_t *a = x, *b = y; - - if (a->sio_offset < b->sio_offset) - return (-1); - if (a->sio_offset == b->sio_offset) - return (0); - return (1); -} - -/* IO queues are created on demand when they are needed. */ -static dsl_scan_io_queue_t * -scan_io_queue_create(vdev_t *vd) -{ - dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; - dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); - - q->q_scn = scn; - q->q_vd = vd; - cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); - q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, - &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); - avl_create(&q->q_sios_by_addr, io_addr_compare, - sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); - - return (q); -} - -/* - * Destroys a scan queue and all segments and scan_io_t's contained in it. - * No further execution of I/O occurs, anything pending in the queue is - * simply freed without being executed. 
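The worked SCORE example above (64 MiB extent, 32 MiB of fill, fill_weight = 3) and the shift-by-7 shortcut used in ext_size_compare() can be checked numerically. The small program below computes both the percent-based score from the comment and the <<7/>>7 approximation from the comparator; with these exactly-half-full numbers both come out to 80 MiB. It is a verification sketch only.

#include <stdint.h>
#include <stdio.h>

#define MIB (1024ULL * 1024ULL)

int
main(void)
{
	uint64_t ext_size = 64 * MIB;	/* extent span on disk */
	uint64_t fill = 32 * MIB;	/* bytes of queued I/O inside it */
	uint64_t fill_weight = 3;

	/* Score as written in the comment: multiply/divide by 100. */
	uint64_t pct_score = fill +
	    (((fill * 100) / ext_size) * fill_weight * fill) / 100;

	/* Score as computed by ext_size_compare(): shift by 7 (~ x/128). */
	uint64_t shift_score = fill +
	    ((((fill << 7) / ext_size) * fill_weight * fill) >> 7);

	printf("percent-based score: %llu MiB\n",
	    (unsigned long long)(pct_score / MIB));
	printf("shift-by-7 score:    %llu MiB\n",
	    (unsigned long long)(shift_score / MIB));
	return (0);
}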
- */ -void -dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) -{ - dsl_scan_t *scn = queue->q_scn; - scan_io_t *sio; - void *cookie = NULL; - int64_t bytes_dequeued = 0; - - ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - - while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != - NULL) { - ASSERT(range_tree_contains(queue->q_exts_by_addr, - sio->sio_offset, sio->sio_asize)); - bytes_dequeued += sio->sio_asize; - kmem_free(sio, sizeof (*sio)); - } - - atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); - range_tree_vacate(queue->q_exts_by_addr, NULL, queue); - range_tree_destroy(queue->q_exts_by_addr); - avl_destroy(&queue->q_sios_by_addr); - cv_destroy(&queue->q_zio_cv); - - kmem_free(queue, sizeof (*queue)); -} - -/* - * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is - * called on behalf of vdev_top_transfer when creating or destroying - * a mirror vdev due to zpool attach/detach. - */ -void -dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) -{ - mutex_enter(&svd->vdev_scan_io_queue_lock); - mutex_enter(&tvd->vdev_scan_io_queue_lock); - - VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); - tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; - svd->vdev_scan_io_queue = NULL; - if (tvd->vdev_scan_io_queue != NULL) - tvd->vdev_scan_io_queue->q_vd = tvd; - - mutex_exit(&tvd->vdev_scan_io_queue_lock); - mutex_exit(&svd->vdev_scan_io_queue_lock); -} - -static void -scan_io_queues_destroy(dsl_scan_t *scn) -{ - vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - - for (uint64_t i = 0; i < rvd->vdev_children; i++) { - vdev_t *tvd = rvd->vdev_child[i]; - - mutex_enter(&tvd->vdev_scan_io_queue_lock); - if (tvd->vdev_scan_io_queue != NULL) - dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); - tvd->vdev_scan_io_queue = NULL; - mutex_exit(&tvd->vdev_scan_io_queue_lock); - } -} - -static void -dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) -{ - dsl_pool_t *dp = spa->spa_dsl_pool; - dsl_scan_t *scn = dp->dp_scan; - vdev_t *vdev; - kmutex_t *q_lock; - dsl_scan_io_queue_t *queue; - scan_io_t srch, *sio; - avl_index_t idx; - uint64_t start, size; - - vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); - ASSERT(vdev != NULL); - q_lock = &vdev->vdev_scan_io_queue_lock; - queue = vdev->vdev_scan_io_queue; - - mutex_enter(q_lock); - if (queue == NULL) { - mutex_exit(q_lock); - return; - } - - bp2sio(bp, &srch, dva_i); - start = srch.sio_offset; - size = srch.sio_asize; - - /* - * We can find the zio in two states: - * 1) Cold, just sitting in the queue of zio's to be issued at - * some point in the future. In this case, all we do is - * remove the zio from the q_sios_by_addr tree, decrement - * its data volume from the containing range_seg_t and - * resort the q_exts_by_size tree to reflect that the - * range_seg_t has lost some of its 'fill'. We don't shorten - * the range_seg_t - this is usually rare enough not to be - * worth the extra hassle of trying keep track of precise - * extent boundaries. - * 2) Hot, where the zio is currently in-flight in - * dsl_scan_issue_ios. In this case, we can't simply - * reach in and stop the in-flight zio's, so we instead - * block the caller. Eventually, dsl_scan_issue_ios will - * be done with issuing the zio's it gathered and will - * signal us. 
- */ - sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); - if (sio != NULL) { - int64_t asize = sio->sio_asize; - blkptr_t tmpbp; - - /* Got it while it was cold in the queue */ - ASSERT3U(start, ==, sio->sio_offset); - ASSERT3U(size, ==, asize); - avl_remove(&queue->q_sios_by_addr, sio); - - ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); - range_tree_remove_fill(queue->q_exts_by_addr, start, size); - - /* - * We only update scn_bytes_pending in the cold path, - * otherwise it will already have been accounted for as - * part of the zio's execution. - */ - atomic_add_64(&scn->scn_bytes_pending, -asize); - - /* count the block as though we issued it */ - sio2bp(sio, &tmpbp, dva_i); - count_block(scn, dp->dp_blkstats, &tmpbp); - - kmem_free(sio, sizeof (*sio)); - } - mutex_exit(q_lock); -} - -/* - * Callback invoked when a zio_free() zio is executing. This needs to be - * intercepted to prevent the zio from deallocating a particular portion - * of disk space and it then getting reallocated and written to, while we - * still have it queued up for processing. - */ -void -dsl_scan_freed(spa_t *spa, const blkptr_t *bp) -{ - dsl_pool_t *dp = spa->spa_dsl_pool; - dsl_scan_t *scn = dp->dp_scan; - - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(scn != NULL); - if (!dsl_scan_is_running(scn)) - return; - - for (int i = 0; i < BP_GET_NDVAS(bp); i++) - dsl_scan_freed_dva(spa, bp, i); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -#define DST_AVG_BLKSHIFT 14 - -/* ARGSUSED */ -static int -dsl_null_checkfunc(void *arg, dmu_tx_t *tx) -{ - return (0); -} - -static int -dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, boolean_t early) -{ - spa_t *spa; - dmu_tx_t *tx; - int err; - dsl_sync_task_t dst = { 0 }; - dsl_pool_t *dp; - - err = spa_open(pool, &spa, FTAG); - if (err != 0) - return (err); - dp = spa_get_dsl(spa); - -top: - tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - - dst.dst_pool = dp; - dst.dst_txg = dmu_tx_get_txg(tx); - dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT; - dst.dst_space_check = space_check; - dst.dst_checkfunc = checkfunc != NULL ? 
checkfunc : dsl_null_checkfunc; - dst.dst_syncfunc = syncfunc; - dst.dst_arg = arg; - dst.dst_error = 0; - dst.dst_nowaiter = B_FALSE; - - dsl_pool_config_enter(dp, FTAG); - err = dst.dst_checkfunc(arg, tx); - dsl_pool_config_exit(dp, FTAG); - - if (err != 0) { - dmu_tx_commit(tx); - spa_close(spa, FTAG); - return (err); - } - - txg_list_t *task_list = (early) ? - &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; - VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg)); - - dmu_tx_commit(tx); - - if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) { - /* current contract is to call func once */ - sigfunc(arg, tx); - sigfunc = NULL; /* in case of an EAGAIN retry */ - } - txg_wait_synced(dp, dst.dst_txg); - - if (dst.dst_error == EAGAIN) { - txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); - goto top; - } - - spa_close(spa, FTAG); - return (dst.dst_error); -} - -/* - * Called from open context to perform a callback in syncing context. Waits - * for the operation to complete. - * - * The checkfunc will be called from open context as a preliminary check - * which can quickly fail. If it succeeds, it will be called again from - * syncing context. The checkfunc should generally be designed to work - * properly in either context, but if necessary it can check - * dmu_tx_is_syncing(tx). - * - * The synctask infrastructure enforces proper locking strategy with respect - * to the dp_config_rwlock -- the lock will always be held when the callbacks - * are called. It will be held for read during the open-context (preliminary) - * call to the checkfunc, and then held for write from syncing context during - * the calls to the check and sync funcs. - * - * A dataset or pool name can be passed as the first argument. Typically, - * the check func will hold, check the return value of the hold, and then - * release the dataset. The sync func will VERIFYO(hold()) the dataset. - * This is safe because no changes can be made between the check and sync funcs, - * and the sync func will only be called if the check func successfully opened - * the dataset. - */ -int -dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check) -{ - return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, - blocks_modified, space_check, B_FALSE)); -} - -/* - * An early synctask works exactly as a standard synctask with one important - * difference on the way it is handled during syncing context. Standard - * synctasks run after we've written out all the dirty blocks of dirty - * datasets. Early synctasks are executed before writing out any dirty data, - * and thus before standard synctasks. - * - * For that reason, early synctasks can affect the process of writing dirty - * changes to disk for the txg that they run and should be used with caution. - * In addition, early synctasks should not dirty any metaslabs as this would - * invalidate the precodition/invariant for subsequent early synctasks. - * [see dsl_pool_sync() and dsl_early_sync_task_verify()] - */ -int -dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check) -{ - return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, - blocks_modified, space_check, B_TRUE)); -} - -/* - * A standard synctask that can be interrupted from a signal. The sigfunc - * is called once if a signal occurred while waiting for the task to sync. 
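The check-then-sync contract documented above (checkfunc runs once from open context as a cheap precheck, then check and sync run again under the config lock in syncing context) can be modeled with plain function pointers. The sketch below is a hedged illustration of that control flow only; the lock, transaction, and pool machinery are deliberately left out.

#include <errno.h>
#include <stdio.h>

typedef int	(*checkfunc_t)(void *arg);
typedef void	(*syncfunc_t)(void *arg);

/* Open-context precheck followed by the syncing-context check + sync. */
static int
run_sync_task(checkfunc_t check, syncfunc_t sync, void *arg)
{
	int err;

	/* 1. Preliminary check from "open context": may fail fast. */
	err = check(arg);
	if (err != 0)
		return (err);

	/*
	 * 2. Later, in "syncing context", the check runs again and only
	 *    then is the sync function allowed to modify state.
	 */
	err = check(arg);
	if (err == 0)
		sync(arg);
	return (err);
}

static int
check_positive(void *arg)
{
	return (*(int *)arg > 0 ? 0 : EINVAL);
}

static void
apply(void *arg)
{
	printf("synced value %d\n", *(int *)arg);
}

int
main(void)
{
	int good = 7, bad = -1;

	printf("good: err=%d\n", run_sync_task(check_positive, apply, &good));
	printf("bad:  err=%d\n", run_sync_task(check_positive, apply, &bad));
	return (0);
}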
- */ -int -dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check) -{ - return (dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg, - blocks_modified, space_check, B_FALSE)); -} - -static void -dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx, - boolean_t early) -{ - dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); - - dst->dst_pool = dp; - dst->dst_txg = dmu_tx_get_txg(tx); - dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; - dst->dst_space_check = space_check; - dst->dst_checkfunc = dsl_null_checkfunc; - dst->dst_syncfunc = syncfunc; - dst->dst_arg = arg; - dst->dst_error = 0; - dst->dst_nowaiter = B_TRUE; - - txg_list_t *task_list = (early) ? - &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; - VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg)); -} - -void -dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) -{ - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_FALSE); -} - -void -dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) -{ - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_TRUE); -} - -/* - * Called in syncing context to execute the synctask. - */ -void -dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dst->dst_pool; - - ASSERT0(dst->dst_error); - - /* - * Check for sufficient space. - * - * When the sync task was created, the caller specified the - * type of space checking required. See the comment in - * zfs_space_check_t for details on the semantics of each - * type of space checking. - * - * We just check against what's on-disk; we don't want any - * in-flight accounting to get in our way, because open context - * may have already used up various in-core limits - * (arc_tempreserve, dsl_pool_tempreserve). - */ - if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) { - uint64_t quota = dsl_pool_unreserved_space(dp, - dst->dst_space_check); - uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; - - /* MOS space is triple-dittoed, so we multiply by 3. */ - if (used + dst->dst_space * 3 > quota) { - dst->dst_error = SET_ERROR(ENOSPC); - if (dst->dst_nowaiter) - kmem_free(dst, sizeof (*dst)); - return; - } - } - - /* - * Check for errors by calling checkfunc. - */ - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx); - if (dst->dst_error == 0) - dst->dst_syncfunc(dst->dst_arg, tx); - rrw_exit(&dp->dp_config_rwlock, FTAG); - if (dst->dst_nowaiter) - kmem_free(dst, sizeof (*dst)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c +++ /dev/null @@ -1,667 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -typedef struct dsl_dataset_user_hold_arg { - nvlist_t *dduha_holds; - nvlist_t *dduha_chkholds; - nvlist_t *dduha_errlist; - minor_t dduha_minor; -} dsl_dataset_user_hold_arg_t; - -/* - * If you add new checks here, you may need to add additional checks to the - * "temporary" case in snapshot_check() in dmu_objset.c. - */ -int -dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, - boolean_t temphold, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - objset_t *mos = dp->dp_meta_objset; - int error = 0; - - ASSERT(dsl_pool_config_held(dp)); - - if (strlen(htag) > MAXNAMELEN) - return (SET_ERROR(E2BIG)); - /* Tempholds have a more restricted length */ - if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) - return (SET_ERROR(E2BIG)); - - /* tags must be unique (if ds already exists) */ - if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { - uint64_t value; - - error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, - htag, 8, 1, &value); - if (error == 0) - error = SET_ERROR(EEXIST); - else if (error == ENOENT) - error = 0; - } - - return (error); -} - -static int -dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_hold_arg_t *dduha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - - if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) - return (SET_ERROR(ENOTSUP)); - - if (!dmu_tx_is_syncing(tx)) - return (0); - - for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); - pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { - dsl_dataset_t *ds; - int error = 0; - char *htag, *name; - - /* must be a snapshot */ - name = nvpair_name(pair); - if (strchr(name, '@') == NULL) - error = SET_ERROR(EINVAL); - - if (error == 0) - error = nvpair_value_string(pair, &htag); - - if (error == 0) - error = dsl_dataset_hold(dp, name, FTAG, &ds); - - if (error == 0) { - error = dsl_dataset_user_hold_check_one(ds, htag, - dduha->dduha_minor != 0, tx); - dsl_dataset_rele(ds, FTAG); - } - - if (error == 0) { - fnvlist_add_string(dduha->dduha_chkholds, name, htag); - } else { - /* - * We register ENOENT errors so they can be correctly - * reported if needed, such as when all holds fail. 
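dsl_dataset_user_hold_check_one() above folds the tag-uniqueness lookup into one answer: a successful ZAP lookup means the tag already exists (EEXIST), ENOENT means the tag is free (success), and any other error is passed through; the length limits are checked first. Below is a stand-alone sketch of that mapping, with a simple string array standing in for the dataset's userrefs ZAP object and invented limit values.

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define MAXNAMELEN		256
#define MAX_TAG_PREFIX_LEN	17

static const char *existing_tags[] = { "backup", "replication" };

/* Stand-in for zap_lookup() on the dataset's userrefs object. */
static int
tag_lookup(const char *tag)
{
	for (unsigned i = 0;
	    i < sizeof (existing_tags) / sizeof (existing_tags[0]); i++)
		if (strcmp(existing_tags[i], tag) == 0)
			return (0);
	return (ENOENT);
}

/* Same checks and error mapping as dsl_dataset_user_hold_check_one(). */
static int
hold_check_one(const char *htag, int temphold)
{
	int error;

	if (strlen(htag) > MAXNAMELEN)
		return (E2BIG);
	/* Temporary holds leave room for the generated tag prefix. */
	if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
		return (E2BIG);

	error = tag_lookup(htag);
	if (error == 0)
		error = EEXIST;		/* tag already held on this snapshot */
	else if (error == ENOENT)
		error = 0;		/* tag is free: the hold may proceed */
	return (error);
}

int
main(void)
{
	printf("new tag:      %d\n", hold_check_one("mytag", 0));
	printf("existing tag: %d\n", hold_check_one("backup", 0));
	return (0);
}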
- */ - fnvlist_add_int32(dduha->dduha_errlist, name, error); - if (error != ENOENT) - return (error); - } - } - - return (0); -} - - -static void -dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, - const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj; - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) { - /* - * This is the first user hold for this dataset. Create - * the userrefs zap object. - */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj = - zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); - } else { - zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; - } - ds->ds_userrefs++; - - VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); - - if (minor != 0) { - char name[MAXNAMELEN]; - nvlist_t *tags; - - VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, - htag, now, tx)); - (void) snprintf(name, sizeof (name), "%llx", - (u_longlong_t)ds->ds_object); - - if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) { - tags = fnvlist_alloc(); - fnvlist_add_boolean(tags, htag); - fnvlist_add_nvlist(tmpholds, name, tags); - fnvlist_free(tags); - } else { - fnvlist_add_boolean(tags, htag); - } - } - - spa_history_log_internal_ds(ds, "hold", tx, - "tag=%s temp=%d refs=%llu", - htag, minor != 0, ds->ds_userrefs); -} - -typedef struct zfs_hold_cleanup_arg { - char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t zhca_spa_load_guid; - nvlist_t *zhca_holds; -} zfs_hold_cleanup_arg_t; - -static void -dsl_dataset_user_release_onexit(void *arg) -{ - zfs_hold_cleanup_arg_t *ca = arg; - spa_t *spa; - int error; - - error = spa_open(ca->zhca_spaname, &spa, FTAG); - if (error != 0) { - zfs_dbgmsg("couldn't release holds on pool=%s " - "because pool is no longer loaded", - ca->zhca_spaname); - return; - } - if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { - zfs_dbgmsg("couldn't release holds on pool=%s " - "because pool is no longer loaded (guid doesn't match)", - ca->zhca_spaname); - spa_close(spa, FTAG); - return; - } - - (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds); - fnvlist_free(ca->zhca_holds); - kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); - spa_close(spa, FTAG); -} - -static void -dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor) -{ - zfs_hold_cleanup_arg_t *ca; - - if (minor == 0 || nvlist_empty(holds)) { - fnvlist_free(holds); - return; - } - - ASSERT(spa != NULL); - ca = kmem_alloc(sizeof (*ca), KM_SLEEP); - - (void) strlcpy(ca->zhca_spaname, spa_name(spa), - sizeof (ca->zhca_spaname)); - ca->zhca_spa_load_guid = spa_load_guid(spa); - ca->zhca_holds = holds; - VERIFY0(zfs_onexit_add_cb(minor, - dsl_dataset_user_release_onexit, ca, NULL)); -} - -void -dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, - minor_t minor, uint64_t now, dmu_tx_t *tx) -{ - nvlist_t *tmpholds; - - if (minor != 0) - tmpholds = fnvlist_alloc(); - else - tmpholds = NULL; - dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx); - dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor); -} - -static void -dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_hold_arg_t *dduha = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - nvlist_t *tmpholds; - uint64_t now = gethrestime_sec(); - - if (dduha->dduha_minor != 0) - tmpholds = fnvlist_alloc(); - else - tmpholds = NULL; - for (nvpair_t *pair = 
nvlist_next_nvpair(dduha->dduha_chkholds, NULL); - pair != NULL; - pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) { - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); - dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, - fnvpair_value_string(pair), dduha->dduha_minor, now, tx); - dsl_dataset_rele(ds, FTAG); - } - dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor); -} - -/* - * The full semantics of this function are described in the comment above - * lzc_hold(). - * - * To summarize: - * holds is nvl of snapname -> holdname - * errlist will be filled in with snapname -> error - * - * The snaphosts must all be in the same pool. - * - * Holds for snapshots that don't exist will be skipped. - * - * If none of the snapshots for requested holds exist then ENOENT will be - * returned. - * - * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned - * up when the process exits. - * - * On success all the holds, for snapshots that existed, will be created and 0 - * will be returned. - * - * On failure no holds will be created, the errlist will be filled in, - * and an errno will returned. - * - * In all cases the errlist will contain entries for holds where the snapshot - * didn't exist. - */ -int -dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) -{ - dsl_dataset_user_hold_arg_t dduha; - nvpair_t *pair; - int ret; - - pair = nvlist_next_nvpair(holds, NULL); - if (pair == NULL) - return (0); - - dduha.dduha_holds = holds; - dduha.dduha_chkholds = fnvlist_alloc(); - dduha.dduha_errlist = errlist; - dduha.dduha_minor = cleanup_minor; - - ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, - dsl_dataset_user_hold_sync, &dduha, - fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED); - fnvlist_free(dduha.dduha_chkholds); - - return (ret); -} - -typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, - dsl_dataset_t **dsp); - -typedef struct dsl_dataset_user_release_arg { - dsl_holdfunc_t *ddura_holdfunc; - nvlist_t *ddura_holds; - nvlist_t *ddura_todelete; - nvlist_t *ddura_errlist; - nvlist_t *ddura_chkholds; -} dsl_dataset_user_release_arg_t; - -/* Place a dataset hold on the snapshot identified by passed dsobj string */ -static int -dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, - dsl_dataset_t **dsp) -{ - return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp)); -} - -static int -dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, - dsl_dataset_t *ds, nvlist_t *holds, const char *snapname) -{ - uint64_t zapobj; - nvlist_t *holds_found; - objset_t *mos; - int numholds; - - if (!ds->ds_is_snapshot) - return (SET_ERROR(EINVAL)); - - if (nvlist_empty(holds)) - return (0); - - numholds = 0; - mos = ds->ds_dir->dd_pool->dp_meta_objset; - zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; - holds_found = fnvlist_alloc(); - - for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - uint64_t tmp; - int error; - const char *holdname = nvpair_name(pair); - - if (zapobj != 0) - error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp); - else - error = SET_ERROR(ENOENT); - - /* - * Non-existent holds are put on the errlist, but don't - * cause an overall failure. 
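From userland, the entry point summarized above is normally reached through libzfs_core's lzc_hold(), which takes the same snapname -> holdname nvlist shape. The fragment below is a hedged usage sketch only: the pool and snapshot names are made up, error handling is minimal, and it assumes the standard libnvpair and libzfs_core interfaces (link with -lnvpair -lzfs_core).

#include <stdio.h>
#include <libzfs_core.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *holds, *errlist = NULL;
	int err;

	if (libzfs_core_init() != 0)
		return (1);

	/* snapname -> holdname, exactly the shape the check func expects. */
	holds = fnvlist_alloc();
	fnvlist_add_string(holds, "tank/data@monday", "keep-for-audit");

	/* cleanup_fd = -1: a permanent hold, no temporary-hold minor. */
	err = lzc_hold(holds, -1, &errlist);
	printf("lzc_hold: %d\n", err);

	if (errlist != NULL)
		fnvlist_free(errlist);
	fnvlist_free(holds);
	libzfs_core_fini();
	return (err != 0);
}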
- */ - if (error == ENOENT) { - if (ddura->ddura_errlist != NULL) { - char *errtag = kmem_asprintf("%s#%s", - snapname, holdname); - fnvlist_add_int32(ddura->ddura_errlist, errtag, - ENOENT); - strfree(errtag); - } - continue; - } - - if (error != 0) { - fnvlist_free(holds_found); - return (error); - } - - fnvlist_add_boolean(holds_found, holdname); - numholds++; - } - - if (DS_IS_DEFER_DESTROY(ds) && - dsl_dataset_phys(ds)->ds_num_children == 1 && - ds->ds_userrefs == numholds) { - /* we need to destroy the snapshot as well */ - if (dsl_dataset_long_held(ds)) { - fnvlist_free(holds_found); - return (SET_ERROR(EBUSY)); - } - fnvlist_add_boolean(ddura->ddura_todelete, snapname); - } - - if (numholds != 0) { - fnvlist_add_nvlist(ddura->ddura_chkholds, snapname, - holds_found); - } - fnvlist_free(holds_found); - - return (0); -} - -static int -dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_release_arg_t *ddura; - dsl_holdfunc_t *holdfunc; - dsl_pool_t *dp; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - dp = dmu_tx_pool(tx); - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - ddura = arg; - holdfunc = ddura->ddura_holdfunc; - - for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); - pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { - int error; - dsl_dataset_t *ds; - nvlist_t *holds; - const char *snapname = nvpair_name(pair); - - error = nvpair_value_nvlist(pair, &holds); - if (error != 0) - error = (SET_ERROR(EINVAL)); - else - error = holdfunc(dp, snapname, FTAG, &ds); - if (error == 0) { - error = dsl_dataset_user_release_check_one(ddura, ds, - holds, snapname); - dsl_dataset_rele(ds, FTAG); - } - if (error != 0) { - if (ddura->ddura_errlist != NULL) { - fnvlist_add_int32(ddura->ddura_errlist, - snapname, error); - } - /* - * Non-existent snapshots are put on the errlist, - * but don't cause an overall failure. - */ - if (error != ENOENT) - return (error); - } - } - - return (0); -} - -static void -dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, - dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - - for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - int error; - const char *holdname = nvpair_name(pair); - - /* Remove temporary hold if one exists. 
*/ - error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx); - VERIFY(error == 0 || error == ENOENT); - - VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, - holdname, tx)); - ds->ds_userrefs--; - - spa_history_log_internal_ds(ds, "release", tx, - "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs); - } -} - -static void -dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_release_arg_t *ddura = arg; - dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc; - dsl_pool_t *dp = dmu_tx_pool(tx); - - ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - - for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL); - pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds, - pair)) { - dsl_dataset_t *ds; - const char *name = nvpair_name(pair); - - VERIFY0(holdfunc(dp, name, FTAG, &ds)); - - dsl_dataset_user_release_sync_one(ds, - fnvpair_value_nvlist(pair), tx); - if (nvlist_exists(ddura->ddura_todelete, name)) { - ASSERT(ds->ds_userrefs == 0 && - dsl_dataset_phys(ds)->ds_num_children == 1 && - DS_IS_DEFER_DESTROY(ds)); - dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); - } - dsl_dataset_rele(ds, FTAG); - } -} - -/* - * The full semantics of this function are described in the comment above - * lzc_release(). - * - * To summarize: - * Releases holds specified in the nvl holds. - * - * holds is nvl of snapname -> { holdname, ... } - * errlist will be filled in with snapname -> error - * - * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots, - * otherwise they should be the names of shapshots. - * - * As a release may cause snapshots to be destroyed this trys to ensure they - * aren't mounted. - * - * The release of non-existent holds are skipped. - * - * At least one hold must have been released for the this function to succeed - * and return 0. - */ -static int -dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, - dsl_pool_t *tmpdp) -{ - dsl_dataset_user_release_arg_t ddura; - nvpair_t *pair; - char *pool; - int error; - - pair = nvlist_next_nvpair(holds, NULL); - if (pair == NULL) - return (0); - - /* - * The release may cause snapshots to be destroyed; make sure they - * are not mounted. - */ - if (tmpdp != NULL) { - /* Temporary holds are specified by dsobj string. */ - ddura.ddura_holdfunc = dsl_dataset_hold_obj_string; - pool = spa_name(tmpdp->dp_spa); -#ifdef _KERNEL - for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - dsl_dataset_t *ds; - - dsl_pool_config_enter(tmpdp, FTAG); - error = dsl_dataset_hold_obj_string(tmpdp, - nvpair_name(pair), FTAG, &ds); - if (error == 0) { - char name[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(ds, name); - dsl_pool_config_exit(tmpdp, FTAG); - dsl_dataset_rele(ds, FTAG); - (void) zfs_unmount_snap(name); - } else { - dsl_pool_config_exit(tmpdp, FTAG); - } - } -#endif - } else { - /* Non-temporary holds are specified by name. 
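Releases go through lzc_release(), and the nvlist shape differs from the hold case: each snapshot name maps to a nested nvlist whose boolean-flag entries name the holds to drop, matching the "snapname -> { holdname, ... }" summary above. The sketch below is a hedged illustration with invented names and minimal error handling, again assuming the standard libnvpair and libzfs_core interfaces.

#include <stdio.h>
#include <libzfs_core.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *holds, *tags, *errlist = NULL;
	int err;

	if (libzfs_core_init() != 0)
		return (1);

	/*
	 * snapname -> { holdname, ... }: the per-snapshot tag set is a
	 * nested nvlist of boolean flags, not a single string.
	 */
	tags = fnvlist_alloc();
	fnvlist_add_boolean(tags, "keep-for-audit");

	holds = fnvlist_alloc();
	fnvlist_add_nvlist(holds, "tank/data@monday", tags);
	fnvlist_free(tags);	/* fnvlist_add_nvlist() copies the nested list */

	err = lzc_release(holds, &errlist);
	printf("lzc_release: %d\n", err);

	if (errlist != NULL)
		fnvlist_free(errlist);
	fnvlist_free(holds);
	libzfs_core_fini();
	return (err != 0);
}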
*/ - ddura.ddura_holdfunc = dsl_dataset_hold; - pool = nvpair_name(pair); -#ifdef _KERNEL - for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - (void) zfs_unmount_snap(nvpair_name(pair)); - } -#endif - } - - ddura.ddura_holds = holds; - ddura.ddura_errlist = errlist; - ddura.ddura_todelete = fnvlist_alloc(); - ddura.ddura_chkholds = fnvlist_alloc(); - - error = dsl_sync_task(pool, dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, &ddura, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED); - fnvlist_free(ddura.ddura_todelete); - fnvlist_free(ddura.ddura_chkholds); - - return (error); -} - -/* - * holds is nvl of snapname -> { holdname, ... } - * errlist will be filled in with snapname -> error - */ -int -dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) -{ - return (dsl_dataset_user_release_impl(holds, errlist, NULL)); -} - -/* - * holds is nvl of snapdsobj -> { holdname, ... } - */ -void -dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds) -{ - ASSERT(dp != NULL); - (void) dsl_dataset_user_release_impl(holds, NULL, dp); -} - -int -dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - int err; - - err = dsl_pool_hold(dsname, FTAG, &dp); - if (err != 0) - return (err); - err = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (err != 0) { - dsl_pool_rele(dp, FTAG); - return (err); - } - - if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { - zap_attribute_t *za; - zap_cursor_t zc; - - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, - dsl_dataset_phys(ds)->ds_userrefs_obj); - zap_cursor_retrieve(&zc, za) == 0; - zap_cursor_advance(&zc)) { - fnvlist_add_uint64(nvl, za->za_name, - za->za_first_integer); - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (zap_attribute_t)); - } - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ -#include -#include -#include -#include - -#define EDONR_MODE 512 -#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE - -static int -edonr_incremental(void *buf, size_t size, void *arg) -{ - EdonRState *ctx = arg; - EdonRUpdate(ctx, buf, size * 8); - return (0); -} - -/* - * Native zio_checksum interface for the Edon-R hash function. 
- */ -/*ARGSUSED*/ -void -abd_checksum_edonr_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - uint8_t digest[EDONR_MODE / 8]; - EdonRState ctx; - - ASSERT(ctx_template != NULL); - bcopy(ctx_template, &ctx, sizeof (ctx)); - (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx); - EdonRFinal(&ctx, digest); - bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); -} - -/* - * Byteswapped zio_checksum interface for the Edon-R hash function. - */ -void -abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - zio_cksum_t tmp; - - abd_checksum_edonr_native(abd, size, ctx_template, &tmp); - zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]); - zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]); - zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]); - zcp->zc_word[3] = BSWAP_64(zcp->zc_word[3]); -} - -void * -abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) -{ - EdonRState *ctx; - uint8_t salt_block[EDONR_BLOCK_SIZE]; - - /* - * Edon-R needs all but the last hash invocation to be on full-size - * blocks, but the salt is too small. Rather than simply padding it - * with zeros, we expand the salt into a new salt block of proper - * size by double-hashing it (the new salt block will be composed of - * H(salt) || H(H(salt))). - */ - CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8)); - EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, - salt_block); - EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block + - EDONR_MODE / 8); - - /* - * Feed the new salt block into the hash function - this will serve - * as our MAC key. - */ - ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - EdonRInit(ctx, EDONR_MODE); - EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8); - return (ctx); -} - -void -abd_checksum_edonr_tmpl_free(void *ctx_template) -{ - EdonRState *ctx = ctx_template; - - bzero(ctx, sizeof (*ctx)); - kmem_free(ctx, sizeof (*ctx)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include -#include - -#ifdef _KERNEL -#include -#else -#include -#endif - -size_t -gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - size_t dstlen = d_len; - - ASSERT(d_len <= s_len); - - if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) { - if (d_len != s_len) - return (s_len); - - bcopy(s_start, d_start, s_len); - return (s_len); - } - - return (dstlen); -} - -/*ARGSUSED*/ -int -gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - size_t dstlen = d_len; - - ASSERT(d_len >= s_len); - - if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK) - return (-1); - - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/README.zfs +++ /dev/null @@ -1,80 +0,0 @@ -# -# CDDL HEADER START -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# -# CDDL HEADER END -# - -# -# Copyright (c) 2017 by Delphix. All rights reserved. -# - -Introduction ------------- - -This README describes the Lua interpreter source code that lives in the ZFS -source tree to enable execution of ZFS channel programs, including its -maintenance policy, the modifications that have been made to it, and how it -should (and should not) be used. - -For a description of the Lua language and features exposed by ZFS channel -programs, please refer to the zfs-program(1m) man page instead. - - -Maintenance policy ------------------- - -The Lua runtime is considered stable software. Channel programs don't need much -complicated logic, so updates to the Lua runtime from upstream are viewed as -nice-to-have, but not required for channel programs to be well-supported. As -such, the Lua runtime in ZFS should be updated on an as-needed basis for -security vulnerabilities, but not much else. - - -Modifications to Lua --------------------- - -The version of the Lua runtime we're using in ZFS has been modified in a variety -of ways to make it more useful for the specific purpose of running channel -programs. These changes include: - -1. "Normal" Lua uses floating point for all numbers it stores, but those aren't - useful inside ZFS / the kernel. We have changed the runtime to use int64_t - throughout for all numbers. -2. Some of the Lua standard libraries do file I/O or spawn processes, but - neither of these make sense from inside channel programs. We have removed - those libraries rather than reimplementing them using kernel APIs. -3. The "normal" Lua runtime handles errors by failing fatally, but since this - version of Lua runs inside the kernel we must handle these failures and - return meaningful error codes to userland. We have customized the Lua - failure paths so that they aren't fatal. -4. Running poorly-vetted code inside the kernel is always a risk; even if the - ability to do so is restricted to the root user, it's still possible to write - an incorrect program that results in an infinite loop or massive memory use. 
- We've added new protections into the Lua interpreter to limit the runtime - (measured in number of Lua instructions run) and memory overhead of running - a channel program. -5. The Lua bytecode is not designed to be secure / safe, so it would be easy to - pass invalid bytecode which can panic the kernel. By comparison, the parser - is hardened and fails gracefully on invalid input. Therefore, we only accept - Lua source code at the ioctl level and then interpret it inside the kernel. - -Each of these modifications have been tested in the zfs-test suite. If / when -new modifications are made, new tests should be added to the suite located in -zfs-tests/tests/functional/channel_program/lua_core. - - -How to use this Lua interpreter -------------------------------- - -From the above, it should be clear that this is not a general-purpose Lua -interpreter. Additional work would be required to extricate this custom version -of Lua from ZFS and make it usable by other areas of the kernel. Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.h +++ /dev/null @@ -1,24 +0,0 @@ -/* -** $Id: lapi.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions from Lua API -** See Copyright Notice in lua.h -*/ - -#ifndef lapi_h -#define lapi_h - - -#include "llimits.h" -#include "lstate.h" - -#define api_incr_top(L) {L->top++; api_check(L, L->top <= L->ci->top, \ - "stack overflow");} - -#define adjustresults(L,nres) \ - { if ((nres) == LUA_MULTRET && L->ci->top < L->top) L->ci->top = L->top; } - -#define api_checknelems(L,n) api_check(L, (n) < (L->top - L->ci->func), \ - "not enough elements in the stack") - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c +++ /dev/null @@ -1,1283 +0,0 @@ -/* -** $Id: lapi.c,v 2.171.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua API -** See Copyright Notice in lua.h -*/ - - -#include - -#define lapi_c -#define LUA_CORE - -#include "lua.h" - -#include "lapi.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" -#include "lundump.h" -#include "lvm.h" - - - -const char lua_ident[] = - "$LuaVersion: " LUA_COPYRIGHT " $" - "$LuaAuthors: " LUA_AUTHORS " $"; - - -/* value at a non-valid index */ -#define NONVALIDVALUE cast(TValue *, luaO_nilobject) - -/* corresponding test */ -#define isvalid(o) ((o) != luaO_nilobject) - -/* test for pseudo index */ -#define ispseudo(i) ((i) <= LUA_REGISTRYINDEX) - -/* test for valid but not pseudo index */ -#define isstackindex(i, o) (isvalid(o) && !ispseudo(i)) - -#define api_checkvalidindex(L, o) api_check(L, isvalid(o), "invalid index") - -#define api_checkstackindex(L, i, o) \ - api_check(L, isstackindex(i, o), "index not in the stack") - - -static TValue *index2addr (lua_State *L, int idx) { - CallInfo *ci = L->ci; - if (idx > 0) { - TValue *o = ci->func + idx; - api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index"); - if (o >= L->top) return NONVALIDVALUE; - else return o; - } - else if (!ispseudo(idx)) { /* negative index */ - api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index"); - return L->top + idx; - } - else if (idx == LUA_REGISTRYINDEX) - return 
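
The README removed above notes that channel programs are submitted as Lua source (never bytecode) and run under per-invocation instruction and memory limits. A hedged sketch of what that looks like from userland through libzfs_core; the pool name, program text, and limit values below are illustrative examples, not defaults taken from this change:

#include <libnvpair.h>
#include <libzfs_core.h>

static int
run_channel_program(const char *pool)
{
	const char *prog =
	    "args = ...\n"
	    "return { greeting = 'hello from zcp' }\n";
	nvlist_t *argnvl = fnvlist_alloc();
	nvlist_t *outnvl = NULL;
	int error;

	/* Source only; the two limits cap Lua instructions and bytes allocated. */
	error = lzc_channel_program(pool, prog,
	    10 * 1000 * 1000,		/* illustrative instruction limit */
	    10 * 1024 * 1024,		/* illustrative memory limit (bytes) */
	    argnvl, &outnvl);

	fnvlist_free(argnvl);
	if (outnvl != NULL)
		fnvlist_free(outnvl);
	return (error);
}
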
&G(L)->l_registry; - else { /* upvalues */ - idx = LUA_REGISTRYINDEX - idx; - api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large"); - if (ttislcf(ci->func)) /* light C function? */ - return NONVALIDVALUE; /* it has no upvalues */ - else { - CClosure *func = clCvalue(ci->func); - return (idx <= func->nupvalues) ? &func->upvalue[idx-1] : NONVALIDVALUE; - } - } -} - - -/* -** to be called by 'lua_checkstack' in protected mode, to grow stack -** capturing memory errors -*/ -static void growstack (lua_State *L, void *ud) { - int size = *(int *)ud; - luaD_growstack(L, size); -} - - -LUA_API int lua_checkstack (lua_State *L, int size) { - int res; - CallInfo *ci = L->ci; - lua_lock(L); - if (L->stack_last - L->top > size) /* stack large enough? */ - res = 1; /* yes; check is OK */ - else { /* no; need to grow stack */ - int inuse = cast_int(L->top - L->stack) + EXTRA_STACK; - if (inuse > LUAI_MAXSTACK - size) /* can grow without overflow? */ - res = 0; /* no */ - else /* try to grow stack */ - res = (luaD_rawrunprotected(L, &growstack, &size) == LUA_OK); - } - if (res && ci->top < L->top + size) - ci->top = L->top + size; /* adjust frame top */ - lua_unlock(L); - return res; -} - - -LUA_API void lua_xmove (lua_State *from, lua_State *to, int n) { - int i; - if (from == to) return; - lua_lock(to); - api_checknelems(from, n); - api_check(from, G(from) == G(to), "moving among independent states"); - api_check(from, to->ci->top - to->top >= n, "not enough elements to move"); - from->top -= n; - for (i = 0; i < n; i++) { - setobj2s(to, to->top++, from->top + i); - } - lua_unlock(to); -} - - -LUA_API lua_CFunction lua_atpanic (lua_State *L, lua_CFunction panicf) { - lua_CFunction old; - lua_lock(L); - old = G(L)->panic; - G(L)->panic = panicf; - lua_unlock(L); - return old; -} - - -LUA_API const lua_Number *lua_version (lua_State *L) { - static const lua_Number version = LUA_VERSION_NUM; - if (L == NULL) return &version; - else return G(L)->version; -} - - - -/* -** basic stack manipulation -*/ - - -/* -** convert an acceptable stack index into an absolute index -*/ -LUA_API int lua_absindex (lua_State *L, int idx) { - return (idx > 0 || ispseudo(idx)) - ? idx - : cast_int(L->top - L->ci->func + idx); -} - - -LUA_API int lua_gettop (lua_State *L) { - return cast_int(L->top - (L->ci->func + 1)); -} - - -LUA_API void lua_settop (lua_State *L, int idx) { - StkId func = L->ci->func; - lua_lock(L); - if (idx >= 0) { - api_check(L, idx <= L->stack_last - (func + 1), "new top too large"); - while (L->top < (func + 1) + idx) - setnilvalue(L->top++); - L->top = (func + 1) + idx; - } - else { - api_check(L, -(idx+1) <= (L->top - (func + 1)), "invalid new top"); - L->top += idx+1; /* `subtract' index (index is negative) */ - } - lua_unlock(L); -} - - -LUA_API void lua_remove (lua_State *L, int idx) { - StkId p; - lua_lock(L); - p = index2addr(L, idx); - api_checkstackindex(L, idx, p); - while (++p < L->top) setobjs2s(L, p-1, p); - L->top--; - lua_unlock(L); -} - - -LUA_API void lua_insert (lua_State *L, int idx) { - StkId p; - StkId q; - lua_lock(L); - p = index2addr(L, idx); - api_checkstackindex(L, idx, p); - for (q = L->top; q > p; q--) /* use L->top as a temporary */ - setobjs2s(L, q, q - 1); - setobjs2s(L, p, L->top); - lua_unlock(L); -} - - -static void moveto (lua_State *L, TValue *fr, int idx) { - TValue *to = index2addr(L, idx); - api_checkvalidindex(L, to); - setobj(L, to, fr); - if (idx < LUA_REGISTRYINDEX) /* function upvalue? 
*/ - luaC_barrier(L, clCvalue(L->ci->func), fr); - /* LUA_REGISTRYINDEX does not need gc barrier - (collector revisits it before finishing collection) */ -} - - -LUA_API void lua_replace (lua_State *L, int idx) { - lua_lock(L); - api_checknelems(L, 1); - moveto(L, L->top - 1, idx); - L->top--; - lua_unlock(L); -} - - -LUA_API void lua_copy (lua_State *L, int fromidx, int toidx) { - TValue *fr; - lua_lock(L); - fr = index2addr(L, fromidx); - moveto(L, fr, toidx); - lua_unlock(L); -} - - -LUA_API void lua_pushvalue (lua_State *L, int idx) { - lua_lock(L); - setobj2s(L, L->top, index2addr(L, idx)); - api_incr_top(L); - lua_unlock(L); -} - - - -/* -** access functions (stack -> C) -*/ - - -LUA_API int lua_type (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - return (isvalid(o) ? ttypenv(o) : LUA_TNONE); -} - - -LUA_API const char *lua_typename (lua_State *L, int t) { - UNUSED(L); - return ttypename(t); -} - - -LUA_API int lua_iscfunction (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - return (ttislcf(o) || (ttisCclosure(o))); -} - - -LUA_API int lua_isnumber (lua_State *L, int idx) { - TValue n; - const TValue *o = index2addr(L, idx); - return tonumber(o, &n); -} - - -LUA_API int lua_isstring (lua_State *L, int idx) { - int t = lua_type(L, idx); - return (t == LUA_TSTRING || t == LUA_TNUMBER); -} - - -LUA_API int lua_isuserdata (lua_State *L, int idx) { - const TValue *o = index2addr(L, idx); - return (ttisuserdata(o) || ttislightuserdata(o)); -} - - -LUA_API int lua_rawequal (lua_State *L, int index1, int index2) { - StkId o1 = index2addr(L, index1); - StkId o2 = index2addr(L, index2); - return (isvalid(o1) && isvalid(o2)) ? luaV_rawequalobj(o1, o2) : 0; -} - - -LUA_API void lua_arith (lua_State *L, int op) { - StkId o1; /* 1st operand */ - StkId o2; /* 2nd operand */ - lua_lock(L); - if (op != LUA_OPUNM) /* all other operations expect two operands */ - api_checknelems(L, 2); - else { /* for unary minus, add fake 2nd operand */ - api_checknelems(L, 1); - setobjs2s(L, L->top, L->top - 1); - L->top++; - } - o1 = L->top - 2; - o2 = L->top - 1; - if (ttisnumber(o1) && ttisnumber(o2)) { - setnvalue(o1, luaO_arith(op, nvalue(o1), nvalue(o2))); - } - else - luaV_arith(L, o1, o1, o2, cast(TMS, op - LUA_OPADD + TM_ADD)); - L->top--; - lua_unlock(L); -} - - -LUA_API int lua_compare (lua_State *L, int index1, int index2, int op) { - StkId o1, o2; - int i = 0; - lua_lock(L); /* may call tag method */ - o1 = index2addr(L, index1); - o2 = index2addr(L, index2); - if (isvalid(o1) && isvalid(o2)) { - switch (op) { - case LUA_OPEQ: i = equalobj(L, o1, o2); break; - case LUA_OPLT: i = luaV_lessthan(L, o1, o2); break; - case LUA_OPLE: i = luaV_lessequal(L, o1, o2); break; - default: api_check(L, 0, "invalid option"); - } - } - lua_unlock(L); - return i; -} - - -LUA_API lua_Number lua_tonumberx (lua_State *L, int idx, int *isnum) { - TValue n; - const TValue *o = index2addr(L, idx); - if (tonumber(o, &n)) { - if (isnum) *isnum = 1; - return nvalue(o); - } - else { - if (isnum) *isnum = 0; - return 0; - } -} - - -LUA_API lua_Integer lua_tointegerx (lua_State *L, int idx, int *isnum) { - TValue n; - const TValue *o = index2addr(L, idx); - if (tonumber(o, &n)) { - lua_Integer res; - lua_Number num = nvalue(o); - lua_number2integer(res, num); - if (isnum) *isnum = 1; - return res; - } - else { - if (isnum) *isnum = 0; - return 0; - } -} - - -LUA_API lua_Unsigned lua_tounsignedx (lua_State *L, int idx, int *isnum) { - TValue n; - const TValue *o = index2addr(L, idx); - if (tonumber(o, &n)) { - 
lua_Unsigned res; - lua_Number num = nvalue(o); - lua_number2unsigned(res, num); - if (isnum) *isnum = 1; - return res; - } - else { - if (isnum) *isnum = 0; - return 0; - } -} - - -LUA_API int lua_toboolean (lua_State *L, int idx) { - const TValue *o = index2addr(L, idx); - return !l_isfalse(o); -} - - -LUA_API const char *lua_tolstring (lua_State *L, int idx, size_t *len) { - StkId o = index2addr(L, idx); - if (!ttisstring(o)) { - lua_lock(L); /* `luaV_tostring' may create a new string */ - if (!luaV_tostring(L, o)) { /* conversion failed? */ - if (len != NULL) *len = 0; - lua_unlock(L); - return NULL; - } - luaC_checkGC(L); - o = index2addr(L, idx); /* previous call may reallocate the stack */ - lua_unlock(L); - } - if (len != NULL) *len = tsvalue(o)->len; - return svalue(o); -} - - -LUA_API size_t lua_rawlen (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - switch (ttypenv(o)) { - case LUA_TSTRING: return tsvalue(o)->len; - case LUA_TUSERDATA: return uvalue(o)->len; - case LUA_TTABLE: return luaH_getn(hvalue(o)); - default: return 0; - } -} - - -LUA_API lua_CFunction lua_tocfunction (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - if (ttislcf(o)) return fvalue(o); - else if (ttisCclosure(o)) - return clCvalue(o)->f; - else return NULL; /* not a C function */ -} - - -LUA_API void *lua_touserdata (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - switch (ttypenv(o)) { - case LUA_TUSERDATA: return (rawuvalue(o) + 1); - case LUA_TLIGHTUSERDATA: return pvalue(o); - default: return NULL; - } -} - - -LUA_API lua_State *lua_tothread (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - return (!ttisthread(o)) ? NULL : thvalue(o); -} - - -LUA_API const void *lua_topointer (lua_State *L, int idx) { - StkId o = index2addr(L, idx); - switch (ttype(o)) { - case LUA_TTABLE: return hvalue(o); - case LUA_TLCL: return clLvalue(o); - case LUA_TCCL: return clCvalue(o); - case LUA_TLCF: return cast(void *, cast(size_t, fvalue(o))); - case LUA_TTHREAD: return thvalue(o); - case LUA_TUSERDATA: - case LUA_TLIGHTUSERDATA: - return lua_touserdata(L, idx); - default: return NULL; - } -} - - - -/* -** push functions (C -> stack) -*/ - - -LUA_API void lua_pushnil (lua_State *L) { - lua_lock(L); - setnilvalue(L->top); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushnumber (lua_State *L, lua_Number n) { - lua_lock(L); - setnvalue(L->top, n); - luai_checknum(L, L->top, - luaG_runerror(L, "C API - attempt to push a signaling NaN")); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushinteger (lua_State *L, lua_Integer n) { - lua_lock(L); - setnvalue(L->top, cast_num(n)); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushunsigned (lua_State *L, lua_Unsigned u) { - lua_Number n; - lua_lock(L); - n = lua_unsigned2number(u); - setnvalue(L->top, n); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) { - TString *ts; - lua_lock(L); - luaC_checkGC(L); - ts = luaS_newlstr(L, s, len); - setsvalue2s(L, L->top, ts); - api_incr_top(L); - lua_unlock(L); - return getstr(ts); -} - - -LUA_API const char *lua_pushstring (lua_State *L, const char *s) { - if (s == NULL) { - lua_pushnil(L); - return NULL; - } - else { - TString *ts; - lua_lock(L); - luaC_checkGC(L); - ts = luaS_new(L, s); - setsvalue2s(L, L->top, ts); - api_incr_top(L); - lua_unlock(L); - return getstr(ts); - } -} - - -LUA_API const char *lua_pushvfstring (lua_State *L, const char *fmt, - va_list argp) { - const char 
*ret; - lua_lock(L); - luaC_checkGC(L); - ret = luaO_pushvfstring(L, fmt, argp); - lua_unlock(L); - return ret; -} - - -LUA_API const char *lua_pushfstring (lua_State *L, const char *fmt, ...) { - const char *ret; - va_list argp; - lua_lock(L); - luaC_checkGC(L); - va_start(argp, fmt); - ret = luaO_pushvfstring(L, fmt, argp); - va_end(argp); - lua_unlock(L); - return ret; -} - - -LUA_API void lua_pushcclosure (lua_State *L, lua_CFunction fn, int n) { - lua_lock(L); - if (n == 0) { - setfvalue(L->top, fn); - } - else { - Closure *cl; - api_checknelems(L, n); - api_check(L, n <= MAXUPVAL, "upvalue index too large"); - luaC_checkGC(L); - cl = luaF_newCclosure(L, n); - cl->c.f = fn; - L->top -= n; - while (n--) - setobj2n(L, &cl->c.upvalue[n], L->top + n); - setclCvalue(L, L->top, cl); - } - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushboolean (lua_State *L, int b) { - lua_lock(L); - setbvalue(L->top, (b != 0)); /* ensure that true is 1 */ - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_pushlightuserdata (lua_State *L, void *p) { - lua_lock(L); - setpvalue(L->top, p); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API int lua_pushthread (lua_State *L) { - lua_lock(L); - setthvalue(L, L->top, L); - api_incr_top(L); - lua_unlock(L); - return (G(L)->mainthread == L); -} - - - -/* -** get functions (Lua -> stack) -*/ - - -LUA_API void lua_getglobal (lua_State *L, const char *var) { - Table *reg = hvalue(&G(L)->l_registry); - const TValue *gt; /* global table */ - lua_lock(L); - gt = luaH_getint(reg, LUA_RIDX_GLOBALS); - setsvalue2s(L, L->top++, luaS_new(L, var)); - luaV_gettable(L, gt, L->top - 1, L->top - 1); - lua_unlock(L); -} - - -LUA_API void lua_gettable (lua_State *L, int idx) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - luaV_gettable(L, t, L->top - 1, L->top - 1); - lua_unlock(L); -} - - -LUA_API void lua_getfield (lua_State *L, int idx, const char *k) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - setsvalue2s(L, L->top, luaS_new(L, k)); - api_incr_top(L); - luaV_gettable(L, t, L->top - 1, L->top - 1); - lua_unlock(L); -} - - -LUA_API void lua_rawget (lua_State *L, int idx) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setobj2s(L, L->top - 1, luaH_get(hvalue(t), L->top - 1)); - lua_unlock(L); -} - - -LUA_API void lua_rawgeti (lua_State *L, int idx, int n) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setobj2s(L, L->top, luaH_getint(hvalue(t), n)); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_rawgetp (lua_State *L, int idx, const void *p) { - StkId t; - TValue k; - lua_lock(L); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setpvalue(&k, cast(void *, p)); - setobj2s(L, L->top, luaH_get(hvalue(t), &k)); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API void lua_createtable (lua_State *L, int narray, int nrec) { - Table *t; - lua_lock(L); - luaC_checkGC(L); - t = luaH_new(L); - sethvalue(L, L->top, t); - api_incr_top(L); - if (narray > 0 || nrec > 0) - luaH_resize(L, t, narray, nrec); - lua_unlock(L); -} - - -LUA_API int lua_getmetatable (lua_State *L, int objindex) { - const TValue *obj; - Table *mt = NULL; - int res; - lua_lock(L); - obj = index2addr(L, objindex); - switch (ttypenv(obj)) { - case LUA_TTABLE: - mt = hvalue(obj)->metatable; - break; - case LUA_TUSERDATA: - mt = uvalue(obj)->metatable; - break; - default: - mt = G(L)->mt[ttypenv(obj)]; - break; - } - if (mt == NULL) 
- res = 0; - else { - sethvalue(L, L->top, mt); - api_incr_top(L); - res = 1; - } - lua_unlock(L); - return res; -} - - -LUA_API void lua_getuservalue (lua_State *L, int idx) { - StkId o; - lua_lock(L); - o = index2addr(L, idx); - api_check(L, ttisuserdata(o), "userdata expected"); - if (uvalue(o)->env) { - sethvalue(L, L->top, uvalue(o)->env); - } else - setnilvalue(L->top); - api_incr_top(L); - lua_unlock(L); -} - - -/* -** set functions (stack -> Lua) -*/ - - -LUA_API void lua_setglobal (lua_State *L, const char *var) { - Table *reg = hvalue(&G(L)->l_registry); - const TValue *gt; /* global table */ - lua_lock(L); - api_checknelems(L, 1); - gt = luaH_getint(reg, LUA_RIDX_GLOBALS); - setsvalue2s(L, L->top++, luaS_new(L, var)); - luaV_settable(L, gt, L->top - 1, L->top - 2); - L->top -= 2; /* pop value and key */ - lua_unlock(L); -} - - -LUA_API void lua_settable (lua_State *L, int idx) { - StkId t; - lua_lock(L); - api_checknelems(L, 2); - t = index2addr(L, idx); - luaV_settable(L, t, L->top - 2, L->top - 1); - L->top -= 2; /* pop index and value */ - lua_unlock(L); -} - - -LUA_API void lua_setfield (lua_State *L, int idx, const char *k) { - StkId t; - lua_lock(L); - api_checknelems(L, 1); - t = index2addr(L, idx); - setsvalue2s(L, L->top++, luaS_new(L, k)); - luaV_settable(L, t, L->top - 1, L->top - 2); - L->top -= 2; /* pop value and key */ - lua_unlock(L); -} - - -LUA_API void lua_rawset (lua_State *L, int idx) { - StkId t; - lua_lock(L); - api_checknelems(L, 2); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setobj2t(L, luaH_set(L, hvalue(t), L->top-2), L->top-1); - invalidateTMcache(hvalue(t)); - luaC_barrierback(L, gcvalue(t), L->top-1); - L->top -= 2; - lua_unlock(L); -} - - -LUA_API void lua_rawseti (lua_State *L, int idx, int n) { - StkId t; - lua_lock(L); - api_checknelems(L, 1); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - luaH_setint(L, hvalue(t), n, L->top - 1); - luaC_barrierback(L, gcvalue(t), L->top-1); - L->top--; - lua_unlock(L); -} - - -LUA_API void lua_rawsetp (lua_State *L, int idx, const void *p) { - StkId t; - TValue k; - lua_lock(L); - api_checknelems(L, 1); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - setpvalue(&k, cast(void *, p)); - setobj2t(L, luaH_set(L, hvalue(t), &k), L->top - 1); - luaC_barrierback(L, gcvalue(t), L->top - 1); - L->top--; - lua_unlock(L); -} - - -LUA_API int lua_setmetatable (lua_State *L, int objindex) { - TValue *obj; - Table *mt; - lua_lock(L); - api_checknelems(L, 1); - obj = index2addr(L, objindex); - if (ttisnil(L->top - 1)) - mt = NULL; - else { - api_check(L, ttistable(L->top - 1), "table expected"); - mt = hvalue(L->top - 1); - } - switch (ttypenv(obj)) { - case LUA_TTABLE: { - hvalue(obj)->metatable = mt; - if (mt) { - luaC_objbarrierback(L, gcvalue(obj), mt); - luaC_checkfinalizer(L, gcvalue(obj), mt); - } - break; - } - case LUA_TUSERDATA: { - uvalue(obj)->metatable = mt; - if (mt) { - luaC_objbarrier(L, rawuvalue(obj), mt); - luaC_checkfinalizer(L, gcvalue(obj), mt); - } - break; - } - default: { - G(L)->mt[ttypenv(obj)] = mt; - break; - } - } - L->top--; - lua_unlock(L); - return 1; -} - - -LUA_API void lua_setuservalue (lua_State *L, int idx) { - StkId o; - lua_lock(L); - api_checknelems(L, 1); - o = index2addr(L, idx); - api_check(L, ttisuserdata(o), "userdata expected"); - if (ttisnil(L->top - 1)) - uvalue(o)->env = NULL; - else { - api_check(L, ttistable(L->top - 1), "table expected"); - uvalue(o)->env = hvalue(L->top - 1); - 
luaC_objbarrier(L, gcvalue(o), hvalue(L->top - 1)); - } - L->top--; - lua_unlock(L); -} - - -/* -** `load' and `call' functions (run Lua code) -*/ - - -#define checkresults(L,na,nr) \ - api_check(L, (nr) == LUA_MULTRET || (L->ci->top - L->top >= (nr) - (na)), \ - "results from function overflow current stack size") - - -LUA_API int lua_getctx (lua_State *L, int *ctx) { - if (L->ci->callstatus & CIST_YIELDED) { - if (ctx) *ctx = L->ci->u.c.ctx; - return L->ci->u.c.status; - } - else return LUA_OK; -} - - -LUA_API void lua_callk (lua_State *L, int nargs, int nresults, int ctx, - lua_CFunction k) { - StkId func; - lua_lock(L); - api_check(L, k == NULL || !isLua(L->ci), - "cannot use continuations inside hooks"); - api_checknelems(L, nargs+1); - api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread"); - checkresults(L, nargs, nresults); - func = L->top - (nargs+1); - if (k != NULL && L->nny == 0) { /* need to prepare continuation? */ - L->ci->u.c.k = k; /* save continuation */ - L->ci->u.c.ctx = ctx; /* save context */ - luaD_call(L, func, nresults, 1); /* do the call */ - } - else /* no continuation or no yieldable */ - luaD_call(L, func, nresults, 0); /* just do the call */ - adjustresults(L, nresults); - lua_unlock(L); -} - - - -/* -** Execute a protected call. -*/ -struct CallS { /* data to `f_call' */ - StkId func; - int nresults; -}; - - -static void f_call (lua_State *L, void *ud) { - struct CallS *c = cast(struct CallS *, ud); - luaD_call(L, c->func, c->nresults, 0); -} - - - -LUA_API int lua_pcallk (lua_State *L, int nargs, int nresults, int errfunc, - int ctx, lua_CFunction k) { - struct CallS c; - int status; - ptrdiff_t func; - lua_lock(L); - api_check(L, k == NULL || !isLua(L->ci), - "cannot use continuations inside hooks"); - api_checknelems(L, nargs+1); - api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread"); - checkresults(L, nargs, nresults); - if (errfunc == 0) - func = 0; - else { - StkId o = index2addr(L, errfunc); - api_checkstackindex(L, errfunc, o); - func = savestack(L, o); - } - c.func = L->top - (nargs+1); /* function to be called */ - if (k == NULL || L->nny > 0) { /* no continuation or no yieldable? */ - c.nresults = nresults; /* do a 'conventional' protected call */ - status = luaD_pcall(L, f_call, &c, savestack(L, c.func), func); - } - else { /* prepare continuation (call is already protected by 'resume') */ - CallInfo *ci = L->ci; - ci->u.c.k = k; /* save continuation */ - ci->u.c.ctx = ctx; /* save context */ - /* save information for error recovery */ - ci->extra = savestack(L, c.func); - ci->u.c.old_allowhook = L->allowhook; - ci->u.c.old_errfunc = L->errfunc; - L->errfunc = func; - /* mark that function may do error recovery */ - ci->callstatus |= CIST_YPCALL; - luaD_call(L, c.func, nresults, 1); /* do the call */ - ci->callstatus &= ~CIST_YPCALL; - L->errfunc = ci->u.c.old_errfunc; - status = LUA_OK; /* if it is here, there were no errors */ - } - adjustresults(L, nresults); - lua_unlock(L); - return status; -} - - -LUA_API int lua_load (lua_State *L, lua_Reader reader, void *data, - const char *chunkname, const char *mode) { - ZIO z; - int status; - lua_lock(L); - if (!chunkname) chunkname = "?"; - luaZ_init(L, &z, reader, data); - status = luaD_protectedparser(L, &z, chunkname, mode); - if (status == LUA_OK) { /* no errors? */ - LClosure *f = clLvalue(L->top - 1); /* get newly created function */ - if (f->nupvalues == 1) { /* does it have one upvalue? 
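
lua_callk/lua_pcallk above are the entry points for running Lua code; when no continuation is supplied they reduce to the plain lua_call/lua_pcall macros. A minimal sketch of the usual protected pattern against the stock 5.2 API; run_chunk is an illustrative helper, not part of this source:

#include "lua.h"
#include "lauxlib.h"

static int
run_chunk(lua_State *L, const char *src)
{
	int status;

	status = luaL_loadstring(L, src);	/* compile; pushes function or error message */
	if (status != LUA_OK)
		return (status);
	/* 0 args, all results, no error handler; on error a message is left on the stack. */
	return (lua_pcall(L, 0, LUA_MULTRET, 0));
}
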
*/ - /* get global table from registry */ - Table *reg = hvalue(&G(L)->l_registry); - const TValue *gt = luaH_getint(reg, LUA_RIDX_GLOBALS); - /* set global table as 1st upvalue of 'f' (may be LUA_ENV) */ - setobj(L, f->upvals[0]->v, gt); - luaC_barrier(L, f->upvals[0], gt); - } - } - lua_unlock(L); - return status; -} - - -LUA_API int lua_dump (lua_State *L, lua_Writer writer, void *data) { - int status; - TValue *o; - lua_lock(L); - api_checknelems(L, 1); - o = L->top - 1; - if (isLfunction(o)) - status = luaU_dump(L, getproto(o), writer, data, 0); - else - status = 1; - lua_unlock(L); - return status; -} - - -LUA_API int lua_status (lua_State *L) { - return L->status; -} - - -/* -** Garbage-collection function -*/ - -LUA_API int lua_gc (lua_State *L, int what, int data) { - int res = 0; - global_State *g; - lua_lock(L); - g = G(L); - switch (what) { - case LUA_GCSTOP: { - g->gcrunning = 0; - break; - } - case LUA_GCRESTART: { - luaE_setdebt(g, 0); - g->gcrunning = 1; - break; - } - case LUA_GCCOLLECT: { - luaC_fullgc(L, 0); - break; - } - case LUA_GCCOUNT: { - /* GC values are expressed in Kbytes: #bytes/2^10 */ - res = cast_int(gettotalbytes(g) >> 10); - break; - } - case LUA_GCCOUNTB: { - res = cast_int(gettotalbytes(g) & 0x3ff); - break; - } - case LUA_GCSTEP: { - if (g->gckind == KGC_GEN) { /* generational mode? */ - res = (g->GCestimate == 0); /* true if it will do major collection */ - luaC_forcestep(L); /* do a single step */ - } - else { - lu_mem debt = cast(lu_mem, data) * 1024 - GCSTEPSIZE; - if (g->gcrunning) - debt += g->GCdebt; /* include current debt */ - luaE_setdebt(g, debt); - luaC_forcestep(L); - if (g->gcstate == GCSpause) /* end of cycle? */ - res = 1; /* signal it */ - } - break; - } - case LUA_GCSETPAUSE: { - res = g->gcpause; - g->gcpause = data; - break; - } - case LUA_GCSETMAJORINC: { - res = g->gcmajorinc; - g->gcmajorinc = data; - break; - } - case LUA_GCSETSTEPMUL: { - res = g->gcstepmul; - g->gcstepmul = data; - break; - } - case LUA_GCISRUNNING: { - res = g->gcrunning; - break; - } - case LUA_GCGEN: { /* change collector to generational mode */ - luaC_changemode(L, KGC_GEN); - break; - } - case LUA_GCINC: { /* change collector to incremental mode */ - luaC_changemode(L, KGC_NORMAL); - break; - } - default: res = -1; /* invalid option */ - } - lua_unlock(L); - return res; -} - - - -/* -** miscellaneous functions -*/ - - -LUA_API int lua_error (lua_State *L) { - lua_lock(L); - api_checknelems(L, 1); - luaG_errormsg(L); - /* code unreachable; will unlock when control actually leaves the kernel */ - return 0; /* to avoid warnings */ -} - - -LUA_API int lua_next (lua_State *L, int idx) { - StkId t; - int more; - lua_lock(L); - t = index2addr(L, idx); - api_check(L, ttistable(t), "table expected"); - more = luaH_next(L, hvalue(t), L->top - 1); - if (more) { - api_incr_top(L); - } - else /* no more elements */ - L->top -= 1; /* remove key */ - lua_unlock(L); - return more; -} - - -LUA_API void lua_concat (lua_State *L, int n) { - lua_lock(L); - api_checknelems(L, n); - if (n >= 2) { - luaC_checkGC(L); - luaV_concat(L, n); - } - else if (n == 0) { /* push empty string */ - setsvalue2s(L, L->top, luaS_newlstr(L, "", 0)); - api_incr_top(L); - } - /* else n == 1; nothing to do */ - lua_unlock(L); -} - - -LUA_API void lua_len (lua_State *L, int idx) { - StkId t; - lua_lock(L); - t = index2addr(L, idx); - luaV_objlen(L, L->top, t); - api_incr_top(L); - lua_unlock(L); -} - - -LUA_API lua_Alloc lua_getallocf (lua_State *L, void **ud) { - lua_Alloc f; - lua_lock(L); - if 
(ud) *ud = G(L)->ud; - f = G(L)->frealloc; - lua_unlock(L); - return f; -} - - -LUA_API void lua_setallocf (lua_State *L, lua_Alloc f, void *ud) { - lua_lock(L); - G(L)->ud = ud; - G(L)->frealloc = f; - lua_unlock(L); -} - - -LUA_API void *lua_newuserdata (lua_State *L, size_t size) { - Udata *u; - lua_lock(L); - luaC_checkGC(L); - u = luaS_newudata(L, size, NULL); - setuvalue(L, L->top, u); - api_incr_top(L); - lua_unlock(L); - return u + 1; -} - - - -static const char *aux_upvalue (StkId fi, int n, TValue **val, - GCObject **owner) { - switch (ttype(fi)) { - case LUA_TCCL: { /* C closure */ - CClosure *f = clCvalue(fi); - if (!(1 <= n && n <= f->nupvalues)) return NULL; - *val = &f->upvalue[n-1]; - if (owner) *owner = obj2gco(f); - return ""; - } - case LUA_TLCL: { /* Lua closure */ - LClosure *f = clLvalue(fi); - TString *name; - Proto *p = f->p; - if (!(1 <= n && n <= p->sizeupvalues)) return NULL; - *val = f->upvals[n-1]->v; - if (owner) *owner = obj2gco(f->upvals[n - 1]); - name = p->upvalues[n-1].name; - return (name == NULL) ? "" : getstr(name); - } - default: return NULL; /* not a closure */ - } -} - - -LUA_API const char *lua_getupvalue (lua_State *L, int funcindex, int n) { - const char *name; - TValue *val = NULL; /* to avoid warnings */ - lua_lock(L); - name = aux_upvalue(index2addr(L, funcindex), n, &val, NULL); - if (name) { - setobj2s(L, L->top, val); - api_incr_top(L); - } - lua_unlock(L); - return name; -} - - -LUA_API const char *lua_setupvalue (lua_State *L, int funcindex, int n) { - const char *name; - TValue *val = NULL; /* to avoid warnings */ - GCObject *owner = NULL; /* to avoid warnings */ - StkId fi; - lua_lock(L); - fi = index2addr(L, funcindex); - api_checknelems(L, 1); - name = aux_upvalue(fi, n, &val, &owner); - if (name) { - L->top--; - setobj(L, val, L->top); - luaC_barrier(L, owner, L->top); - } - lua_unlock(L); - return name; -} - - -static UpVal **getupvalref (lua_State *L, int fidx, int n, LClosure **pf) { - LClosure *f; - StkId fi = index2addr(L, fidx); - api_check(L, ttisLclosure(fi), "Lua function expected"); - f = clLvalue(fi); - api_check(L, (1 <= n && n <= f->p->sizeupvalues), "invalid upvalue index"); - if (pf) *pf = f; - return &f->upvals[n - 1]; /* get its upvalue pointer */ -} - - -LUA_API void *lua_upvalueid (lua_State *L, int fidx, int n) { - StkId fi = index2addr(L, fidx); - switch (ttype(fi)) { - case LUA_TLCL: { /* lua closure */ - return *getupvalref(L, fidx, n, NULL); - } - case LUA_TCCL: { /* C closure */ - CClosure *f = clCvalue(fi); - api_check(L, 1 <= n && n <= f->nupvalues, "invalid upvalue index"); - return &f->upvalue[n - 1]; - } - default: { - api_check(L, 0, "closure expected"); - return NULL; - } - } -} - - -LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1, - int fidx2, int n2) { - LClosure *f1; - UpVal **up1 = getupvalref(L, fidx1, n1, &f1); - UpVal **up2 = getupvalref(L, fidx2, n2, NULL); - *up1 = *up2; - luaC_objbarrier(L, f1, *up2); -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.h +++ /dev/null @@ -1,176 +0,0 @@ -/* -** $Id: lauxlib.h,v 1.120.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions for building Lua libraries -** See Copyright Notice in lua.h -*/ - - -#ifndef lauxlib_h -#define lauxlib_h - - -#include - -#include "lua.h" - - - -/* extra error code for `luaL_load' */ -#define LUA_ERRFILE (LUA_ERRERR+1) - - -typedef struct luaL_Reg { - 
const char *name; - lua_CFunction func; -} luaL_Reg; - - -LUALIB_API void (luaL_checkversion_) (lua_State *L, lua_Number ver); -#define luaL_checkversion(L) luaL_checkversion_(L, LUA_VERSION_NUM) - -LUALIB_API int (luaL_getmetafield) (lua_State *L, int obj, const char *e); -LUALIB_API int (luaL_callmeta) (lua_State *L, int obj, const char *e); -LUALIB_API const char *(luaL_tolstring) (lua_State *L, int idx, size_t *len); -LUALIB_API int (luaL_argerror) (lua_State *L, int numarg, const char *extramsg); -LUALIB_API const char *(luaL_checklstring) (lua_State *L, int numArg, - size_t *l); -LUALIB_API const char *(luaL_optlstring) (lua_State *L, int numArg, - const char *def, size_t *l); -LUALIB_API lua_Number (luaL_checknumber) (lua_State *L, int numArg); -LUALIB_API lua_Number (luaL_optnumber) (lua_State *L, int nArg, lua_Number def); - -LUALIB_API lua_Integer (luaL_checkinteger) (lua_State *L, int numArg); -LUALIB_API lua_Integer (luaL_optinteger) (lua_State *L, int nArg, - lua_Integer def); -LUALIB_API lua_Unsigned (luaL_checkunsigned) (lua_State *L, int numArg); -LUALIB_API lua_Unsigned (luaL_optunsigned) (lua_State *L, int numArg, - lua_Unsigned def); - -LUALIB_API void (luaL_checkstack) (lua_State *L, int sz, const char *msg); -LUALIB_API void (luaL_checktype) (lua_State *L, int narg, int t); -LUALIB_API void (luaL_checkany) (lua_State *L, int narg); - -LUALIB_API int (luaL_newmetatable) (lua_State *L, const char *tname); -LUALIB_API void (luaL_setmetatable) (lua_State *L, const char *tname); -LUALIB_API void *(luaL_testudata) (lua_State *L, int ud, const char *tname); -LUALIB_API void *(luaL_checkudata) (lua_State *L, int ud, const char *tname); - -LUALIB_API void (luaL_where) (lua_State *L, int lvl); -LUALIB_API int (luaL_error) (lua_State *L, const char *fmt, ...); - -LUALIB_API int (luaL_checkoption) (lua_State *L, int narg, const char *def, - const char *const lst[]); - -/* pre-defined references */ -#define LUA_NOREF (-2) -#define LUA_REFNIL (-1) - -LUALIB_API int (luaL_ref) (lua_State *L, int t); -LUALIB_API void (luaL_unref) (lua_State *L, int t, int ref); - -LUALIB_API int (luaL_loadbufferx) (lua_State *L, const char *buff, size_t sz, - const char *name, const char *mode); -LUALIB_API int (luaL_loadstring) (lua_State *L, const char *s); - -LUALIB_API int (luaL_len) (lua_State *L, int idx); - -LUALIB_API const char *(luaL_gsub) (lua_State *L, const char *s, const char *p, - const char *r); - -LUALIB_API void (luaL_setfuncs) (lua_State *L, const luaL_Reg *l, int nup); - -LUALIB_API int (luaL_getsubtable) (lua_State *L, int idx, const char *fname); - -LUALIB_API void (luaL_traceback) (lua_State *L, lua_State *L1, - const char *msg, int level); - -LUALIB_API void (luaL_requiref) (lua_State *L, const char *modname, - lua_CFunction openf, int glb); - -/* -** =============================================================== -** some useful macros -** =============================================================== -*/ - - -#define luaL_newlibtable(L,l) \ - lua_createtable(L, 0, sizeof(l)/sizeof((l)[0]) - 1) - -#define luaL_newlib(L,l) (luaL_newlibtable(L,l), luaL_setfuncs(L,l,0)) - -#define luaL_argcheck(L, cond,numarg,extramsg) \ - ((void)((cond) || luaL_argerror(L, (numarg), (extramsg)))) -#define luaL_checkstring(L,n) (luaL_checklstring(L, (n), NULL)) -#define luaL_optstring(L,n,d) (luaL_optlstring(L, (n), (d), NULL)) -#define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n))) -#define luaL_optint(L,n,d) ((int)luaL_optinteger(L, (n), (d))) -#define luaL_checklong(L,n) 
((long)luaL_checkinteger(L, (n))) -#define luaL_optlong(L,n,d) ((long)luaL_optinteger(L, (n), (d))) - -#define luaL_typename(L,i) lua_typename(L, lua_type(L,(i))) - -#define luaL_dofile(L, fn) \ - (luaL_loadfile(L, fn) || lua_pcall(L, 0, LUA_MULTRET, 0)) - -#define luaL_dostring(L, s) \ - (luaL_loadstring(L, s) || lua_pcall(L, 0, LUA_MULTRET, 0)) - -#define luaL_getmetatable(L,n) (lua_getfield(L, LUA_REGISTRYINDEX, (n))) - -#define luaL_opt(L,f,n,d) (lua_isnoneornil(L,(n)) ? (d) : f(L,(n))) - -#define luaL_loadbuffer(L,s,sz,n) luaL_loadbufferx(L,s,sz,n,NULL) - - -/* -** {====================================================== -** Generic Buffer manipulation -** ======================================================= -*/ - -typedef struct luaL_Buffer { - char *b; /* buffer address */ - size_t size; /* buffer size */ - size_t n; /* number of characters in buffer */ - lua_State *L; - char initb[LUAL_BUFFERSIZE]; /* initial buffer */ -} luaL_Buffer; - - -#define luaL_addchar(B,c) \ - ((void)((B)->n < (B)->size || luaL_prepbuffsize((B), 1)), \ - ((B)->b[(B)->n++] = (c))) - -#define luaL_addsize(B,s) ((B)->n += (s)) - -LUALIB_API void (luaL_buffinit) (lua_State *L, luaL_Buffer *B); -LUALIB_API char *(luaL_prepbuffsize) (luaL_Buffer *B, size_t sz); -LUALIB_API void (luaL_addlstring) (luaL_Buffer *B, const char *s, size_t l); -LUALIB_API void (luaL_addstring) (luaL_Buffer *B, const char *s); -LUALIB_API void (luaL_addvalue) (luaL_Buffer *B); -LUALIB_API void (luaL_pushresult) (luaL_Buffer *B); -LUALIB_API void (luaL_pushresultsize) (luaL_Buffer *B, size_t sz); -LUALIB_API char *(luaL_buffinitsize) (lua_State *L, luaL_Buffer *B, size_t sz); - -#define luaL_prepbuffer(B) luaL_prepbuffsize(B, LUAL_BUFFERSIZE) - -/* }====================================================== */ - - -/* compatibility with old module system */ -#if defined(LUA_COMPAT_MODULE) - -LUALIB_API void (luaL_pushmodule) (lua_State *L, const char *modname, - int sizehint); -LUALIB_API void (luaL_openlib) (lua_State *L, const char *libname, - const luaL_Reg *l, int nup); - -#define luaL_register(L,n,l) (luaL_openlib(L,(n),(l),0)) - -#endif - - -#endif - - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c +++ /dev/null @@ -1,791 +0,0 @@ -/* -** $Id: lauxlib.c,v 1.248.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions for building Lua libraries -** See Copyright Notice in lua.h -*/ - - -#include - -/* This file uses only the official API of Lua. -** Any function declared here could be written as an application function. -*/ - -#define lauxlib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" - - -/* -** {====================================================== -** Traceback -** ======================================================= -*/ - - -#define LEVELS1 12 /* size of the first part of the stack */ -#define LEVELS2 10 /* size of the second part of the stack */ - - - -/* -** search for 'objidx' in table at index -1. -** return 1 + string at top if find a good name. -*/ -static int findfield (lua_State *L, int objidx, int level) { - if (level == 0 || !lua_istable(L, -1)) - return 0; /* not found */ - lua_pushnil(L); /* start 'next' loop */ - while (lua_next(L, -2)) { /* for each pair in table */ - if (lua_type(L, -2) == LUA_TSTRING) { /* ignore non-string keys */ - if (lua_rawequal(L, objidx, -1)) { /* found object? 
*/ - lua_pop(L, 1); /* remove value (but keep name) */ - return 1; - } - else if (findfield(L, objidx, level - 1)) { /* try recursively */ - lua_remove(L, -2); /* remove table (but keep name) */ - lua_pushliteral(L, "."); - lua_insert(L, -2); /* place '.' between the two names */ - lua_concat(L, 3); - return 1; - } - } - lua_pop(L, 1); /* remove value */ - } - return 0; /* not found */ -} - - -static int pushglobalfuncname (lua_State *L, lua_Debug *ar) { - int top = lua_gettop(L); - lua_getinfo(L, "f", ar); /* push function */ - lua_pushglobaltable(L); - if (findfield(L, top + 1, 2)) { - lua_copy(L, -1, top + 1); /* move name to proper place */ - lua_pop(L, 2); /* remove pushed values */ - return 1; - } - else { - lua_settop(L, top); /* remove function and global table */ - return 0; - } -} - - -static void pushfuncname (lua_State *L, lua_Debug *ar) { - if (*ar->namewhat != '\0') /* is there a name? */ - lua_pushfstring(L, "function " LUA_QS, ar->name); - else if (*ar->what == 'm') /* main? */ - lua_pushliteral(L, "main chunk"); - else if (*ar->what == 'C') { - if (pushglobalfuncname(L, ar)) { - lua_pushfstring(L, "function " LUA_QS, lua_tostring(L, -1)); - lua_remove(L, -2); /* remove name */ - } - else - lua_pushliteral(L, "?"); - } - else - lua_pushfstring(L, "function <%s:%d>", ar->short_src, ar->linedefined); -} - - -static int countlevels (lua_State *L) { - lua_Debug ar; - int li = 1, le = 1; - /* find an upper bound */ - while (lua_getstack(L, le, &ar)) { li = le; le *= 2; } - /* do a binary search */ - while (li < le) { - int m = (li + le)/2; - if (lua_getstack(L, m, &ar)) li = m + 1; - else le = m; - } - return le - 1; -} - - -LUALIB_API void luaL_traceback (lua_State *L, lua_State *L1, - const char *msg, int level) { - lua_Debug ar; - int top = lua_gettop(L); - int numlevels = countlevels(L1); - int mark = (numlevels > LEVELS1 + LEVELS2) ? LEVELS1 : 0; - if (msg) lua_pushfstring(L, "%s\n", msg); - lua_pushliteral(L, "stack traceback:"); - while (lua_getstack(L1, level++, &ar)) { - if (level == mark) { /* too many levels? */ - lua_pushliteral(L, "\n\t..."); /* add a '...' */ - level = numlevels - LEVELS2; /* and skip to last ones */ - } - else { - lua_getinfo(L1, "Slnt", &ar); - lua_pushfstring(L, "\n\t%s:", ar.short_src); - if (ar.currentline > 0) - lua_pushfstring(L, "%d:", ar.currentline); - lua_pushliteral(L, " in "); - pushfuncname(L, &ar); - if (ar.istailcall) - lua_pushliteral(L, "\n\t(...tail calls...)"); - lua_concat(L, lua_gettop(L) - top); - } - } - lua_concat(L, lua_gettop(L) - top); -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Error-report functions -** ======================================================= -*/ - -LUALIB_API int luaL_argerror (lua_State *L, int narg, const char *extramsg) { - lua_Debug ar; - if (!lua_getstack(L, 0, &ar)) /* no stack frame? */ - return luaL_error(L, "bad argument #%d (%s)", narg, extramsg); - lua_getinfo(L, "n", &ar); - if (strcmp(ar.namewhat, "method") == 0) { - narg--; /* do not count `self' */ - if (narg == 0) /* error is in the self argument itself? */ - return luaL_error(L, "calling " LUA_QS " on bad self (%s)", - ar.name, extramsg); - } - if (ar.name == NULL) - ar.name = (pushglobalfuncname(L, &ar)) ? 
lua_tostring(L, -1) : "?"; - return luaL_error(L, "bad argument #%d to " LUA_QS " (%s)", - narg, ar.name, extramsg); -} - - -static int typeerror (lua_State *L, int narg, const char *tname) { - const char *msg = lua_pushfstring(L, "%s expected, got %s", - tname, luaL_typename(L, narg)); - return luaL_argerror(L, narg, msg); -} - - -static void tag_error (lua_State *L, int narg, int tag) { - typeerror(L, narg, lua_typename(L, tag)); -} - - -LUALIB_API void luaL_where (lua_State *L, int level) { - lua_Debug ar; - if (lua_getstack(L, level, &ar)) { /* check function at level */ - lua_getinfo(L, "Sl", &ar); /* get info about it */ - if (ar.currentline > 0) { /* is there info? */ - lua_pushfstring(L, "%s:%d: ", ar.short_src, ar.currentline); - return; - } - } - lua_pushliteral(L, ""); /* else, no information available... */ -} - - -LUALIB_API int luaL_error (lua_State *L, const char *fmt, ...) { - va_list argp; - va_start(argp, fmt); - luaL_where(L, 1); - lua_pushvfstring(L, fmt, argp); - va_end(argp); - lua_concat(L, 2); - return lua_error(L); -} - - -#if !defined(inspectstat) /* { */ - -#if defined(LUA_USE_POSIX) - -#include - -/* -** use appropriate macros to interpret 'pclose' return status -*/ -#define inspectstat(stat,what) \ - if (WIFEXITED(stat)) { stat = WEXITSTATUS(stat); } \ - else if (WIFSIGNALED(stat)) { stat = WTERMSIG(stat); what = "signal"; } - -#else - -#define inspectstat(stat,what) /* no op */ - -#endif - -#endif /* } */ - - -/* }====================================================== */ - - -/* -** {====================================================== -** Userdata's metatable manipulation -** ======================================================= -*/ - -LUALIB_API int luaL_newmetatable (lua_State *L, const char *tname) { - luaL_getmetatable(L, tname); /* try to get metatable */ - if (!lua_isnil(L, -1)) /* name already in use? */ - return 0; /* leave previous value on top, but return 0 */ - lua_pop(L, 1); - lua_newtable(L); /* create metatable */ - lua_pushvalue(L, -1); - lua_setfield(L, LUA_REGISTRYINDEX, tname); /* registry.name = metatable */ - return 1; -} - - -LUALIB_API void luaL_setmetatable (lua_State *L, const char *tname) { - luaL_getmetatable(L, tname); - lua_setmetatable(L, -2); -} - - -LUALIB_API void *luaL_testudata (lua_State *L, int ud, const char *tname) { - void *p = lua_touserdata(L, ud); - if (p != NULL) { /* value is a userdata? */ - if (lua_getmetatable(L, ud)) { /* does it have a metatable? */ - luaL_getmetatable(L, tname); /* get correct metatable */ - if (!lua_rawequal(L, -1, -2)) /* not the same? */ - p = NULL; /* value is a userdata with wrong metatable */ - lua_pop(L, 2); /* remove both metatables */ - return p; - } - } - return NULL; /* value is not a userdata with a metatable */ -} - - -LUALIB_API void *luaL_checkudata (lua_State *L, int ud, const char *tname) { - void *p = luaL_testudata(L, ud, tname); - if (p == NULL) typeerror(L, ud, tname); - return p; -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Argument check functions -** ======================================================= -*/ - -LUALIB_API int luaL_checkoption (lua_State *L, int narg, const char *def, - const char *const lst[]) { - const char *name = (def) ? 
luaL_optstring(L, narg, def) : - luaL_checkstring(L, narg); - int i; - for (i=0; lst[i]; i++) - if (strcmp(lst[i], name) == 0) - return i; - return luaL_argerror(L, narg, - lua_pushfstring(L, "invalid option " LUA_QS, name)); -} - - -LUALIB_API void luaL_checkstack (lua_State *L, int space, const char *msg) { - /* keep some extra space to run error routines, if needed */ - const int extra = LUA_MINSTACK; - if (!lua_checkstack(L, space + extra)) { - if (msg) - luaL_error(L, "stack overflow (%s)", msg); - else - luaL_error(L, "stack overflow"); - } -} - - -LUALIB_API void luaL_checktype (lua_State *L, int narg, int t) { - if (lua_type(L, narg) != t) - tag_error(L, narg, t); -} - - -LUALIB_API void luaL_checkany (lua_State *L, int narg) { - if (lua_type(L, narg) == LUA_TNONE) - luaL_argerror(L, narg, "value expected"); -} - - -LUALIB_API const char *luaL_checklstring (lua_State *L, int narg, size_t *len) { - const char *s = lua_tolstring(L, narg, len); - if (!s) tag_error(L, narg, LUA_TSTRING); - return s; -} - - -LUALIB_API const char *luaL_optlstring (lua_State *L, int narg, - const char *def, size_t *len) { - if (lua_isnoneornil(L, narg)) { - if (len) - *len = (def ? strlen(def) : 0); - return def; - } - else return luaL_checklstring(L, narg, len); -} - - -LUALIB_API lua_Number luaL_checknumber (lua_State *L, int narg) { - int isnum; - lua_Number d = lua_tonumberx(L, narg, &isnum); - if (!isnum) - tag_error(L, narg, LUA_TNUMBER); - return d; -} - - -LUALIB_API lua_Number luaL_optnumber (lua_State *L, int narg, lua_Number def) { - return luaL_opt(L, luaL_checknumber, narg, def); -} - - -LUALIB_API lua_Integer luaL_checkinteger (lua_State *L, int narg) { - int isnum; - lua_Integer d = lua_tointegerx(L, narg, &isnum); - if (!isnum) - tag_error(L, narg, LUA_TNUMBER); - return d; -} - - -LUALIB_API lua_Unsigned luaL_checkunsigned (lua_State *L, int narg) { - int isnum; - lua_Unsigned d = lua_tounsignedx(L, narg, &isnum); - if (!isnum) - tag_error(L, narg, LUA_TNUMBER); - return d; -} - - -LUALIB_API lua_Integer luaL_optinteger (lua_State *L, int narg, - lua_Integer def) { - return luaL_opt(L, luaL_checkinteger, narg, def); -} - - -LUALIB_API lua_Unsigned luaL_optunsigned (lua_State *L, int narg, - lua_Unsigned def) { - return luaL_opt(L, luaL_checkunsigned, narg, def); -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Generic Buffer manipulation -** ======================================================= -*/ - -/* -** check whether buffer is using a userdata on the stack as a temporary -** buffer -*/ -#define buffonstack(B) ((B)->b != (B)->initb) - - -/* -** returns a pointer to a free area with at least 'sz' bytes -*/ -LUALIB_API char *luaL_prepbuffsize (luaL_Buffer *B, size_t sz) { - lua_State *L = B->L; - if (B->size - B->n < sz) { /* not enough space? */ - char *newbuff; - size_t newsize = B->size * 2; /* double buffer size */ - if (newsize - B->n < sz) /* not big enough? 
*/ - newsize = B->n + sz; - if (newsize < B->n || newsize - B->n < sz) - luaL_error(L, "buffer too large"); - /* create larger buffer */ - newbuff = (char *)lua_newuserdata(L, newsize * sizeof(char)); - /* move content to new buffer */ - memcpy(newbuff, B->b, B->n * sizeof(char)); - if (buffonstack(B)) - lua_remove(L, -2); /* remove old buffer */ - B->b = newbuff; - B->size = newsize; - } - return &B->b[B->n]; -} - - -LUALIB_API void luaL_addlstring (luaL_Buffer *B, const char *s, size_t l) { - char *b = luaL_prepbuffsize(B, l); - memcpy(b, s, l * sizeof(char)); - luaL_addsize(B, l); -} - - -LUALIB_API void luaL_addstring (luaL_Buffer *B, const char *s) { - luaL_addlstring(B, s, strlen(s)); -} - - -LUALIB_API void luaL_pushresult (luaL_Buffer *B) { - lua_State *L = B->L; - lua_pushlstring(L, B->b, B->n); - if (buffonstack(B)) - lua_remove(L, -2); /* remove old buffer */ -} - - -LUALIB_API void luaL_pushresultsize (luaL_Buffer *B, size_t sz) { - luaL_addsize(B, sz); - luaL_pushresult(B); -} - - -LUALIB_API void luaL_addvalue (luaL_Buffer *B) { - lua_State *L = B->L; - size_t l; - const char *s = lua_tolstring(L, -1, &l); - if (buffonstack(B)) - lua_insert(L, -2); /* put value below buffer */ - luaL_addlstring(B, s, l); - lua_remove(L, (buffonstack(B)) ? -2 : -1); /* remove value */ -} - - -LUALIB_API void luaL_buffinit (lua_State *L, luaL_Buffer *B) { - B->L = L; - B->b = B->initb; - B->n = 0; - B->size = LUAL_BUFFERSIZE; -} - - -LUALIB_API char *luaL_buffinitsize (lua_State *L, luaL_Buffer *B, size_t sz) { - luaL_buffinit(L, B); - return luaL_prepbuffsize(B, sz); -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Reference system -** ======================================================= -*/ - -/* index of free-list header */ -#define freelist 0 - - -LUALIB_API int luaL_ref (lua_State *L, int t) { - int ref; - if (lua_isnil(L, -1)) { - lua_pop(L, 1); /* remove from stack */ - return LUA_REFNIL; /* `nil' has a unique fixed reference */ - } - t = lua_absindex(L, t); - lua_rawgeti(L, t, freelist); /* get first free element */ - ref = (int)lua_tointeger(L, -1); /* ref = t[freelist] */ - lua_pop(L, 1); /* remove it from stack */ - if (ref != 0) { /* any free element? 
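
The luaL_Buffer machinery above accumulates a string incrementally, spilling from the fixed initb array to a userdata on the stack once it outgrows LUAL_BUFFERSIZE. A small usage sketch against the standard 5.2 auxiliary API; join_args is an illustrative C function, not part of this source:

#include "lua.h"
#include "lauxlib.h"

/* Concatenate all string arguments, comma separated, and return the result. */
static int
join_args(lua_State *L)
{
	int i, n = lua_gettop(L);
	luaL_Buffer b;

	luaL_buffinit(L, &b);
	for (i = 1; i <= n; i++) {
		luaL_addstring(&b, luaL_checkstring(L, i));
		if (i < n)
			luaL_addchar(&b, ',');
	}
	luaL_pushresult(&b);	/* push the accumulated string */
	return (1);
}
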
*/ - lua_rawgeti(L, t, ref); /* remove it from list */ - lua_rawseti(L, t, freelist); /* (t[freelist] = t[ref]) */ - } - else /* no free elements */ - ref = (int)lua_rawlen(L, t) + 1; /* get a new reference */ - lua_rawseti(L, t, ref); - return ref; -} - - -LUALIB_API void luaL_unref (lua_State *L, int t, int ref) { - if (ref >= 0) { - t = lua_absindex(L, t); - lua_rawgeti(L, t, freelist); - lua_rawseti(L, t, ref); /* t[ref] = t[freelist] */ - lua_pushinteger(L, ref); - lua_rawseti(L, t, freelist); /* t[freelist] = ref */ - } -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Load functions -** ======================================================= -*/ - -typedef struct LoadS { - const char *s; - size_t size; -} LoadS; - - -static const char *getS (lua_State *L, void *ud, size_t *size) { - LoadS *ls = (LoadS *)ud; - (void)L; /* not used */ - if (ls->size == 0) return NULL; - *size = ls->size; - ls->size = 0; - return ls->s; -} - - -LUALIB_API int luaL_loadbufferx (lua_State *L, const char *buff, size_t size, - const char *name, const char *mode) { - LoadS ls; - ls.s = buff; - ls.size = size; - return lua_load(L, getS, &ls, name, mode); -} - - -LUALIB_API int luaL_loadstring (lua_State *L, const char *s) { - return luaL_loadbuffer(L, s, strlen(s), s); -} - -/* }====================================================== */ - - - -LUALIB_API int luaL_getmetafield (lua_State *L, int obj, const char *event) { - if (!lua_getmetatable(L, obj)) /* no metatable? */ - return 0; - lua_pushstring(L, event); - lua_rawget(L, -2); - if (lua_isnil(L, -1)) { - lua_pop(L, 2); /* remove metatable and metafield */ - return 0; - } - else { - lua_remove(L, -2); /* remove only metatable */ - return 1; - } -} - - -LUALIB_API int luaL_callmeta (lua_State *L, int obj, const char *event) { - obj = lua_absindex(L, obj); - if (!luaL_getmetafield(L, obj, event)) /* no metafield? */ - return 0; - lua_pushvalue(L, obj); - lua_call(L, 1, 1); - return 1; -} - - -LUALIB_API int luaL_len (lua_State *L, int idx) { - int l; - int isnum; - lua_len(L, idx); - l = (int)lua_tointegerx(L, -1, &isnum); - if (!isnum) - luaL_error(L, "object length is not a number"); - lua_pop(L, 1); /* remove object */ - return l; -} - - -LUALIB_API const char *luaL_tolstring (lua_State *L, int idx, size_t *len) { - if (!luaL_callmeta(L, idx, "__tostring")) { /* no metafield? */ - switch (lua_type(L, idx)) { - case LUA_TNUMBER: - case LUA_TSTRING: - lua_pushvalue(L, idx); - break; - case LUA_TBOOLEAN: - lua_pushstring(L, (lua_toboolean(L, idx) ? "true" : "false")); - break; - case LUA_TNIL: - lua_pushliteral(L, "nil"); - break; - default: - lua_pushfstring(L, "%s: %p", luaL_typename(L, idx), - lua_topointer(L, idx)); - break; - } - } - return lua_tolstring(L, -1, len); -} - - -/* -** {====================================================== -** Compatibility with 5.1 module functions -** ======================================================= -*/ -#if defined(LUA_COMPAT_MODULE) - -static const char *luaL_findtable (lua_State *L, int idx, - const char *fname, int szhint) { - const char *e; - if (idx) lua_pushvalue(L, idx); - do { - e = strchr(fname, '.'); - if (e == NULL) e = fname + strlen(fname); - lua_pushlstring(L, fname, e - fname); - lua_rawget(L, -2); - if (lua_isnil(L, -1)) { /* no such field? */ - lua_pop(L, 1); /* remove this nil */ - lua_createtable(L, 0, (*e == '.' ? 
1 : szhint)); /* new table for field */ - lua_pushlstring(L, fname, e - fname); - lua_pushvalue(L, -2); - lua_settable(L, -4); /* set new table into field */ - } - else if (!lua_istable(L, -1)) { /* field has a non-table value? */ - lua_pop(L, 2); /* remove table and value */ - return fname; /* return problematic part of the name */ - } - lua_remove(L, -2); /* remove previous table */ - fname = e + 1; - } while (*e == '.'); - return NULL; -} - - -/* -** Count number of elements in a luaL_Reg list. -*/ -static int libsize (const luaL_Reg *l) { - int size = 0; - for (; l && l->name; l++) size++; - return size; -} - - -/* -** Find or create a module table with a given name. The function -** first looks at the _LOADED table and, if that fails, try a -** global variable with that name. In any case, leaves on the stack -** the module table. -*/ -LUALIB_API void luaL_pushmodule (lua_State *L, const char *modname, - int sizehint) { - luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 1); /* get _LOADED table */ - lua_getfield(L, -1, modname); /* get _LOADED[modname] */ - if (!lua_istable(L, -1)) { /* not found? */ - lua_pop(L, 1); /* remove previous result */ - /* try global variable (and create one if it does not exist) */ - lua_pushglobaltable(L); - if (luaL_findtable(L, 0, modname, sizehint) != NULL) - luaL_error(L, "name conflict for module " LUA_QS, modname); - lua_pushvalue(L, -1); - lua_setfield(L, -3, modname); /* _LOADED[modname] = new table */ - } - lua_remove(L, -2); /* remove _LOADED table */ -} - - -LUALIB_API void luaL_openlib (lua_State *L, const char *libname, - const luaL_Reg *l, int nup) { - luaL_checkversion(L); - if (libname) { - luaL_pushmodule(L, libname, libsize(l)); /* get/create library table */ - lua_insert(L, -(nup + 1)); /* move library table to below upvalues */ - } - if (l) - luaL_setfuncs(L, l, nup); - else - lua_pop(L, nup); /* remove upvalues */ -} - -#endif -/* }====================================================== */ - -/* -** set functions from list 'l' into table at top - 'nup'; each -** function gets the 'nup' elements at the top as upvalues. -** Returns with only the table at the stack. -*/ -LUALIB_API void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) { - luaL_checkversion(L); - luaL_checkstack(L, nup, "too many upvalues"); - for (; l->name != NULL; l++) { /* fill the table with given functions */ - int i; - for (i = 0; i < nup; i++) /* copy upvalues to the top */ - lua_pushvalue(L, -nup); - lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */ - lua_setfield(L, -(nup + 2), l->name); - } - lua_pop(L, nup); /* remove upvalues */ -} - - -/* -** ensure that stack[idx][fname] has a table and push that table -** into the stack -*/ -LUALIB_API int luaL_getsubtable (lua_State *L, int idx, const char *fname) { - lua_getfield(L, idx, fname); - if (lua_istable(L, -1)) return 1; /* table already there */ - else { - lua_pop(L, 1); /* remove previous result */ - idx = lua_absindex(L, idx); - lua_newtable(L); - lua_pushvalue(L, -1); /* copy to be left at top */ - lua_setfield(L, idx, fname); /* assign new table to field */ - return 0; /* false, because did not find table there */ - } -} - - -/* -** stripped-down 'require'. Calls 'openf' to open a module, -** registers the result in 'package.loaded' table and, if 'glb' -** is true, also registers the result in the global table. -** Leaves resulting module on the top. 
-*/ -LUALIB_API void luaL_requiref (lua_State *L, const char *modname, - lua_CFunction openf, int glb) { - lua_pushcfunction(L, openf); - lua_pushstring(L, modname); /* argument to open function */ - lua_call(L, 1, 1); /* open module */ - luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED"); - lua_pushvalue(L, -2); /* make copy of module (call result) */ - lua_setfield(L, -2, modname); /* _LOADED[modname] = module */ - lua_pop(L, 1); /* remove _LOADED table */ - if (glb) { - lua_pushvalue(L, -1); /* copy of 'mod' */ - lua_setglobal(L, modname); /* _G[modname] = module */ - } -} - - -LUALIB_API const char *luaL_gsub (lua_State *L, const char *s, const char *p, - const char *r) { - const char *wild; - size_t l = strlen(p); - luaL_Buffer b; - luaL_buffinit(L, &b); - while ((wild = strstr(s, p)) != NULL) { - luaL_addlstring(&b, s, wild - s); /* push prefix */ - luaL_addstring(&b, r); /* push replacement in place of pattern */ - s = wild + l; /* continue after `p' */ - } - luaL_addstring(&b, s); /* push last suffix */ - luaL_pushresult(&b); - return lua_tostring(L, -1); -} - - -LUALIB_API void luaL_checkversion_ (lua_State *L, lua_Number ver) { - const lua_Number *v = lua_version(L); - if (v != lua_version(NULL)) - luaL_error(L, "multiple Lua VMs detected"); - else if (*v != ver) - luaL_error(L, "version mismatch: app. needs %f, Lua core provides %f", - ver, *v); - /* check conversions number -> integer types */ - lua_pushnumber(L, -(lua_Number)0x1234); - if (lua_tointeger(L, -1) != -0x1234 || - lua_tounsigned(L, -1) != (lua_Unsigned)-0x1234) - luaL_error(L, "bad conversion number->int;" - " must recompile Lua with proper settings"); - lua_pop(L, 1); -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c +++ /dev/null @@ -1,296 +0,0 @@ -/* -** $Id: lbaselib.c,v 1.276.1.1 2013/04/12 18:48:47 roberto Exp $ -** Basic library -** See Copyright Notice in lua.h -*/ - -/* The following built-in lua functions have been removed and are not available - * for use in ZFS channel programs: - * - * dofile - * loadfile - * load - * pcall - * print - * xpcall - */ - -#include -#include -#ifdef illumos -#define toupper(C) (((C) >= 'a' && (C) <= 'z')? (C) - 'a' + 'A': (C)) -#else -#define isalnum(C) (isalpha(C) || isdigit(C)) -#endif - -#define lbaselib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - -#define SPACECHARS " \f\n\r\t\v" - -static int luaB_tonumber (lua_State *L) { - if (lua_isnoneornil(L, 2)) { /* standard conversion */ - int isnum; - lua_Number n = lua_tonumberx(L, 1, &isnum); - if (isnum) { - lua_pushnumber(L, n); - return 1; - } /* else not a number; must be something */ - luaL_checkany(L, 1); - } - else { - size_t l; - const char *s = luaL_checklstring(L, 1, &l); - const char *e = s + l; /* end point for 's' */ - int base = luaL_checkint(L, 2); - int neg = 0; - luaL_argcheck(L, 2 <= base && base <= 36, 2, "base out of range"); - s += strspn(s, SPACECHARS); /* skip initial spaces */ - if (*s == '-') { s++; neg = 1; } /* handle signal */ - else if (*s == '+') s++; - if (isalnum((unsigned char)*s)) { - lua_Number n = 0; - do { - int digit = (isdigit((unsigned char)*s)) ? 
*s - '0' - : toupper((unsigned char)*s) - 'A' + 10; - if (digit >= base) break; /* invalid numeral; force a fail */ - n = n * (lua_Number)base + (lua_Number)digit; - s++; - } while (isalnum((unsigned char)*s)); - s += strspn(s, SPACECHARS); /* skip trailing spaces */ - if (s == e) { /* no invalid trailing characters? */ - lua_pushnumber(L, (neg) ? -n : n); - return 1; - } /* else not a number */ - } /* else not a number */ - } - lua_pushnil(L); /* not a number */ - return 1; -} - - -static int luaB_error (lua_State *L) { - int level = luaL_optint(L, 2, 1); - lua_settop(L, 1); - if (lua_isstring(L, 1) && level > 0) { /* add extra information? */ - luaL_where(L, level); - lua_pushvalue(L, 1); - lua_concat(L, 2); - } - return lua_error(L); -} - - -static int luaB_getmetatable (lua_State *L) { - luaL_checkany(L, 1); - if (!lua_getmetatable(L, 1)) { - lua_pushnil(L); - return 1; /* no metatable */ - } - luaL_getmetafield(L, 1, "__metatable"); - return 1; /* returns either __metatable field (if present) or metatable */ -} - - -static int luaB_setmetatable (lua_State *L) { - int t = lua_type(L, 2); - luaL_checktype(L, 1, LUA_TTABLE); - luaL_argcheck(L, t == LUA_TNIL || t == LUA_TTABLE, 2, - "nil or table expected"); - if (luaL_getmetafield(L, 1, "__metatable")) - return luaL_error(L, "cannot change a protected metatable"); - lua_settop(L, 2); - lua_setmetatable(L, 1); - return 1; -} - - -static int luaB_rawequal (lua_State *L) { - luaL_checkany(L, 1); - luaL_checkany(L, 2); - lua_pushboolean(L, lua_rawequal(L, 1, 2)); - return 1; -} - - -static int luaB_rawlen (lua_State *L) { - int t = lua_type(L, 1); - luaL_argcheck(L, t == LUA_TTABLE || t == LUA_TSTRING, 1, - "table or string expected"); - lua_pushinteger(L, lua_rawlen(L, 1)); - return 1; -} - - -static int luaB_rawget (lua_State *L) { - luaL_checktype(L, 1, LUA_TTABLE); - luaL_checkany(L, 2); - lua_settop(L, 2); - lua_rawget(L, 1); - return 1; -} - -static int luaB_rawset (lua_State *L) { - luaL_checktype(L, 1, LUA_TTABLE); - luaL_checkany(L, 2); - luaL_checkany(L, 3); - lua_settop(L, 3); - lua_rawset(L, 1); - return 1; -} - - -static int luaB_collectgarbage (lua_State *L) { - static const char *const opts[] = {"stop", "restart", "collect", - "count", "step", "setpause", "setstepmul", - "setmajorinc", "isrunning", "generational", "incremental", NULL}; - static const int optsnum[] = {LUA_GCSTOP, LUA_GCRESTART, LUA_GCCOLLECT, - LUA_GCCOUNT, LUA_GCSTEP, LUA_GCSETPAUSE, LUA_GCSETSTEPMUL, - LUA_GCSETMAJORINC, LUA_GCISRUNNING, LUA_GCGEN, LUA_GCINC}; - int o = optsnum[luaL_checkoption(L, 1, "collect", opts)]; - int ex = luaL_optint(L, 2, 0); - int res = lua_gc(L, o, ex); - switch (o) { - case LUA_GCCOUNT: { - int b = lua_gc(L, LUA_GCCOUNTB, 0); - lua_pushnumber(L, res + ((lua_Number)b/1024)); - lua_pushinteger(L, b); - return 2; - } - case LUA_GCSTEP: case LUA_GCISRUNNING: { - lua_pushboolean(L, res); - return 1; - } - default: { - lua_pushinteger(L, res); - return 1; - } - } -} - - -static int luaB_type (lua_State *L) { - luaL_checkany(L, 1); - lua_pushstring(L, luaL_typename(L, 1)); - return 1; -} - - -static int pairsmeta (lua_State *L, const char *method, int iszero, - lua_CFunction iter) { - if (!luaL_getmetafield(L, 1, method)) { /* no metamethod? 
*/ - luaL_checktype(L, 1, LUA_TTABLE); /* argument must be a table */ - lua_pushcfunction(L, iter); /* will return generator, */ - lua_pushvalue(L, 1); /* state, */ - if (iszero) lua_pushinteger(L, 0); /* and initial value */ - else lua_pushnil(L); - } - else { - lua_pushvalue(L, 1); /* argument 'self' to metamethod */ - lua_call(L, 1, 3); /* get 3 values from metamethod */ - } - return 3; -} - - -static int luaB_next (lua_State *L) { - luaL_checktype(L, 1, LUA_TTABLE); - lua_settop(L, 2); /* create a 2nd argument if there isn't one */ - if (lua_next(L, 1)) - return 2; - else { - lua_pushnil(L); - return 1; - } -} - - -static int luaB_pairs (lua_State *L) { - return pairsmeta(L, "__pairs", 0, luaB_next); -} - - -static int ipairsaux (lua_State *L) { - int i = luaL_checkint(L, 2); - luaL_checktype(L, 1, LUA_TTABLE); - i++; /* next value */ - lua_pushinteger(L, i); - lua_rawgeti(L, 1, i); - return (lua_isnil(L, -1)) ? 1 : 2; -} - - -static int luaB_ipairs (lua_State *L) { - return pairsmeta(L, "__ipairs", 1, ipairsaux); -} - - -static int luaB_assert (lua_State *L) { - if (!lua_toboolean(L, 1)) - return luaL_error(L, "%s", luaL_optstring(L, 2, "assertion failed!")); - return lua_gettop(L); -} - - -static int luaB_select (lua_State *L) { - int n = lua_gettop(L); - if (lua_type(L, 1) == LUA_TSTRING && *lua_tostring(L, 1) == '#') { - lua_pushinteger(L, n-1); - return 1; - } - else { - int i = luaL_checkint(L, 1); - if (i < 0) i = n + i; - else if (i > n) i = n; - luaL_argcheck(L, 1 <= i, 1, "index out of range"); - return n - i; - } -} - -static int luaB_tostring (lua_State *L) { - luaL_checkany(L, 1); - luaL_tolstring(L, 1, NULL); - return 1; -} - -static const luaL_Reg base_funcs[] = { - {"assert", luaB_assert}, - {"collectgarbage", luaB_collectgarbage}, - {"error", luaB_error}, - {"getmetatable", luaB_getmetatable}, - {"ipairs", luaB_ipairs}, -#if defined(LUA_COMPAT_LOADSTRING) - {"loadstring", luaB_load}, -#endif - {"next", luaB_next}, - {"pairs", luaB_pairs}, - {"rawequal", luaB_rawequal}, - {"rawlen", luaB_rawlen}, - {"rawget", luaB_rawget}, - {"rawset", luaB_rawset}, - {"select", luaB_select}, - {"setmetatable", luaB_setmetatable}, - {"tonumber", luaB_tonumber}, - {"tostring", luaB_tostring}, - {"type", luaB_type}, - {NULL, NULL} -}; - - -LUAMOD_API int luaopen_base (lua_State *L) { - /* set global _G */ - lua_pushglobaltable(L); - lua_pushglobaltable(L); - lua_setfield(L, -2, "_G"); - /* open lib into global table */ - luaL_setfuncs(L, base_funcs, 0); - lua_pushliteral(L, LUA_VERSION); - lua_setfield(L, -2, "_VERSION"); /* set global _VERSION */ - return 1; -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c +++ /dev/null @@ -1,212 +0,0 @@ -/* -** $Id: lbitlib.c,v 1.18.1.2 2013/07/09 18:01:41 roberto Exp $ -** Standard library for bitwise operations -** See Copyright Notice in lua.h -*/ - -#define lbitlib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - - -/* number of bits to consider in a number */ -#if !defined(LUA_NBITS) -#define LUA_NBITS 32 -#endif - - -#define ALLONES (~(((~(lua_Unsigned)0) << (LUA_NBITS - 1)) << 1)) - -/* macro to trim extra bits */ -#define trim(x) ((x) & ALLONES) - - -/* builds a number with 'n' ones (1 <= n <= LUA_NBITS) */ -#define mask(n) (~((ALLONES << 1) << ((n) - 1))) - - -typedef lua_Unsigned b_uint; - - - -static b_uint andaux (lua_State *L) { - int i, n = 
lua_gettop(L); - b_uint r = ~(b_uint)0; - for (i = 1; i <= n; i++) - r &= luaL_checkunsigned(L, i); - return trim(r); -} - - -static int b_and (lua_State *L) { - b_uint r = andaux(L); - lua_pushunsigned(L, r); - return 1; -} - - -static int b_test (lua_State *L) { - b_uint r = andaux(L); - lua_pushboolean(L, r != 0); - return 1; -} - - -static int b_or (lua_State *L) { - int i, n = lua_gettop(L); - b_uint r = 0; - for (i = 1; i <= n; i++) - r |= luaL_checkunsigned(L, i); - lua_pushunsigned(L, trim(r)); - return 1; -} - - -static int b_xor (lua_State *L) { - int i, n = lua_gettop(L); - b_uint r = 0; - for (i = 1; i <= n; i++) - r ^= luaL_checkunsigned(L, i); - lua_pushunsigned(L, trim(r)); - return 1; -} - - -static int b_not (lua_State *L) { - b_uint r = ~luaL_checkunsigned(L, 1); - lua_pushunsigned(L, trim(r)); - return 1; -} - - -static int b_shift (lua_State *L, b_uint r, int i) { - if (i < 0) { /* shift right? */ - i = -i; - r = trim(r); - if (i >= LUA_NBITS) r = 0; - else r >>= i; - } - else { /* shift left */ - if (i >= LUA_NBITS) r = 0; - else r <<= i; - r = trim(r); - } - lua_pushunsigned(L, r); - return 1; -} - - -static int b_lshift (lua_State *L) { - return b_shift(L, luaL_checkunsigned(L, 1), luaL_checkint(L, 2)); -} - - -static int b_rshift (lua_State *L) { - return b_shift(L, luaL_checkunsigned(L, 1), -luaL_checkint(L, 2)); -} - - -static int b_arshift (lua_State *L) { - b_uint r = luaL_checkunsigned(L, 1); - int i = luaL_checkint(L, 2); - if (i < 0 || !(r & ((b_uint)1 << (LUA_NBITS - 1)))) - return b_shift(L, r, -i); - else { /* arithmetic shift for 'negative' number */ - if (i >= LUA_NBITS) r = ALLONES; - else - r = trim((r >> i) | ~(~(b_uint)0 >> i)); /* add signal bit */ - lua_pushunsigned(L, r); - return 1; - } -} - - -static int b_rot (lua_State *L, int i) { - b_uint r = luaL_checkunsigned(L, 1); - i &= (LUA_NBITS - 1); /* i = i % NBITS */ - r = trim(r); - if (i != 0) /* avoid undefined shift of LUA_NBITS when i == 0 */ - r = (r << i) | (r >> (LUA_NBITS - i)); - lua_pushunsigned(L, trim(r)); - return 1; -} - - -static int b_lrot (lua_State *L) { - return b_rot(L, luaL_checkint(L, 2)); -} - - -static int b_rrot (lua_State *L) { - return b_rot(L, -luaL_checkint(L, 2)); -} - - -/* -** get field and width arguments for field-manipulation functions, -** checking whether they are valid. -** ('luaL_error' called without 'return' to avoid later warnings about -** 'width' being used uninitialized.) 
-*/ -static int fieldargs (lua_State *L, int farg, int *width) { - int f = luaL_checkint(L, farg); - int w = luaL_optint(L, farg + 1, 1); - luaL_argcheck(L, 0 <= f, farg, "field cannot be negative"); - luaL_argcheck(L, 0 < w, farg + 1, "width must be positive"); - if (f + w > LUA_NBITS) - luaL_error(L, "trying to access non-existent bits"); - *width = w; - return f; -} - - -static int b_extract (lua_State *L) { - int w; - b_uint r = luaL_checkunsigned(L, 1); - int f = fieldargs(L, 2, &w); - r = (r >> f) & mask(w); - lua_pushunsigned(L, r); - return 1; -} - - -static int b_replace (lua_State *L) { - int w; - b_uint r = luaL_checkunsigned(L, 1); - b_uint v = luaL_checkunsigned(L, 2); - int f = fieldargs(L, 3, &w); - int m = mask(w); - v &= m; /* erase bits outside given width */ - r = (r & ~(m << f)) | (v << f); - lua_pushunsigned(L, r); - return 1; -} - - -static const luaL_Reg bitlib[] = { - {"arshift", b_arshift}, - {"band", b_and}, - {"bnot", b_not}, - {"bor", b_or}, - {"bxor", b_xor}, - {"btest", b_test}, - {"extract", b_extract}, - {"lrotate", b_lrot}, - {"lshift", b_lshift}, - {"replace", b_replace}, - {"rrotate", b_rrot}, - {"rshift", b_rshift}, - {NULL, NULL} -}; - - - -LUAMOD_API int luaopen_bit32 (lua_State *L) { - luaL_newlib(L, bitlib); - return 1; -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.h +++ /dev/null @@ -1,83 +0,0 @@ -/* -** $Id: lcode.h,v 1.58.1.1 2013/04/12 18:48:47 roberto Exp $ -** Code generator for Lua -** See Copyright Notice in lua.h -*/ - -#ifndef lcode_h -#define lcode_h - -#include "llex.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lparser.h" - - -/* -** Marks the end of a patch list. It is an invalid value both as an absolute -** address, and as a list link (would link an element to itself). 
-*/ -#define NO_JUMP (-1) - - -/* -** grep "ORDER OPR" if you change these enums (ORDER OP) -*/ -typedef enum BinOpr { - OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW, - OPR_CONCAT, - OPR_EQ, OPR_LT, OPR_LE, - OPR_NE, OPR_GT, OPR_GE, - OPR_AND, OPR_OR, - OPR_NOBINOPR -} BinOpr; - - -typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr; - - -#define getcode(fs,e) ((fs)->f->code[(e)->u.info]) - -#define luaK_codeAsBx(fs,o,A,sBx) luaK_codeABx(fs,o,A,(sBx)+MAXARG_sBx) - -#define luaK_setmultret(fs,e) luaK_setreturns(fs, e, LUA_MULTRET) - -#define luaK_jumpto(fs,t) luaK_patchlist(fs, luaK_jump(fs), t) - -LUAI_FUNC int luaK_codeABx (FuncState *fs, OpCode o, int A, unsigned int Bx); -LUAI_FUNC int luaK_codeABC (FuncState *fs, OpCode o, int A, int B, int C); -LUAI_FUNC int luaK_codek (FuncState *fs, int reg, int k); -LUAI_FUNC void luaK_fixline (FuncState *fs, int line); -LUAI_FUNC void luaK_nil (FuncState *fs, int from, int n); -LUAI_FUNC void luaK_reserveregs (FuncState *fs, int n); -LUAI_FUNC void luaK_checkstack (FuncState *fs, int n); -LUAI_FUNC int luaK_stringK (FuncState *fs, TString *s); -LUAI_FUNC int luaK_numberK (FuncState *fs, lua_Number r); -LUAI_FUNC void luaK_dischargevars (FuncState *fs, expdesc *e); -LUAI_FUNC int luaK_exp2anyreg (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_exp2anyregup (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_exp2nextreg (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_exp2val (FuncState *fs, expdesc *e); -LUAI_FUNC int luaK_exp2RK (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_self (FuncState *fs, expdesc *e, expdesc *key); -LUAI_FUNC void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k); -LUAI_FUNC void luaK_goiftrue (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_goiffalse (FuncState *fs, expdesc *e); -LUAI_FUNC void luaK_storevar (FuncState *fs, expdesc *var, expdesc *e); -LUAI_FUNC void luaK_setreturns (FuncState *fs, expdesc *e, int nresults); -LUAI_FUNC void luaK_setoneret (FuncState *fs, expdesc *e); -LUAI_FUNC int luaK_jump (FuncState *fs); -LUAI_FUNC void luaK_ret (FuncState *fs, int first, int nret); -LUAI_FUNC void luaK_patchlist (FuncState *fs, int list, int target); -LUAI_FUNC void luaK_patchtohere (FuncState *fs, int list); -LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level); -LUAI_FUNC void luaK_concat (FuncState *fs, int *l1, int l2); -LUAI_FUNC int luaK_getlabel (FuncState *fs); -LUAI_FUNC void luaK_prefix (FuncState *fs, UnOpr op, expdesc *v, int line); -LUAI_FUNC void luaK_infix (FuncState *fs, BinOpr op, expdesc *v); -LUAI_FUNC void luaK_posfix (FuncState *fs, BinOpr op, expdesc *v1, - expdesc *v2, int line); -LUAI_FUNC void luaK_setlist (FuncState *fs, int base, int nelems, int tostore); - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c +++ /dev/null @@ -1,885 +0,0 @@ -/* -** $Id: lcode.c,v 2.62.1.1 2013/04/12 18:48:47 roberto Exp $ -** Code generator for Lua -** See Copyright Notice in lua.h -*/ - -#include - -#define lcode_c -#define LUA_CORE - -#include "lua.h" - -#include "lcode.h" -#include "ldebug.h" -#include "ldo.h" -#include "lgc.h" -#include "llex.h" -#include "lmem.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lparser.h" -#include "lstring.h" -#include "ltable.h" -#include "lvm.h" - - -#define hasjumps(e) ((e)->t != (e)->f) - - -static int isnumeral(expdesc *e) { - return (e->k == VKNUM && e->t == 
NO_JUMP && e->f == NO_JUMP); -} - - -void luaK_nil (FuncState *fs, int from, int n) { - Instruction *previous; - int l = from + n - 1; /* last register to set nil */ - if (fs->pc > fs->lasttarget) { /* no jumps to current position? */ - previous = &fs->f->code[fs->pc-1]; - if (GET_OPCODE(*previous) == OP_LOADNIL) { - int pfrom = GETARG_A(*previous); - int pl = pfrom + GETARG_B(*previous); - if ((pfrom <= from && from <= pl + 1) || - (from <= pfrom && pfrom <= l + 1)) { /* can connect both? */ - if (pfrom < from) from = pfrom; /* from = min(from, pfrom) */ - if (pl > l) l = pl; /* l = max(l, pl) */ - SETARG_A(*previous, from); - SETARG_B(*previous, l - from); - return; - } - } /* else go through */ - } - luaK_codeABC(fs, OP_LOADNIL, from, n - 1, 0); /* else no optimization */ -} - - -int luaK_jump (FuncState *fs) { - int jpc = fs->jpc; /* save list of jumps to here */ - int j; - fs->jpc = NO_JUMP; - j = luaK_codeAsBx(fs, OP_JMP, 0, NO_JUMP); - luaK_concat(fs, &j, jpc); /* keep them on hold */ - return j; -} - - -void luaK_ret (FuncState *fs, int first, int nret) { - luaK_codeABC(fs, OP_RETURN, first, nret+1, 0); -} - - -static int condjump (FuncState *fs, OpCode op, int A, int B, int C) { - luaK_codeABC(fs, op, A, B, C); - return luaK_jump(fs); -} - - -static void fixjump (FuncState *fs, int pc, int dest) { - Instruction *jmp = &fs->f->code[pc]; - int offset = dest-(pc+1); - lua_assert(dest != NO_JUMP); - if (abs(offset) > MAXARG_sBx) - luaX_syntaxerror(fs->ls, "control structure too long"); - SETARG_sBx(*jmp, offset); -} - - -/* -** returns current `pc' and marks it as a jump target (to avoid wrong -** optimizations with consecutive instructions not in the same basic block). -*/ -int luaK_getlabel (FuncState *fs) { - fs->lasttarget = fs->pc; - return fs->pc; -} - - -static int getjump (FuncState *fs, int pc) { - int offset = GETARG_sBx(fs->f->code[pc]); - if (offset == NO_JUMP) /* point to itself represents end of list */ - return NO_JUMP; /* end of list */ - else - return (pc+1)+offset; /* turn offset into absolute position */ -} - - -static Instruction *getjumpcontrol (FuncState *fs, int pc) { - Instruction *pi = &fs->f->code[pc]; - if (pc >= 1 && testTMode(GET_OPCODE(*(pi-1)))) - return pi-1; - else - return pi; -} - - -/* -** check whether list has any jump that do not produce a value -** (or produce an inverted value) -*/ -static int need_value (FuncState *fs, int list) { - for (; list != NO_JUMP; list = getjump(fs, list)) { - Instruction i = *getjumpcontrol(fs, list); - if (GET_OPCODE(i) != OP_TESTSET) return 1; - } - return 0; /* not found */ -} - - -static int patchtestreg (FuncState *fs, int node, int reg) { - Instruction *i = getjumpcontrol(fs, node); - if (GET_OPCODE(*i) != OP_TESTSET) - return 0; /* cannot patch other instructions */ - if (reg != NO_REG && reg != GETARG_B(*i)) - SETARG_A(*i, reg); - else /* no register to put value or register already has the value */ - *i = CREATE_ABC(OP_TEST, GETARG_B(*i), 0, GETARG_C(*i)); - - return 1; -} - - -static void removevalues (FuncState *fs, int list) { - for (; list != NO_JUMP; list = getjump(fs, list)) - patchtestreg(fs, list, NO_REG); -} - - -static void patchlistaux (FuncState *fs, int list, int vtarget, int reg, - int dtarget) { - while (list != NO_JUMP) { - int next = getjump(fs, list); - if (patchtestreg(fs, list, reg)) - fixjump(fs, list, vtarget); - else - fixjump(fs, list, dtarget); /* jump to default target */ - list = next; - } -} - - -static void dischargejpc (FuncState *fs) { - patchlistaux(fs, fs->jpc, fs->pc, 
NO_REG, fs->pc); - fs->jpc = NO_JUMP; -} - - -void luaK_patchlist (FuncState *fs, int list, int target) { - if (target == fs->pc) - luaK_patchtohere(fs, list); - else { - lua_assert(target < fs->pc); - patchlistaux(fs, list, target, NO_REG, target); - } -} - - -LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level) { - level++; /* argument is +1 to reserve 0 as non-op */ - while (list != NO_JUMP) { - int next = getjump(fs, list); - lua_assert(GET_OPCODE(fs->f->code[list]) == OP_JMP && - (GETARG_A(fs->f->code[list]) == 0 || - GETARG_A(fs->f->code[list]) >= level)); - SETARG_A(fs->f->code[list], level); - list = next; - } -} - - -void luaK_patchtohere (FuncState *fs, int list) { - luaK_getlabel(fs); - luaK_concat(fs, &fs->jpc, list); -} - - -void luaK_concat (FuncState *fs, int *l1, int l2) { - if (l2 == NO_JUMP) return; - else if (*l1 == NO_JUMP) - *l1 = l2; - else { - int list = *l1; - int next; - while ((next = getjump(fs, list)) != NO_JUMP) /* find last element */ - list = next; - fixjump(fs, list, l2); - } -} - - -static int luaK_code (FuncState *fs, Instruction i) { - Proto *f = fs->f; - dischargejpc(fs); /* `pc' will change */ - /* put new instruction in code array */ - luaM_growvector(fs->ls->L, f->code, fs->pc, f->sizecode, Instruction, - MAX_INT, "opcodes"); - f->code[fs->pc] = i; - /* save corresponding line information */ - luaM_growvector(fs->ls->L, f->lineinfo, fs->pc, f->sizelineinfo, int, - MAX_INT, "opcodes"); - f->lineinfo[fs->pc] = fs->ls->lastline; - return fs->pc++; -} - - -int luaK_codeABC (FuncState *fs, OpCode o, int a, int b, int c) { - lua_assert(getOpMode(o) == iABC); - lua_assert(getBMode(o) != OpArgN || b == 0); - lua_assert(getCMode(o) != OpArgN || c == 0); - lua_assert(a <= MAXARG_A && b <= MAXARG_B && c <= MAXARG_C); - return luaK_code(fs, CREATE_ABC(o, a, b, c)); -} - - -int luaK_codeABx (FuncState *fs, OpCode o, int a, unsigned int bc) { - lua_assert(getOpMode(o) == iABx || getOpMode(o) == iAsBx); - lua_assert(getCMode(o) == OpArgN); - lua_assert(a <= MAXARG_A && bc <= MAXARG_Bx); - return luaK_code(fs, CREATE_ABx(o, a, bc)); -} - - -static int codeextraarg (FuncState *fs, int a) { - lua_assert(a <= MAXARG_Ax); - return luaK_code(fs, CREATE_Ax(OP_EXTRAARG, a)); -} - - -int luaK_codek (FuncState *fs, int reg, int k) { - if (k <= MAXARG_Bx) - return luaK_codeABx(fs, OP_LOADK, reg, k); - else { - int p = luaK_codeABx(fs, OP_LOADKX, reg, 0); - codeextraarg(fs, k); - return p; - } -} - - -void luaK_checkstack (FuncState *fs, int n) { - int newstack = fs->freereg + n; - if (newstack > fs->f->maxstacksize) { - if (newstack >= MAXSTACK) - luaX_syntaxerror(fs->ls, "function or expression too complex"); - fs->f->maxstacksize = cast_byte(newstack); - } -} - - -void luaK_reserveregs (FuncState *fs, int n) { - luaK_checkstack(fs, n); - fs->freereg += n; -} - - -static void freereg (FuncState *fs, int reg) { - if (!ISK(reg) && reg >= fs->nactvar) { - fs->freereg--; - lua_assert(reg == fs->freereg); - } -} - - -static void freeexp (FuncState *fs, expdesc *e) { - if (e->k == VNONRELOC) - freereg(fs, e->u.info); -} - - -static int addk (FuncState *fs, TValue *key, TValue *v) { - lua_State *L = fs->ls->L; - TValue *idx = luaH_set(L, fs->h, key); - Proto *f = fs->f; - int k, oldsize; - if (ttisnumber(idx)) { - lua_Number n = nvalue(idx); - lua_number2int(k, n); - if (luaV_rawequalobj(&f->k[k], v)) - return k; - /* else may be a collision (e.g., between 0.0 and "\0\0\0\0\0\0\0\0"); - go through and create a new entry for this value */ - } - /* constant not found; 
create a new entry */ - oldsize = f->sizek; - k = fs->nk; - /* numerical value does not need GC barrier; - table has no metatable, so it does not need to invalidate cache */ - setnvalue(idx, cast_num(k)); - luaM_growvector(L, f->k, k, f->sizek, TValue, MAXARG_Ax, "constants"); - while (oldsize < f->sizek) setnilvalue(&f->k[oldsize++]); - setobj(L, &f->k[k], v); - fs->nk++; - luaC_barrier(L, f, v); - return k; -} - - -int luaK_stringK (FuncState *fs, TString *s) { - TValue o; - setsvalue(fs->ls->L, &o, s); - return addk(fs, &o, &o); -} - - -int luaK_numberK (FuncState *fs, lua_Number r) { - int n; - lua_State *L = fs->ls->L; - TValue o; - setnvalue(&o, r); - if (r == 0 || luai_numisnan(NULL, r)) { /* handle -0 and NaN */ - /* use raw representation as key to avoid numeric problems */ - setsvalue(L, L->top++, luaS_newlstr(L, (char *)&r, sizeof(r))); - n = addk(fs, L->top - 1, &o); - L->top--; - } - else - n = addk(fs, &o, &o); /* regular case */ - return n; -} - - -static int boolK (FuncState *fs, int b) { - TValue o; - setbvalue(&o, b); - return addk(fs, &o, &o); -} - - -static int nilK (FuncState *fs) { - TValue k, v; - setnilvalue(&v); - /* cannot use nil as key; instead use table itself to represent nil */ - sethvalue(fs->ls->L, &k, fs->h); - return addk(fs, &k, &v); -} - - -void luaK_setreturns (FuncState *fs, expdesc *e, int nresults) { - if (e->k == VCALL) { /* expression is an open function call? */ - SETARG_C(getcode(fs, e), nresults+1); - } - else if (e->k == VVARARG) { - SETARG_B(getcode(fs, e), nresults+1); - SETARG_A(getcode(fs, e), fs->freereg); - luaK_reserveregs(fs, 1); - } -} - - -void luaK_setoneret (FuncState *fs, expdesc *e) { - if (e->k == VCALL) { /* expression is an open function call? */ - e->k = VNONRELOC; - e->u.info = GETARG_A(getcode(fs, e)); - } - else if (e->k == VVARARG) { - SETARG_B(getcode(fs, e), 2); - e->k = VRELOCABLE; /* can relocate its simple result */ - } -} - - -void luaK_dischargevars (FuncState *fs, expdesc *e) { - switch (e->k) { - case VLOCAL: { - e->k = VNONRELOC; - break; - } - case VUPVAL: { - e->u.info = luaK_codeABC(fs, OP_GETUPVAL, 0, e->u.info, 0); - e->k = VRELOCABLE; - break; - } - case VINDEXED: { - OpCode op = OP_GETTABUP; /* assume 't' is in an upvalue */ - freereg(fs, e->u.ind.idx); - if (e->u.ind.vt == VLOCAL) { /* 't' is in a register? */ - freereg(fs, e->u.ind.t); - op = OP_GETTABLE; - } - e->u.info = luaK_codeABC(fs, op, 0, e->u.ind.t, e->u.ind.idx); - e->k = VRELOCABLE; - break; - } - case VVARARG: - case VCALL: { - luaK_setoneret(fs, e); - break; - } - default: break; /* there is one value available (somewhere) */ - } -} - - -static int code_label (FuncState *fs, int A, int b, int jump) { - luaK_getlabel(fs); /* those instructions may be jump targets */ - return luaK_codeABC(fs, OP_LOADBOOL, A, b, jump); -} - - -static void discharge2reg (FuncState *fs, expdesc *e, int reg) { - luaK_dischargevars(fs, e); - switch (e->k) { - case VNIL: { - luaK_nil(fs, reg, 1); - break; - } - case VFALSE: case VTRUE: { - luaK_codeABC(fs, OP_LOADBOOL, reg, e->k == VTRUE, 0); - break; - } - case VK: { - luaK_codek(fs, reg, e->u.info); - break; - } - case VKNUM: { - luaK_codek(fs, reg, luaK_numberK(fs, e->u.nval)); - break; - } - case VRELOCABLE: { - Instruction *pc = &getcode(fs, e); - SETARG_A(*pc, reg); - break; - } - case VNONRELOC: { - if (reg != e->u.info) - luaK_codeABC(fs, OP_MOVE, reg, e->u.info, 0); - break; - } - default: { - lua_assert(e->k == VVOID || e->k == VJMP); - return; /* nothing to do... 
*/ - } - } - e->u.info = reg; - e->k = VNONRELOC; -} - - -static void discharge2anyreg (FuncState *fs, expdesc *e) { - if (e->k != VNONRELOC) { - luaK_reserveregs(fs, 1); - discharge2reg(fs, e, fs->freereg-1); - } -} - - -static void exp2reg (FuncState *fs, expdesc *e, int reg) { - discharge2reg(fs, e, reg); - if (e->k == VJMP) - luaK_concat(fs, &e->t, e->u.info); /* put this jump in `t' list */ - if (hasjumps(e)) { - int final; /* position after whole expression */ - int p_f = NO_JUMP; /* position of an eventual LOAD false */ - int p_t = NO_JUMP; /* position of an eventual LOAD true */ - if (need_value(fs, e->t) || need_value(fs, e->f)) { - int fj = (e->k == VJMP) ? NO_JUMP : luaK_jump(fs); - p_f = code_label(fs, reg, 0, 1); - p_t = code_label(fs, reg, 1, 0); - luaK_patchtohere(fs, fj); - } - final = luaK_getlabel(fs); - patchlistaux(fs, e->f, final, reg, p_f); - patchlistaux(fs, e->t, final, reg, p_t); - } - e->f = e->t = NO_JUMP; - e->u.info = reg; - e->k = VNONRELOC; -} - - -void luaK_exp2nextreg (FuncState *fs, expdesc *e) { - luaK_dischargevars(fs, e); - freeexp(fs, e); - luaK_reserveregs(fs, 1); - exp2reg(fs, e, fs->freereg - 1); -} - - -int luaK_exp2anyreg (FuncState *fs, expdesc *e) { - luaK_dischargevars(fs, e); - if (e->k == VNONRELOC) { - if (!hasjumps(e)) return e->u.info; /* exp is already in a register */ - if (e->u.info >= fs->nactvar) { /* reg. is not a local? */ - exp2reg(fs, e, e->u.info); /* put value on it */ - return e->u.info; - } - } - luaK_exp2nextreg(fs, e); /* default */ - return e->u.info; -} - - -void luaK_exp2anyregup (FuncState *fs, expdesc *e) { - if (e->k != VUPVAL || hasjumps(e)) - luaK_exp2anyreg(fs, e); -} - - -void luaK_exp2val (FuncState *fs, expdesc *e) { - if (hasjumps(e)) - luaK_exp2anyreg(fs, e); - else - luaK_dischargevars(fs, e); -} - - -int luaK_exp2RK (FuncState *fs, expdesc *e) { - luaK_exp2val(fs, e); - switch (e->k) { - case VTRUE: - case VFALSE: - case VNIL: { - if (fs->nk <= MAXINDEXRK) { /* constant fits in RK operand? */ - e->u.info = (e->k == VNIL) ? nilK(fs) : boolK(fs, (e->k == VTRUE)); - e->k = VK; - return RKASK(e->u.info); - } - else break; - } - case VKNUM: { - e->u.info = luaK_numberK(fs, e->u.nval); - e->k = VK; - /* go through */ - } - case VK: { - if (e->u.info <= MAXINDEXRK) /* constant fits in argC? */ - return RKASK(e->u.info); - else break; - } - default: break; - } - /* not a constant in the right range: put it in a register */ - return luaK_exp2anyreg(fs, e); -} - - -void luaK_storevar (FuncState *fs, expdesc *var, expdesc *ex) { - switch (var->k) { - case VLOCAL: { - freeexp(fs, ex); - exp2reg(fs, ex, var->u.info); - return; - } - case VUPVAL: { - int e = luaK_exp2anyreg(fs, ex); - luaK_codeABC(fs, OP_SETUPVAL, e, var->u.info, 0); - break; - } - case VINDEXED: { - OpCode op = (var->u.ind.vt == VLOCAL) ? 
OP_SETTABLE : OP_SETTABUP; - int e = luaK_exp2RK(fs, ex); - luaK_codeABC(fs, op, var->u.ind.t, var->u.ind.idx, e); - break; - } - default: { - lua_assert(0); /* invalid var kind to store */ - break; - } - } - freeexp(fs, ex); -} - - -void luaK_self (FuncState *fs, expdesc *e, expdesc *key) { - int ereg; - luaK_exp2anyreg(fs, e); - ereg = e->u.info; /* register where 'e' was placed */ - freeexp(fs, e); - e->u.info = fs->freereg; /* base register for op_self */ - e->k = VNONRELOC; - luaK_reserveregs(fs, 2); /* function and 'self' produced by op_self */ - luaK_codeABC(fs, OP_SELF, e->u.info, ereg, luaK_exp2RK(fs, key)); - freeexp(fs, key); -} - - -static void invertjump (FuncState *fs, expdesc *e) { - Instruction *pc = getjumpcontrol(fs, e->u.info); - lua_assert(testTMode(GET_OPCODE(*pc)) && GET_OPCODE(*pc) != OP_TESTSET && - GET_OPCODE(*pc) != OP_TEST); - SETARG_A(*pc, !(GETARG_A(*pc))); -} - - -static int jumponcond (FuncState *fs, expdesc *e, int cond) { - if (e->k == VRELOCABLE) { - Instruction ie = getcode(fs, e); - if (GET_OPCODE(ie) == OP_NOT) { - fs->pc--; /* remove previous OP_NOT */ - return condjump(fs, OP_TEST, GETARG_B(ie), 0, !cond); - } - /* else go through */ - } - discharge2anyreg(fs, e); - freeexp(fs, e); - return condjump(fs, OP_TESTSET, NO_REG, e->u.info, cond); -} - - -void luaK_goiftrue (FuncState *fs, expdesc *e) { - int pc; /* pc of last jump */ - luaK_dischargevars(fs, e); - switch (e->k) { - case VJMP: { - invertjump(fs, e); - pc = e->u.info; - break; - } - case VK: case VKNUM: case VTRUE: { - pc = NO_JUMP; /* always true; do nothing */ - break; - } - default: { - pc = jumponcond(fs, e, 0); - break; - } - } - luaK_concat(fs, &e->f, pc); /* insert last jump in `f' list */ - luaK_patchtohere(fs, e->t); - e->t = NO_JUMP; -} - - -void luaK_goiffalse (FuncState *fs, expdesc *e) { - int pc; /* pc of last jump */ - luaK_dischargevars(fs, e); - switch (e->k) { - case VJMP: { - pc = e->u.info; - break; - } - case VNIL: case VFALSE: { - pc = NO_JUMP; /* always false; do nothing */ - break; - } - default: { - pc = jumponcond(fs, e, 1); - break; - } - } - luaK_concat(fs, &e->t, pc); /* insert last jump in `t' list */ - luaK_patchtohere(fs, e->f); - e->f = NO_JUMP; -} - - -static void codenot (FuncState *fs, expdesc *e) { - luaK_dischargevars(fs, e); - switch (e->k) { - case VNIL: case VFALSE: { - e->k = VTRUE; - break; - } - case VK: case VKNUM: case VTRUE: { - e->k = VFALSE; - break; - } - case VJMP: { - invertjump(fs, e); - break; - } - case VRELOCABLE: - case VNONRELOC: { - discharge2anyreg(fs, e); - freeexp(fs, e); - e->u.info = luaK_codeABC(fs, OP_NOT, 0, e->u.info, 0); - e->k = VRELOCABLE; - break; - } - default: { - lua_assert(0); /* cannot happen */ - break; - } - } - /* interchange true and false lists */ - { int temp = e->f; e->f = e->t; e->t = temp; } - removevalues(fs, e->f); - removevalues(fs, e->t); -} - - -void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k) { - lua_assert(!hasjumps(t)); - t->u.ind.t = t->u.info; - t->u.ind.idx = luaK_exp2RK(fs, k); - t->u.ind.vt = (t->k == VUPVAL) ? 
VUPVAL - : check_exp(vkisinreg(t->k), VLOCAL); - t->k = VINDEXED; -} - - -static int constfolding (OpCode op, expdesc *e1, expdesc *e2) { - lua_Number r; - if (!isnumeral(e1) || !isnumeral(e2)) return 0; - if ((op == OP_DIV || op == OP_MOD) && e2->u.nval == 0) - return 0; /* do not attempt to divide by 0 */ - /* - * Patched: check for MIN_INT / -1 - */ - if (op == OP_DIV && e1->u.nval == INT64_MIN && e2->u.nval == -1) - return 0; - r = luaO_arith(op - OP_ADD + LUA_OPADD, e1->u.nval, e2->u.nval); - e1->u.nval = r; - return 1; -} - - -static void codearith (FuncState *fs, OpCode op, - expdesc *e1, expdesc *e2, int line) { - if (constfolding(op, e1, e2)) - return; - else { - int o2 = (op != OP_UNM && op != OP_LEN) ? luaK_exp2RK(fs, e2) : 0; - int o1 = luaK_exp2RK(fs, e1); - if (o1 > o2) { - freeexp(fs, e1); - freeexp(fs, e2); - } - else { - freeexp(fs, e2); - freeexp(fs, e1); - } - e1->u.info = luaK_codeABC(fs, op, 0, o1, o2); - e1->k = VRELOCABLE; - luaK_fixline(fs, line); - } -} - - -static void codecomp (FuncState *fs, OpCode op, int cond, expdesc *e1, - expdesc *e2) { - int o1 = luaK_exp2RK(fs, e1); - int o2 = luaK_exp2RK(fs, e2); - freeexp(fs, e2); - freeexp(fs, e1); - if (cond == 0 && op != OP_EQ) { - int temp; /* exchange args to replace by `<' or `<=' */ - temp = o1; o1 = o2; o2 = temp; /* o1 <==> o2 */ - cond = 1; - } - e1->u.info = condjump(fs, op, cond, o1, o2); - e1->k = VJMP; -} - - -void luaK_prefix (FuncState *fs, UnOpr op, expdesc *e, int line) { - expdesc e2; - e2.t = e2.f = NO_JUMP; e2.k = VKNUM; e2.u.nval = 0; - switch (op) { - case OPR_MINUS: { - if (isnumeral(e)) /* minus constant? */ - e->u.nval = luai_numunm(NULL, e->u.nval); /* fold it */ - else { - luaK_exp2anyreg(fs, e); - codearith(fs, OP_UNM, e, &e2, line); - } - break; - } - case OPR_NOT: codenot(fs, e); break; - case OPR_LEN: { - luaK_exp2anyreg(fs, e); /* cannot operate on constants */ - codearith(fs, OP_LEN, e, &e2, line); - break; - } - default: lua_assert(0); - } -} - - -void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) { - switch (op) { - case OPR_AND: { - luaK_goiftrue(fs, v); - break; - } - case OPR_OR: { - luaK_goiffalse(fs, v); - break; - } - case OPR_CONCAT: { - luaK_exp2nextreg(fs, v); /* operand must be on the `stack' */ - break; - } - case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV: - case OPR_MOD: case OPR_POW: { - if (!isnumeral(v)) luaK_exp2RK(fs, v); - break; - } - default: { - luaK_exp2RK(fs, v); - break; - } - } -} - - -void luaK_posfix (FuncState *fs, BinOpr op, - expdesc *e1, expdesc *e2, int line) { - switch (op) { - case OPR_AND: { - lua_assert(e1->t == NO_JUMP); /* list must be closed */ - luaK_dischargevars(fs, e2); - luaK_concat(fs, &e2->f, e1->f); - *e1 = *e2; - break; - } - case OPR_OR: { - lua_assert(e1->f == NO_JUMP); /* list must be closed */ - luaK_dischargevars(fs, e2); - luaK_concat(fs, &e2->t, e1->t); - *e1 = *e2; - break; - } - case OPR_CONCAT: { - luaK_exp2val(fs, e2); - if (e2->k == VRELOCABLE && GET_OPCODE(getcode(fs, e2)) == OP_CONCAT) { - lua_assert(e1->u.info == GETARG_B(getcode(fs, e2))-1); - freeexp(fs, e1); - SETARG_B(getcode(fs, e2), e1->u.info); - e1->k = VRELOCABLE; e1->u.info = e2->u.info; - } - else { - luaK_exp2nextreg(fs, e2); /* operand must be on the 'stack' */ - codearith(fs, OP_CONCAT, e1, e2, line); - } - break; - } - case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV: - case OPR_MOD: case OPR_POW: { - codearith(fs, cast(OpCode, op - OPR_ADD + OP_ADD), e1, e2, line); - break; - } - case OPR_EQ: case OPR_LT: case OPR_LE: { - codecomp(fs, 
cast(OpCode, op - OPR_EQ + OP_EQ), 1, e1, e2); - break; - } - case OPR_NE: case OPR_GT: case OPR_GE: { - codecomp(fs, cast(OpCode, op - OPR_NE + OP_EQ), 0, e1, e2); - break; - } - default: lua_assert(0); - } -} - - -void luaK_fixline (FuncState *fs, int line) { - fs->f->lineinfo[fs->pc - 1] = line; -} - - -void luaK_setlist (FuncState *fs, int base, int nelems, int tostore) { - int c = (nelems - 1)/LFIELDS_PER_FLUSH + 1; - int b = (tostore == LUA_MULTRET) ? 0 : tostore; - lua_assert(tostore != 0); - if (c <= MAXARG_C) - luaK_codeABC(fs, OP_SETLIST, base, b, c); - else if (c <= MAXARG_Ax) { - luaK_codeABC(fs, OP_SETLIST, base, b, 0); - codeextraarg(fs, c); - } - else - luaX_syntaxerror(fs->ls, "constructor too long"); - fs->freereg = base + 1; /* free registers with list values */ -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#include "lua.h" - -#include - -ssize_t -lcompat_sprintf(char *buf, const char *fmt, ...) -{ - ssize_t res; - va_list args; - - va_start(args, fmt); - res = vsnprintf(buf, INT_MAX, fmt, args); - va_end(args); - - return (res); -} - -int64_t -lcompat_strtoll(const char *str, char **ptr) -{ - int base; - const char *cp; - int digits; - int64_t value; - boolean_t is_negative; - - cp = str; - while (*cp == ' ' || *cp == '\t' || *cp == '\n') { - cp++; - } - is_negative = (*cp == '-'); - if (is_negative) { - cp++; - } - base = 10; - - if (*cp == '0') { - base = 8; - cp++; - if (*cp == 'x' || *cp == 'X') { - base = 16; - cp++; - } - } - - value = 0; - for (; *cp != '\0'; cp++) { - if (*cp >= '0' && *cp <= '9') { - digits = *cp - '0'; - } else if (*cp >= 'a' && *cp <= 'f') { - digits = *cp - 'a' + 10; - } else if (*cp >= 'A' && *cp <= 'F') { - digits = *cp - 'A' + 10; - } else { - break; - } - if (digits >= base) { - break; - } - value = (value * base) + digits; - } - - if (ptr != NULL) { - *ptr = (char *)cp; - } - if (is_negative) { - value = -value; - } - return (value); -} - -int64_t -lcompat_pow(int64_t x, int64_t y) -{ - int64_t result = 1; - if (y < 0) - return (0); - - while (y) { - if (y & 1) - result *= x; - y >>= 1; - x *= x; - } - return (result); -} - -int -lcompat_hashnum(int64_t x) -{ - x = (~x) + (x << 18); - x = x ^ (x >> 31); - x = x * 21; - x = x ^ (x >> 11); - x = x + (x << 6); - x = x ^ (x >> 22); - return ((int)x); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c +++ /dev/null @@ -1,154 +0,0 @@ -/* -** $Id: lcorolib.c,v 1.5.1.1 2013/04/12 18:48:47 roberto Exp $ -** Coroutine Library -** See Copyright Notice in lua.h -*/ - - -#include - -#define lcorolib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - - -static int auxresume (lua_State *L, lua_State *co, int narg) { - int status; - if (!lua_checkstack(co, narg)) { - lua_pushliteral(L, "too many arguments to resume"); - return -1; /* error flag */ - } - if (lua_status(co) == LUA_OK && lua_gettop(co) == 0) { - lua_pushliteral(L, "cannot resume dead coroutine"); - return -1; /* error flag */ - } - lua_xmove(L, co, narg); - status = lua_resume(co, L, narg); - if (status == LUA_OK || status == LUA_YIELD) { - int nres = lua_gettop(co); - if (!lua_checkstack(L, nres + 
1)) { - lua_pop(co, nres); /* remove results anyway */ - lua_pushliteral(L, "too many results to resume"); - return -1; /* error flag */ - } - lua_xmove(co, L, nres); /* move yielded values */ - return nres; - } - else { - lua_xmove(co, L, 1); /* move error message */ - return -1; /* error flag */ - } -} - - -static int luaB_coresume (lua_State *L) { - lua_State *co = lua_tothread(L, 1); - int r; - luaL_argcheck(L, co, 1, "coroutine expected"); - r = auxresume(L, co, lua_gettop(L) - 1); - if (r < 0) { - lua_pushboolean(L, 0); - lua_insert(L, -2); - return 2; /* return false + error message */ - } - else { - lua_pushboolean(L, 1); - lua_insert(L, -(r + 1)); - return r + 1; /* return true + `resume' returns */ - } -} - - -static int luaB_auxwrap (lua_State *L) { - lua_State *co = lua_tothread(L, lua_upvalueindex(1)); - int r = auxresume(L, co, lua_gettop(L)); - if (r < 0) { - if (lua_isstring(L, -1)) { /* error object is a string? */ - luaL_where(L, 1); /* add extra info */ - lua_insert(L, -2); - lua_concat(L, 2); - } - return lua_error(L); /* propagate error */ - } - return r; -} - - -static int luaB_cocreate (lua_State *L) { - lua_State *NL; - luaL_checktype(L, 1, LUA_TFUNCTION); - NL = lua_newthread(L); - lua_pushvalue(L, 1); /* move function to top */ - lua_xmove(L, NL, 1); /* move function from L to NL */ - return 1; -} - - -static int luaB_cowrap (lua_State *L) { - luaB_cocreate(L); - lua_pushcclosure(L, luaB_auxwrap, 1); - return 1; -} - - -static int luaB_yield (lua_State *L) { - return lua_yield(L, lua_gettop(L)); -} - - -static int luaB_costatus (lua_State *L) { - lua_State *co = lua_tothread(L, 1); - luaL_argcheck(L, co, 1, "coroutine expected"); - if (L == co) lua_pushliteral(L, "running"); - else { - switch (lua_status(co)) { - case LUA_YIELD: - lua_pushliteral(L, "suspended"); - break; - case LUA_OK: { - lua_Debug ar; - if (lua_getstack(co, 0, &ar) > 0) /* does it have frames? */ - lua_pushliteral(L, "normal"); /* it is running */ - else if (lua_gettop(co) == 0) - lua_pushliteral(L, "dead"); - else - lua_pushliteral(L, "suspended"); /* initial state */ - break; - } - default: /* some error occurred */ - lua_pushliteral(L, "dead"); - break; - } - } - return 1; -} - - -static int luaB_corunning (lua_State *L) { - int ismain = lua_pushthread(L); - lua_pushboolean(L, ismain); - return 2; -} - - -static const luaL_Reg co_funcs[] = { - {"create", luaB_cocreate}, - {"resume", luaB_coresume}, - {"running", luaB_corunning}, - {"status", luaB_costatus}, - {"wrap", luaB_cowrap}, - {"yield", luaB_yield}, - {NULL, NULL} -}; - - - -LUAMOD_API int luaopen_coroutine (lua_State *L) { - luaL_newlib(L, co_funcs); - return 1; -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.h +++ /dev/null @@ -1,93 +0,0 @@ -/* -** $Id: lctype.h,v 1.12.1.1 2013/04/12 18:48:47 roberto Exp $ -** 'ctype' functions for Lua -** See Copyright Notice in lua.h -*/ - -#ifndef lctype_h -#define lctype_h - -#include "lua.h" - - -/* -** WARNING: the functions defined here do not necessarily correspond -** to the similar functions in the standard C ctype.h. 
They are -** optimized for the specific needs of Lua -*/ - -#if !defined(LUA_USE_CTYPE) - -#if 'A' == 65 && '0' == 48 -/* ASCII case: can use its own tables; faster and fixed */ -#define LUA_USE_CTYPE 0 -#else -/* must use standard C ctype */ -#define LUA_USE_CTYPE 1 -#endif - -#endif - - -#if !LUA_USE_CTYPE /* { */ - -#include "llimits.h" - - -#define ALPHABIT 0 -#define DIGITBIT 1 -#define PRINTBIT 2 -#define SPACEBIT 3 -#define XDIGITBIT 4 - - -#define MASK(B) (1 << (B)) - - -/* -** add 1 to char to allow index -1 (EOZ) -*/ -#define testprop(c,p) (luai_ctype_[(c)+1] & (p)) - -/* -** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_' -*/ -#define lislalpha(c) testprop(c, MASK(ALPHABIT)) -#define lislalnum(c) testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT))) -#define lisdigit(c) testprop(c, MASK(DIGITBIT)) -#define lisspace(c) testprop(c, MASK(SPACEBIT)) -#define lisprint(c) testprop(c, MASK(PRINTBIT)) -#define lisxdigit(c) testprop(c, MASK(XDIGITBIT)) - -/* -** this 'ltolower' only works for alphabetic characters -*/ -#define ltolower(c) ((c) | ('A' ^ 'a')) - - -/* two more entries for 0 and -1 (EOZ) */ -LUAI_DDEC const lu_byte luai_ctype_[UCHAR_MAX + 2]; - - -#else /* }{ */ - -/* -** use standard C ctypes -*/ - -#include - - -#define lislalpha(c) (isalpha(c) || (c) == '_') -#define lislalnum(c) (isalnum(c) || (c) == '_') -#define lisdigit(c) (isdigit(c)) -#define lisspace(c) (isspace(c)) -#define lisprint(c) (isprint(c)) -#define lisxdigit(c) (isxdigit(c)) - -#define ltolower(c) (tolower(c)) - -#endif /* } */ - -#endif - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c +++ /dev/null @@ -1,52 +0,0 @@ -/* -** $Id: lctype.c,v 1.11.1.1 2013/04/12 18:48:47 roberto Exp $ -** 'ctype' functions for Lua -** See Copyright Notice in lua.h -*/ - -#define lctype_c -#define LUA_CORE - -#include "lctype.h" - -#if !LUA_USE_CTYPE /* { */ - -#include - -LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = { - 0x00, /* EOZ */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0. */ - 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, /* 2. */ - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, - 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, /* 3. */ - 0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, - 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 4. */ - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 5. */ - 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05, - 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 6. */ - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 7. */ - 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 9. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* a. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* b. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c. 
*/ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* f. */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - -#endif /* } */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -** $Id: ldebug.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions from Debug Interface module -** See Copyright Notice in lua.h -*/ - -#ifndef ldebug_h -#define ldebug_h - - -#include "lstate.h" - - -#define pcRel(pc, p) (cast(int, (pc) - (p)->code) - 1) - -#define getfuncline(f,pc) (((f)->lineinfo) ? (f)->lineinfo[pc] : 0) - -#define resethookcount(L) (L->hookcount = L->basehookcount) - -/* Active Lua function (given call info) */ -#define ci_func(ci) (clLvalue((ci)->func)) - - -LUAI_FUNC l_noret luaG_typeerror (lua_State *L, const TValue *o, - const char *opname); -LUAI_FUNC l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2); -LUAI_FUNC l_noret luaG_aritherror (lua_State *L, const TValue *p1, - const TValue *p2); -LUAI_FUNC l_noret luaG_ordererror (lua_State *L, const TValue *p1, - const TValue *p2); -LUAI_FUNC l_noret luaG_runerror (lua_State *L, const char *fmt, ...); -LUAI_FUNC l_noret luaG_errormsg (lua_State *L); - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c +++ /dev/null @@ -1,607 +0,0 @@ -/* -** $Id: ldebug.c,v 2.90.1.4 2015/02/19 17:05:13 roberto Exp $ -** Debug Interface -** See Copyright Notice in lua.h -*/ - - -#include - -#define ldebug_c -#define LUA_CORE - -#include "lua.h" - -#include "lapi.h" -#include "lcode.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" -#include "lvm.h" - - - -#define noLuaClosure(f) ((f) == NULL || (f)->c.tt == LUA_TCCL) - - -static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name); - - -static int currentpc (CallInfo *ci) { - lua_assert(isLua(ci)); - return pcRel(ci->u.l.savedpc, ci_func(ci)->p); -} - - -static int currentline (CallInfo *ci) { - return getfuncline(ci_func(ci)->p, currentpc(ci)); -} - - -static void swapextra (lua_State *L) { - if (L->status == LUA_YIELD) { - CallInfo *ci = L->ci; /* get function that yielded */ - StkId temp = ci->func; /* exchange its 'func' and 'extra' values */ - ci->func = restorestack(L, ci->extra); - ci->extra = savestack(L, temp); - } -} - - -/* -** this function can be called asynchronous (e.g. during a signal) -*/ -LUA_API int lua_sethook (lua_State *L, lua_Hook func, int mask, int count) { - if (func == NULL || mask == 0) { /* turn off hooks? 
*/ - mask = 0; - func = NULL; - } - if (isLua(L->ci)) - L->oldpc = L->ci->u.l.savedpc; - L->hook = func; - L->basehookcount = count; - resethookcount(L); - L->hookmask = cast_byte(mask); - return 1; -} - - -LUA_API lua_Hook lua_gethook (lua_State *L) { - return L->hook; -} - - -LUA_API int lua_gethookmask (lua_State *L) { - return L->hookmask; -} - - -LUA_API int lua_gethookcount (lua_State *L) { - return L->basehookcount; -} - - -LUA_API int lua_getstack (lua_State *L, int level, lua_Debug *ar) { - int status; - CallInfo *ci; - if (level < 0) return 0; /* invalid (negative) level */ - lua_lock(L); - for (ci = L->ci; level > 0 && ci != &L->base_ci; ci = ci->previous) - level--; - if (level == 0 && ci != &L->base_ci) { /* level found? */ - status = 1; - ar->i_ci = ci; - } - else status = 0; /* no such level */ - lua_unlock(L); - return status; -} - - -static const char *upvalname (Proto *p, int uv) { - TString *s = check_exp(uv < p->sizeupvalues, p->upvalues[uv].name); - if (s == NULL) return "?"; - else return getstr(s); -} - - -static const char *findvararg (CallInfo *ci, int n, StkId *pos) { - int nparams = clLvalue(ci->func)->p->numparams; - if (n >= ci->u.l.base - ci->func - nparams) - return NULL; /* no such vararg */ - else { - *pos = ci->func + nparams + n; - return "(*vararg)"; /* generic name for any vararg */ - } -} - - -static const char *findlocal (lua_State *L, CallInfo *ci, int n, - StkId *pos) { - const char *name = NULL; - StkId base; - if (isLua(ci)) { - if (n < 0) /* access to vararg values? */ - return findvararg(ci, -n, pos); - else { - base = ci->u.l.base; - name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci)); - } - } - else - base = ci->func + 1; - if (name == NULL) { /* no 'standard' name? */ - StkId limit = (ci == L->ci) ? L->top : ci->next->func; - if (limit - base >= n && n > 0) /* is 'n' inside 'ci' stack? */ - name = "(*temporary)"; /* generic name for any valid slot */ - else - return NULL; /* no name */ - } - *pos = base + (n - 1); - return name; -} - - -LUA_API const char *lua_getlocal (lua_State *L, const lua_Debug *ar, int n) { - const char *name; - lua_lock(L); - swapextra(L); - if (ar == NULL) { /* information about non-active function? */ - if (!isLfunction(L->top - 1)) /* not a Lua function? */ - name = NULL; - else /* consider live variables at function start (parameters) */ - name = luaF_getlocalname(clLvalue(L->top - 1)->p, n, 0); - } - else { /* active function; get information through 'ar' */ - StkId pos = 0; /* to avoid warnings */ - name = findlocal(L, ar->i_ci, n, &pos); - if (name) { - setobj2s(L, L->top, pos); - api_incr_top(L); - } - } - swapextra(L); - lua_unlock(L); - return name; -} - - -LUA_API const char *lua_setlocal (lua_State *L, const lua_Debug *ar, int n) { - StkId pos = 0; /* to avoid warnings */ - const char *name; - lua_lock(L); - swapextra(L); - name = findlocal(L, ar->i_ci, n, &pos); - if (name) - setobjs2s(L, pos, L->top - 1); - L->top--; /* pop value */ - swapextra(L); - lua_unlock(L); - return name; -} - - -static void funcinfo (lua_Debug *ar, Closure *cl) { - if (noLuaClosure(cl)) { - ar->source = "=[C]"; - ar->linedefined = -1; - ar->lastlinedefined = -1; - ar->what = "C"; - } - else { - Proto *p = cl->l.p; - ar->source = p->source ? getstr(p->source) : "=?"; - ar->linedefined = p->linedefined; - ar->lastlinedefined = p->lastlinedefined; - ar->what = (ar->linedefined == 0) ? 
"main" : "Lua"; - } - luaO_chunkid(ar->short_src, ar->source, LUA_IDSIZE); -} - - -static void collectvalidlines (lua_State *L, Closure *f) { - if (noLuaClosure(f)) { - setnilvalue(L->top); - api_incr_top(L); - } - else { - int i; - TValue v; - int *lineinfo = f->l.p->lineinfo; - Table *t = luaH_new(L); /* new table to store active lines */ - sethvalue(L, L->top, t); /* push it on stack */ - api_incr_top(L); - setbvalue(&v, 1); /* boolean 'true' to be the value of all indices */ - for (i = 0; i < f->l.p->sizelineinfo; i++) /* for all lines with code */ - luaH_setint(L, t, lineinfo[i], &v); /* table[line] = true */ - } -} - - -static int auxgetinfo (lua_State *L, const char *what, lua_Debug *ar, - Closure *f, CallInfo *ci) { - int status = 1; - for (; *what; what++) { - switch (*what) { - case 'S': { - funcinfo(ar, f); - break; - } - case 'l': { - ar->currentline = (ci && isLua(ci)) ? currentline(ci) : -1; - break; - } - case 'u': { - ar->nups = (f == NULL) ? 0 : f->c.nupvalues; - if (noLuaClosure(f)) { - ar->isvararg = 1; - ar->nparams = 0; - } - else { - ar->isvararg = f->l.p->is_vararg; - ar->nparams = f->l.p->numparams; - } - break; - } - case 't': { - ar->istailcall = (ci) ? ci->callstatus & CIST_TAIL : 0; - break; - } - case 'n': { - /* calling function is a known Lua function? */ - if (ci && !(ci->callstatus & CIST_TAIL) && isLua(ci->previous)) - ar->namewhat = getfuncname(L, ci->previous, &ar->name); - else - ar->namewhat = NULL; - if (ar->namewhat == NULL) { - ar->namewhat = ""; /* not found */ - ar->name = NULL; - } - break; - } - case 'L': - case 'f': /* handled by lua_getinfo */ - break; - default: status = 0; /* invalid option */ - } - } - return status; -} - - -LUA_API int lua_getinfo (lua_State *L, const char *what, lua_Debug *ar) { - int status; - Closure *cl; - CallInfo *ci; - StkId func; - lua_lock(L); - swapextra(L); - if (*what == '>') { - ci = NULL; - func = L->top - 1; - api_check(L, ttisfunction(func), "function expected"); - what++; /* skip the '>' */ - L->top--; /* pop function */ - } - else { - ci = ar->i_ci; - func = ci->func; - lua_assert(ttisfunction(ci->func)); - } - cl = ttisclosure(func) ? clvalue(func) : NULL; - status = auxgetinfo(L, what, ar, cl, ci); - if (strchr(what, 'f')) { - setobjs2s(L, L->top, func); - api_incr_top(L); - } - swapextra(L); - if (strchr(what, 'L')) - collectvalidlines(L, cl); - lua_unlock(L); - return status; -} - - -/* -** {====================================================== -** Symbolic Execution -** ======================================================= -*/ - -static const char *getobjname (Proto *p, int lastpc, int reg, - const char **name); - - -/* -** find a "name" for the RK value 'c' -*/ -static void kname (Proto *p, int pc, int c, const char **name) { - if (ISK(c)) { /* is 'c' a constant? */ - TValue *kvalue = &p->k[INDEXK(c)]; - if (ttisstring(kvalue)) { /* literal constant? */ - *name = svalue(kvalue); /* it is its own name */ - return; - } - /* else no reasonable name found */ - } - else { /* 'c' is a register */ - const char *what = getobjname(p, pc, c, name); /* search for 'c' */ - if (what && *what == 'c') { /* found a constant name? */ - return; /* 'name' already filled */ - } - /* else no reasonable name found */ - } - *name = "?"; /* no reasonable name found */ -} - - -static int filterpc (int pc, int jmptarget) { - if (pc < jmptarget) /* is code conditional (inside a jump)? 
*/ - return -1; /* cannot know who sets that register */ - else return pc; /* current position sets that register */ -} - - -/* -** try to find last instruction before 'lastpc' that modified register 'reg' -*/ -static int findsetreg (Proto *p, int lastpc, int reg) { - int pc; - int setreg = -1; /* keep last instruction that changed 'reg' */ - int jmptarget = 0; /* any code before this address is conditional */ - for (pc = 0; pc < lastpc; pc++) { - Instruction i = p->code[pc]; - OpCode op = GET_OPCODE(i); - int a = GETARG_A(i); - switch (op) { - case OP_LOADNIL: { - int b = GETARG_B(i); - if (a <= reg && reg <= a + b) /* set registers from 'a' to 'a+b' */ - setreg = filterpc(pc, jmptarget); - break; - } - case OP_TFORCALL: { - if (reg >= a + 2) /* affect all regs above its base */ - setreg = filterpc(pc, jmptarget); - break; - } - case OP_CALL: - case OP_TAILCALL: { - if (reg >= a) /* affect all registers above base */ - setreg = filterpc(pc, jmptarget); - break; - } - case OP_JMP: { - int b = GETARG_sBx(i); - int dest = pc + 1 + b; - /* jump is forward and do not skip `lastpc'? */ - if (pc < dest && dest <= lastpc) { - if (dest > jmptarget) - jmptarget = dest; /* update 'jmptarget' */ - } - break; - } - case OP_TEST: { - if (reg == a) /* jumped code can change 'a' */ - setreg = filterpc(pc, jmptarget); - break; - } - default: - if (testAMode(op) && reg == a) /* any instruction that set A */ - setreg = filterpc(pc, jmptarget); - break; - } - } - return setreg; -} - - -static const char *getobjname (Proto *p, int lastpc, int reg, - const char **name) { - int pc; - *name = luaF_getlocalname(p, reg + 1, lastpc); - if (*name) /* is a local? */ - return "local"; - /* else try symbolic execution */ - pc = findsetreg(p, lastpc, reg); - if (pc != -1) { /* could find instruction? */ - Instruction i = p->code[pc]; - OpCode op = GET_OPCODE(i); - switch (op) { - case OP_MOVE: { - int b = GETARG_B(i); /* move from 'b' to 'a' */ - if (b < GETARG_A(i)) - return getobjname(p, pc, b, name); /* get name for 'b' */ - break; - } - case OP_GETTABUP: - case OP_GETTABLE: { - int k = GETARG_C(i); /* key index */ - int t = GETARG_B(i); /* table index */ - const char *vn = (op == OP_GETTABLE) /* name of indexed variable */ - ? luaF_getlocalname(p, t + 1, pc) - : upvalname(p, t); - kname(p, pc, k, name); - return (vn && strcmp(vn, LUA_ENV) == 0) ? "global" : "field"; - } - case OP_GETUPVAL: { - *name = upvalname(p, GETARG_B(i)); - return "upvalue"; - } - case OP_LOADK: - case OP_LOADKX: { - int b = (op == OP_LOADK) ? 
GETARG_Bx(i) - : GETARG_Ax(p->code[pc + 1]); - if (ttisstring(&p->k[b])) { - *name = svalue(&p->k[b]); - return "constant"; - } - break; - } - case OP_SELF: { - int k = GETARG_C(i); /* key index */ - kname(p, pc, k, name); - return "method"; - } - default: break; /* go through to return NULL */ - } - } - return NULL; /* could not find reasonable name */ -} - - -static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) { - TMS tm; - Proto *p = ci_func(ci)->p; /* calling function */ - int pc = currentpc(ci); /* calling instruction index */ - Instruction i = p->code[pc]; /* calling instruction */ - switch (GET_OPCODE(i)) { - case OP_CALL: - case OP_TAILCALL: /* get function name */ - return getobjname(p, pc, GETARG_A(i), name); - case OP_TFORCALL: { /* for iterator */ - *name = "for iterator"; - return "for iterator"; - } - /* all other instructions can call only through metamethods */ - case OP_SELF: - case OP_GETTABUP: - case OP_GETTABLE: tm = TM_INDEX; break; - case OP_SETTABUP: - case OP_SETTABLE: tm = TM_NEWINDEX; break; - case OP_EQ: tm = TM_EQ; break; - case OP_ADD: tm = TM_ADD; break; - case OP_SUB: tm = TM_SUB; break; - case OP_MUL: tm = TM_MUL; break; - case OP_DIV: tm = TM_DIV; break; - case OP_MOD: tm = TM_MOD; break; - case OP_POW: tm = TM_POW; break; - case OP_UNM: tm = TM_UNM; break; - case OP_LEN: tm = TM_LEN; break; - case OP_LT: tm = TM_LT; break; - case OP_LE: tm = TM_LE; break; - case OP_CONCAT: tm = TM_CONCAT; break; - default: - return NULL; /* else no useful name can be found */ - } - *name = getstr(G(L)->tmname[tm]); - return "metamethod"; -} - -/* }====================================================== */ - - - -/* -** only ANSI way to check whether a pointer points to an array -** (used only for error messages, so efficiency is not a big concern) -*/ -static int isinstack (CallInfo *ci, const TValue *o) { - StkId p; - for (p = ci->u.l.base; p < ci->top; p++) - if (o == p) return 1; - return 0; -} - - -static const char *getupvalname (CallInfo *ci, const TValue *o, - const char **name) { - LClosure *c = ci_func(ci); - int i; - for (i = 0; i < c->nupvalues; i++) { - if (c->upvals[i]->v == o) { - *name = upvalname(c->p, i); - return "upvalue"; - } - } - return NULL; -} - - -l_noret luaG_typeerror (lua_State *L, const TValue *o, const char *op) { - CallInfo *ci = L->ci; - const char *name = NULL; - const char *t = objtypename(o); - const char *kind = NULL; - if (isLua(ci)) { - kind = getupvalname(ci, o, &name); /* check whether 'o' is an upvalue */ - if (!kind && isinstack(ci, o)) /* no? 
try a register */ - kind = getobjname(ci_func(ci)->p, currentpc(ci), - cast_int(o - ci->u.l.base), &name); - } - if (kind) - luaG_runerror(L, "attempt to %s %s " LUA_QS " (a %s value)", - op, kind, name, t); - else - luaG_runerror(L, "attempt to %s a %s value", op, t); -} - - -l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2) { - if (ttisstring(p1) || ttisnumber(p1)) p1 = p2; - lua_assert(!ttisstring(p1) && !ttisnumber(p1)); - luaG_typeerror(L, p1, "concatenate"); -} - - -l_noret luaG_aritherror (lua_State *L, const TValue *p1, const TValue *p2) { - TValue temp; - if (luaV_tonumber(p1, &temp) == NULL) - p2 = p1; /* first operand is wrong */ - luaG_typeerror(L, p2, "perform arithmetic on"); -} - - -l_noret luaG_ordererror (lua_State *L, const TValue *p1, const TValue *p2) { - const char *t1 = objtypename(p1); - const char *t2 = objtypename(p2); - if (t1 == t2) - luaG_runerror(L, "attempt to compare two %s values", t1); - else - luaG_runerror(L, "attempt to compare %s with %s", t1, t2); -} - - -static void addinfo (lua_State *L, const char *msg) { - CallInfo *ci = L->ci; - if (isLua(ci)) { /* is Lua code? */ - char buff[LUA_IDSIZE]; /* add file:line information */ - int line = currentline(ci); - TString *src = ci_func(ci)->p->source; - if (src) - luaO_chunkid(buff, getstr(src), LUA_IDSIZE); - else { /* no source available; use "?" instead */ - buff[0] = '?'; buff[1] = '\0'; - } - luaO_pushfstring(L, "%s:%d: %s", buff, line, msg); - } -} - - -l_noret luaG_errormsg (lua_State *L) { - if (L->errfunc != 0) { /* is there an error handling function? */ - StkId errfunc = restorestack(L, L->errfunc); - if (!ttisfunction(errfunc)) luaD_throw(L, LUA_ERRERR); - setobjs2s(L, L->top, L->top - 1); /* move argument */ - setobjs2s(L, L->top - 1, errfunc); /* push function */ - L->top++; - luaD_call(L, L->top - 2, 1, 0); /* call it */ - } - luaD_throw(L, LUA_ERRRUN); -} - - -l_noret luaG_runerror (lua_State *L, const char *fmt, ...) 
{ - va_list argp; - va_start(argp, fmt); - addinfo(L, luaO_pushvfstring(L, fmt, argp)); - va_end(argp); - luaG_errormsg(L); -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.h +++ /dev/null @@ -1,46 +0,0 @@ -/* -** $Id: ldo.h,v 2.20.1.1 2013/04/12 18:48:47 roberto Exp $ -** Stack and Call structure of Lua -** See Copyright Notice in lua.h -*/ - -#ifndef ldo_h -#define ldo_h - - -#include "lobject.h" -#include "lstate.h" -#include "lzio.h" - - -#define luaD_checkstack(L,n) if (L->stack_last - L->top <= (n)) \ - luaD_growstack(L, n); else condmovestack(L); - - -#define incr_top(L) {L->top++; luaD_checkstack(L,0);} - -#define savestack(L,p) ((char *)(p) - (char *)L->stack) -#define restorestack(L,n) ((TValue *)((char *)L->stack + (n))) - - -/* type of protected functions, to be ran by `runprotected' */ -typedef void (*Pfunc) (lua_State *L, void *ud); - -LUAI_FUNC int luaD_protectedparser (lua_State *L, ZIO *z, const char *name, - const char *mode); -LUAI_FUNC void luaD_hook (lua_State *L, int event, int line); -LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nresults); -LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults, - int allowyield); -LUAI_FUNC int luaD_pcall (lua_State *L, Pfunc func, void *u, - ptrdiff_t oldtop, ptrdiff_t ef); -LUAI_FUNC int luaD_poscall (lua_State *L, StkId firstResult); -LUAI_FUNC void luaD_reallocstack (lua_State *L, int newsize); -LUAI_FUNC void luaD_growstack (lua_State *L, int n); -LUAI_FUNC void luaD_shrinkstack (lua_State *L); - -LUAI_FUNC l_noret luaD_throw (lua_State *L, int errcode); -LUAI_FUNC int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud); - -#endif - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c +++ /dev/null @@ -1,691 +0,0 @@ -/* -** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $ -** Stack and Call structure of Lua -** See Copyright Notice in lua.h -*/ - - -#include - -#define ldo_c -#define LUA_CORE - -#include "lua.h" - -#include "lapi.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lparser.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" -#include "lundump.h" -#include "lvm.h" -#include "lzio.h" - - - - -/* -** {====================================================== -** Error-recovery functions -** ======================================================= -*/ - -/* -** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By -** default, Lua handles errors with exceptions when compiling as -** C++ code, with _longjmp/_setjmp when asked to use them, and with -** longjmp/setjmp otherwise. -*/ -#if !defined(LUAI_THROW) - -#ifdef _KERNEL -#ifdef illumos -#define LUAI_THROW(L,c) longjmp(&(c)->b) -#define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a } -#define luai_jmpbuf label_t -#else -#define LUAI_THROW(L,c) longjmp((c)->b, 1) -#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } -#define luai_jmpbuf jmp_buf -#endif -#else -#if defined(__cplusplus) && !defined(LUA_USE_LONGJMP) -/* C++ exceptions */ -#define LUAI_THROW(L,c) throw(c) -#define LUAI_TRY(L,c,a) \ - try { a } catch(...) 
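/*
** Illustrative sketch of the plain-C branch of the recovery scheme above
** (standalone, hypothetical names, no Lua state): each protected run
** pushes a jump buffer onto a chain; raising an error longjmps back to
** the innermost recovery point.
*/
#include <setjmp.h>
#include <stdio.h>

struct recover {
  struct recover *previous;   /* chain of active recovery points */
  jmp_buf b;
  volatile int status;        /* 0 = ok, otherwise an error code */
};

static struct recover *current = NULL;

static void raise_error(int code) {
  if (current != NULL) {
    current->status = code;
    longjmp(current->b, 1);   /* jump to the innermost protected call */
  }
  /* no handler at all: a real runtime would have to abort here */
}

static int run_protected(void (*f)(void *), void *ud) {
  struct recover r;
  r.status = 0;
  r.previous = current;       /* chain the new recovery point */
  current = &r;
  if (setjmp(r.b) == 0)
    f(ud);                    /* body; may call raise_error() */
  current = r.previous;       /* restore the previous recovery point */
  return r.status;
}

static void body(void *ud) {
  (void)ud;
  raise_error(42);            /* simulate a runtime error */
}

int main(void) {
  printf("status = %d\n", run_protected(body, NULL));  /* prints 42 */
  return 0;
}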
{ if ((c)->status == 0) (c)->status = -1; } -#define luai_jmpbuf int /* dummy variable */ - -#elif defined(LUA_USE_ULONGJMP) -/* in Unix, try _longjmp/_setjmp (more efficient) */ -#define LUAI_THROW(L,c) _longjmp((c)->b, 1) -#define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a } -#define luai_jmpbuf jmp_buf - -#else -/* default handling with long jumps */ -#define LUAI_THROW(L,c) longjmp((c)->b, 1) -#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } -#define luai_jmpbuf jmp_buf - -#endif - -#endif - -#endif - - -/* chain list of long jump buffers */ -struct lua_longjmp { - struct lua_longjmp *previous; - luai_jmpbuf b; - volatile int status; /* error code */ -}; - - -static void seterrorobj (lua_State *L, int errcode, StkId oldtop) { - switch (errcode) { - case LUA_ERRMEM: { /* memory error? */ - setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. */ - break; - } - case LUA_ERRERR: { - setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling")); - break; - } - default: { - setobjs2s(L, oldtop, L->top - 1); /* error message on current top */ - break; - } - } - L->top = oldtop + 1; -} - - -l_noret luaD_throw (lua_State *L, int errcode) { - if (L->errorJmp) { /* thread has an error handler? */ - L->errorJmp->status = errcode; /* set status */ - LUAI_THROW(L, L->errorJmp); /* jump to it */ - } - else { /* thread has no error handler */ - L->status = cast_byte(errcode); /* mark it as dead */ - if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */ - setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */ - luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */ - } - else { /* no handler at all; abort */ - if (G(L)->panic) { /* panic function? */ - lua_unlock(L); - G(L)->panic(L); /* call it (last chance to jump out) */ - } - panic("no error handler"); - } - } -} - - -int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) { - unsigned short oldnCcalls = L->nCcalls; - struct lua_longjmp lj; - lj.status = LUA_OK; - lj.previous = L->errorJmp; /* chain new error handler */ - L->errorJmp = &lj; - LUAI_TRY(L, &lj, - (*f)(L, ud); - ); - L->errorJmp = lj.previous; /* restore old error handler */ - L->nCcalls = oldnCcalls; - return lj.status; -} - -/* }====================================================== */ - - -static void correctstack (lua_State *L, TValue *oldstack) { - CallInfo *ci; - GCObject *up; - L->top = (L->top - oldstack) + L->stack; - for (up = L->openupval; up != NULL; up = up->gch.next) - gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack; - for (ci = L->ci; ci != NULL; ci = ci->previous) { - ci->top = (ci->top - oldstack) + L->stack; - ci->func = (ci->func - oldstack) + L->stack; - if (isLua(ci)) - ci->u.l.base = (ci->u.l.base - oldstack) + L->stack; - } -} - - -/* some space for error handling */ -#define ERRORSTACKSIZE (LUAI_MAXSTACK + 200) - - -void luaD_reallocstack (lua_State *L, int newsize) { - TValue *oldstack = L->stack; - int lim = L->stacksize; - lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE); - lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK); - luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue); - for (; lim < newsize; lim++) - setnilvalue(L->stack + lim); /* erase new segment */ - L->stacksize = newsize; - L->stack_last = L->stack + newsize - EXTRA_STACK; - correctstack(L, oldstack); -} - - -void luaD_growstack (lua_State *L, int n) { - int size = L->stacksize; - if (size > LUAI_MAXSTACK) /* error after extra size? 
*/ - luaD_throw(L, LUA_ERRERR); - else { - int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK; - int newsize = 2 * size; - if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK; - if (newsize < needed) newsize = needed; - if (newsize > LUAI_MAXSTACK) { /* stack overflow? */ - luaD_reallocstack(L, ERRORSTACKSIZE); - luaG_runerror(L, "stack overflow"); - } - else - luaD_reallocstack(L, newsize); - } -} - - -static int stackinuse (lua_State *L) { - CallInfo *ci; - StkId lim = L->top; - for (ci = L->ci; ci != NULL; ci = ci->previous) { - lua_assert(ci->top <= L->stack_last); - if (lim < ci->top) lim = ci->top; - } - return cast_int(lim - L->stack) + 1; /* part of stack in use */ -} - - -void luaD_shrinkstack (lua_State *L) { - int inuse = stackinuse(L); - int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK; - if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK; - if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */ - goodsize >= L->stacksize) /* would grow instead of shrink? */ - condmovestack(L); /* don't change stack (change only for debugging) */ - else - luaD_reallocstack(L, goodsize); /* shrink it */ -} - - -void luaD_hook (lua_State *L, int event, int line) { - lua_Hook hook = L->hook; - if (hook && L->allowhook) { - CallInfo *ci = L->ci; - ptrdiff_t top = savestack(L, L->top); - ptrdiff_t ci_top = savestack(L, ci->top); - lua_Debug ar; - ar.event = event; - ar.currentline = line; - ar.i_ci = ci; - luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ - ci->top = L->top + LUA_MINSTACK; - lua_assert(ci->top <= L->stack_last); - L->allowhook = 0; /* cannot call hooks inside a hook */ - ci->callstatus |= CIST_HOOKED; - lua_unlock(L); - (*hook)(L, &ar); - lua_lock(L); - lua_assert(!L->allowhook); - L->allowhook = 1; - ci->top = restorestack(L, ci_top); - L->top = restorestack(L, top); - ci->callstatus &= ~CIST_HOOKED; - } -} - - -static void callhook (lua_State *L, CallInfo *ci) { - int hook = LUA_HOOKCALL; - ci->u.l.savedpc++; /* hooks assume 'pc' is already incremented */ - if (isLua(ci->previous) && - GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) { - ci->callstatus |= CIST_TAIL; - hook = LUA_HOOKTAILCALL; - } - luaD_hook(L, hook, -1); - ci->u.l.savedpc--; /* correct 'pc' */ -} - - -static StkId adjust_varargs (lua_State *L, Proto *p, int actual) { - int i; - int nfixargs = p->numparams; - StkId base, fixed; - lua_assert(actual >= nfixargs); - /* move fixed parameters to final position */ - luaD_checkstack(L, p->maxstacksize); /* check again for new 'base' */ - fixed = L->top - actual; /* first fixed argument */ - base = L->top; /* final position of first argument */ - for (i=0; itop++, fixed + i); - setnilvalue(fixed + i); - } - return base; -} - - -static StkId tryfuncTM (lua_State *L, StkId func) { - const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL); - StkId p; - ptrdiff_t funcr = savestack(L, func); - if (!ttisfunction(tm)) - luaG_typeerror(L, func, "call"); - /* Open a hole inside the stack at `func' */ - for (p = L->top; p > func; p--) setobjs2s(L, p, p-1); - incr_top(L); - func = restorestack(L, funcr); /* previous call may change stack */ - setobj2s(L, func, tm); /* tag method is the new function to be called */ - return func; -} - - - -#define next_ci(L) (L->ci = (L->ci->next ? 
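/*
** Illustrative sketch of the growth policy used by the stack code above
** (standalone, hypothetical MAXSLOTS/EXTRA values): double the current
** size, never return less than what is actually needed, and clamp to the
** hard limit, reporting overflow when even the limit is not enough.
*/
#include <stdio.h>

#define MAXSLOTS  1000000   /* stand-in for LUAI_MAXSTACK */
#define EXTRA     5         /* stand-in for EXTRA_STACK */

/* returns the new size, or -1 to signal a stack overflow */
static int grow_size(int cursize, int inuse, int n) {
  int needed = inuse + n + EXTRA;
  int newsize = 2 * cursize;            /* geometric growth */
  if (newsize > MAXSLOTS) newsize = MAXSLOTS;
  if (newsize < needed) newsize = needed;
  if (newsize > MAXSLOTS) return -1;    /* even the clamped size is too small */
  return newsize;
}

int main(void) {
  printf("%d\n", grow_size(40, 38, 20));             /* 80  */
  printf("%d\n", grow_size(40, 38, 100));            /* 143 */
  printf("%d\n", grow_size(MAXSLOTS, MAXSLOTS, 1));  /* -1: overflow */
  return 0;
}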
L->ci->next : luaE_extendCI(L))) - - -/* -** returns true if function has been executed (C function) -*/ -int luaD_precall (lua_State *L, StkId func, int nresults) { - lua_CFunction f; - CallInfo *ci; - int n; /* number of arguments (Lua) or returns (C) */ - ptrdiff_t funcr = savestack(L, func); - switch (ttype(func)) { - case LUA_TLCF: /* light C function */ - f = fvalue(func); - goto Cfunc; - case LUA_TCCL: { /* C closure */ - f = clCvalue(func)->f; - Cfunc: - luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ - ci = next_ci(L); /* now 'enter' new function */ - ci->nresults = nresults; - ci->func = restorestack(L, funcr); - ci->top = L->top + LUA_MINSTACK; - lua_assert(ci->top <= L->stack_last); - ci->callstatus = 0; - luaC_checkGC(L); /* stack grow uses memory */ - if (L->hookmask & LUA_MASKCALL) - luaD_hook(L, LUA_HOOKCALL, -1); - lua_unlock(L); - n = (*f)(L); /* do the actual call */ - lua_lock(L); - api_checknelems(L, n); - luaD_poscall(L, L->top - n); - return 1; - } - case LUA_TLCL: { /* Lua function: prepare its call */ - StkId base; - Proto *p = clLvalue(func)->p; - n = cast_int(L->top - func) - 1; /* number of real arguments */ - luaD_checkstack(L, p->maxstacksize); - for (; n < p->numparams; n++) - setnilvalue(L->top++); /* complete missing arguments */ - if (!p->is_vararg) { - func = restorestack(L, funcr); - base = func + 1; - } - else { - base = adjust_varargs(L, p, n); - func = restorestack(L, funcr); /* previous call can change stack */ - } - ci = next_ci(L); /* now 'enter' new function */ - ci->nresults = nresults; - ci->func = func; - ci->u.l.base = base; - ci->top = base + p->maxstacksize; - lua_assert(ci->top <= L->stack_last); - ci->u.l.savedpc = p->code; /* starting point */ - ci->callstatus = CIST_LUA; - L->top = ci->top; - luaC_checkGC(L); /* stack grow uses memory */ - if (L->hookmask & LUA_MASKCALL) - callhook(L, ci); - return 0; - } - default: { /* not a function */ - func = tryfuncTM(L, func); /* retry with 'function' tag method */ - return luaD_precall(L, func, nresults); /* now it must be a function */ - } - } -} - - -int luaD_poscall (lua_State *L, StkId firstResult) { - StkId res; - int wanted, i; - CallInfo *ci = L->ci; - if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) { - if (L->hookmask & LUA_MASKRET) { - ptrdiff_t fr = savestack(L, firstResult); /* hook may change stack */ - luaD_hook(L, LUA_HOOKRET, -1); - firstResult = restorestack(L, fr); - } - L->oldpc = ci->previous->u.l.savedpc; /* 'oldpc' for caller function */ - } - res = ci->func; /* res == final position of 1st result */ - wanted = ci->nresults; - L->ci = ci = ci->previous; /* back to caller */ - /* move results to correct place */ - for (i = wanted; i != 0 && firstResult < L->top; i--) - setobjs2s(L, res++, firstResult++); - while (i-- > 0) - setnilvalue(res++); - L->top = res; - return (wanted - LUA_MULTRET); /* 0 iff wanted == LUA_MULTRET */ -} - - -/* -** Call a function (C or Lua). The function to be called is at *func. -** The arguments are on the stack, right after the function. -** When returns, all the results are on the stack, starting at the original -** function position. 
-*/ -void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) { - if (++L->nCcalls >= LUAI_MAXCCALLS) { - if (L->nCcalls == LUAI_MAXCCALLS) - luaG_runerror(L, "C stack overflow"); - else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3))) - luaD_throw(L, LUA_ERRERR); /* error while handing stack error */ - } - if (!allowyield) L->nny++; - if (!luaD_precall(L, func, nResults)) /* is a Lua function? */ - luaV_execute(L); /* call it */ - if (!allowyield) L->nny--; - L->nCcalls--; -} - - -static void finishCcall (lua_State *L) { - CallInfo *ci = L->ci; - int n; - lua_assert(ci->u.c.k != NULL); /* must have a continuation */ - lua_assert(L->nny == 0); - if (ci->callstatus & CIST_YPCALL) { /* was inside a pcall? */ - ci->callstatus &= ~CIST_YPCALL; /* finish 'lua_pcall' */ - L->errfunc = ci->u.c.old_errfunc; - } - /* finish 'lua_callk'/'lua_pcall' */ - adjustresults(L, ci->nresults); - /* call continuation function */ - if (!(ci->callstatus & CIST_STAT)) /* no call status? */ - ci->u.c.status = LUA_YIELD; /* 'default' status */ - lua_assert(ci->u.c.status != LUA_OK); - ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED; - lua_unlock(L); - n = (*ci->u.c.k)(L); - lua_lock(L); - api_checknelems(L, n); - /* finish 'luaD_precall' */ - luaD_poscall(L, L->top - n); -} - - -static void unroll (lua_State *L, void *ud) { - UNUSED(ud); - for (;;) { - if (L->ci == &L->base_ci) /* stack is empty? */ - return; /* coroutine finished normally */ - if (!isLua(L->ci)) /* C function? */ - finishCcall(L); - else { /* Lua function */ - luaV_finishOp(L); /* finish interrupted instruction */ - luaV_execute(L); /* execute down to higher C 'boundary' */ - } - } -} - - -/* -** check whether thread has a suspended protected call -*/ -static CallInfo *findpcall (lua_State *L) { - CallInfo *ci; - for (ci = L->ci; ci != NULL; ci = ci->previous) { /* search for a pcall */ - if (ci->callstatus & CIST_YPCALL) - return ci; - } - return NULL; /* no pending pcall */ -} - - -static int recover (lua_State *L, int status) { - StkId oldtop; - CallInfo *ci = findpcall(L); - if (ci == NULL) return 0; /* no recovery point */ - /* "finish" luaD_pcall */ - oldtop = restorestack(L, ci->extra); - luaF_close(L, oldtop); - seterrorobj(L, status, oldtop); - L->ci = ci; - L->allowhook = ci->u.c.old_allowhook; - L->nny = 0; /* should be zero to be yieldable */ - luaD_shrinkstack(L); - L->errfunc = ci->u.c.old_errfunc; - ci->callstatus |= CIST_STAT; /* call has error status */ - ci->u.c.status = status; /* (here it is) */ - return 1; /* continue running the coroutine */ -} - - -/* -** signal an error in the call to 'resume', not in the execution of the -** coroutine itself. (Such errors should not be handled by any coroutine -** error handler and should not kill the coroutine.) -*/ -static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) { - L->top = firstArg; /* remove args from the stack */ - setsvalue2s(L, L->top, luaS_new(L, msg)); /* push error message */ - api_incr_top(L); - luaD_throw(L, -1); /* jump back to 'lua_resume' */ -} - - -/* -** do the work for 'lua_resume' in protected mode -*/ -static void resume_cb (lua_State *L, void *ud) { - int nCcalls = L->nCcalls; - StkId firstArg = cast(StkId, ud); - CallInfo *ci = L->ci; - if (nCcalls >= LUAI_MAXCCALLS) - resume_error(L, "C stack overflow", firstArg); - if (L->status == LUA_OK) { /* may be starting a coroutine */ - if (ci != &L->base_ci) /* not in base level? 
*/ - resume_error(L, "cannot resume non-suspended coroutine", firstArg); - /* coroutine is in base level; start running it */ - if (!luaD_precall(L, firstArg - 1, LUA_MULTRET)) /* Lua function? */ - luaV_execute(L); /* call it */ - } - else if (L->status != LUA_YIELD) - resume_error(L, "cannot resume dead coroutine", firstArg); - else { /* resuming from previous yield */ - L->status = LUA_OK; - ci->func = restorestack(L, ci->extra); - if (isLua(ci)) /* yielded inside a hook? */ - luaV_execute(L); /* just continue running Lua code */ - else { /* 'common' yield */ - if (ci->u.c.k != NULL) { /* does it have a continuation? */ - int n; - ci->u.c.status = LUA_YIELD; /* 'default' status */ - ci->callstatus |= CIST_YIELDED; - lua_unlock(L); - n = (*ci->u.c.k)(L); /* call continuation */ - lua_lock(L); - api_checknelems(L, n); - firstArg = L->top - n; /* yield results come from continuation */ - } - luaD_poscall(L, firstArg); /* finish 'luaD_precall' */ - } - unroll(L, NULL); - } - lua_assert(nCcalls == L->nCcalls); -} - - -LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) { - int status; - int oldnny = L->nny; /* save 'nny' */ - lua_lock(L); - luai_userstateresume(L, nargs); - L->nCcalls = (from) ? from->nCcalls + 1 : 1; - L->nny = 0; /* allow yields */ - api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs); - status = luaD_rawrunprotected(L, resume_cb, L->top - nargs); - if (status == -1) /* error calling 'lua_resume'? */ - status = LUA_ERRRUN; - else { /* yield or regular error */ - while (status != LUA_OK && status != LUA_YIELD) { /* error? */ - if (recover(L, status)) /* recover point? */ - status = luaD_rawrunprotected(L, unroll, NULL); /* run continuation */ - else { /* unrecoverable error */ - L->status = cast_byte(status); /* mark thread as `dead' */ - seterrorobj(L, status, L->top); - L->ci->top = L->top; - break; - } - } - lua_assert(status == L->status); - } - L->nny = oldnny; /* restore 'nny' */ - L->nCcalls--; - lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0)); - lua_unlock(L); - return status; -} - - -LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) { - CallInfo *ci = L->ci; - luai_userstateyield(L, nresults); - lua_lock(L); - api_checknelems(L, nresults); - if (L->nny > 0) { - if (L != G(L)->mainthread) - luaG_runerror(L, "attempt to yield across a C-call boundary"); - else - luaG_runerror(L, "attempt to yield from outside a coroutine"); - } - L->status = LUA_YIELD; - ci->extra = savestack(L, ci->func); /* save current 'func' */ - if (isLua(ci)) { /* inside a hook? */ - api_check(L, k == NULL, "hooks cannot continue after yielding"); - } - else { - if ((ci->u.c.k = k) != NULL) /* is there a continuation? */ - ci->u.c.ctx = ctx; /* save context */ - ci->func = L->top - nresults - 1; /* protect stack below results */ - luaD_throw(L, LUA_YIELD); - } - lua_assert(ci->callstatus & CIST_HOOKED); /* must be inside a hook */ - lua_unlock(L); - return 0; /* return to 'luaD_hook' */ -} - - -int luaD_pcall (lua_State *L, Pfunc func, void *u, - ptrdiff_t old_top, ptrdiff_t ef) { - int status; - CallInfo *old_ci = L->ci; - lu_byte old_allowhooks = L->allowhook; - unsigned short old_nny = L->nny; - ptrdiff_t old_errfunc = L->errfunc; - L->errfunc = ef; - status = luaD_rawrunprotected(L, func, u); - if (status != LUA_OK) { /* an error occurred? 
*/ - StkId oldtop = restorestack(L, old_top); - luaF_close(L, oldtop); /* close possible pending closures */ - seterrorobj(L, status, oldtop); - L->ci = old_ci; - L->allowhook = old_allowhooks; - L->nny = old_nny; - luaD_shrinkstack(L); - } - L->errfunc = old_errfunc; - return status; -} - - - -/* -** Execute a protected parser. -*/ -struct SParser { /* data to `f_parser' */ - ZIO *z; - Mbuffer buff; /* dynamic structure used by the scanner */ - Dyndata dyd; /* dynamic structures used by the parser */ - const char *mode; - const char *name; -}; - - -static void checkmode (lua_State *L, const char *mode, const char *x) { - if (mode && strchr(mode, x[0]) == NULL) { - luaO_pushfstring(L, - "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode); - luaD_throw(L, LUA_ERRSYNTAX); - } -} - - -static void f_parser (lua_State *L, void *ud) { - int i; - Closure *cl; - struct SParser *p = cast(struct SParser *, ud); - int c = zgetc(p->z); /* read first character */ - if (c == LUA_SIGNATURE[0]) { - checkmode(L, p->mode, "binary"); - cl = luaU_undump(L, p->z, &p->buff, p->name); - } - else { - checkmode(L, p->mode, "text"); - cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c); - } - lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues); - for (i = 0; i < cl->l.nupvalues; i++) { /* initialize upvalues */ - UpVal *up = luaF_newupval(L); - cl->l.upvals[i] = up; - luaC_objbarrier(L, cl, up); - } -} - - -int luaD_protectedparser (lua_State *L, ZIO *z, const char *name, - const char *mode) { - struct SParser p; - int status; - L->nny++; /* cannot yield during parsing */ - p.z = z; p.name = name; p.mode = mode; - p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0; - p.dyd.gt.arr = NULL; p.dyd.gt.size = 0; - p.dyd.label.arr = NULL; p.dyd.label.size = 0; - luaZ_initbuffer(L, &p.buff); - status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc); - luaZ_freebuffer(L, &p.buff); - luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size); - luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size); - luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size); - L->nny--; - return status; -} - - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c +++ /dev/null @@ -1,173 +0,0 @@ -/* -** $Id: ldump.c,v 2.17.1.1 2013/04/12 18:48:47 roberto Exp $ -** save precompiled Lua chunks -** See Copyright Notice in lua.h -*/ - -#include - -#define ldump_c -#define LUA_CORE - -#include "lua.h" - -#include "lobject.h" -#include "lstate.h" -#include "lundump.h" - -typedef struct { - lua_State* L; - lua_Writer writer; - void* data; - int strip; - int status; -} DumpState; - -#define DumpMem(b,n,size,D) DumpBlock(b,(n)*(size),D) -#define DumpVar(x,D) DumpMem(&x,1,sizeof(x),D) - -static void DumpBlock(const void* b, size_t size, DumpState* D) -{ - if (D->status==0) - { - lua_unlock(D->L); - D->status=(*D->writer)(D->L,b,size,D->data); - lua_lock(D->L); - } -} - -static void DumpChar(int y, DumpState* D) -{ - char x=(char)y; - DumpVar(x,D); -} - -static void DumpInt(int x, DumpState* D) -{ - DumpVar(x,D); -} - -static void DumpNumber(lua_Number x, DumpState* D) -{ - DumpVar(x,D); -} - -static void DumpVector(const void* b, int n, size_t size, DumpState* D) -{ - DumpInt(n,D); - DumpMem(b,n,size,D); -} - -static void DumpString(const TString* s, DumpState* D) -{ - if (s==NULL) - { - size_t size=0; - DumpVar(size,D); - } - else - { - size_t size=s->tsv.len+1; /* include trailing '\0' */ - 
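/*
** Illustrative sketch of the writer-callback pattern the dump code above
** is built on (standalone, hypothetical names): every primitive is
** funnelled through one block-writing routine that forwards to a
** user-supplied callback and latches the first error status.
*/
#include <stdio.h>
#include <string.h>

typedef int (*writer_fn)(const void *p, size_t sz, void *ud);

typedef struct {
  writer_fn writer;
  void *data;
  int status;           /* 0 while everything has been written successfully */
} DumpSketch;

static void dump_block(const void *b, size_t size, DumpSketch *d) {
  if (d->status == 0)                        /* stop writing after the first error */
    d->status = d->writer(b, size, d->data);
}

#define dump_var(x, d)  dump_block(&(x), sizeof(x), d)

static void dump_string(const char *s, DumpSketch *d) {
  size_t size = (s == NULL) ? 0 : strlen(s) + 1;   /* include trailing '\0' */
  dump_var(size, d);
  if (size > 0) dump_block(s, size, d);
}

static int file_writer(const void *p, size_t sz, void *ud) {
  return fwrite(p, 1, sz, (FILE *)ud) != sz;       /* nonzero means failure */
}

int main(void) {
  DumpSketch d = { file_writer, stdout, 0 };
  dump_string("hello", &d);
  return d.status;
}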
DumpVar(size,D); - DumpBlock(getstr(s),size*sizeof(char),D); - } -} - -#define DumpCode(f,D) DumpVector(f->code,f->sizecode,sizeof(Instruction),D) - -static void DumpFunction(const Proto* f, DumpState* D); - -static void DumpConstants(const Proto* f, DumpState* D) -{ - int i,n=f->sizek; - DumpInt(n,D); - for (i=0; ik[i]; - DumpChar(ttypenv(o),D); - switch (ttypenv(o)) - { - case LUA_TNIL: - break; - case LUA_TBOOLEAN: - DumpChar(bvalue(o),D); - break; - case LUA_TNUMBER: - DumpNumber(nvalue(o),D); - break; - case LUA_TSTRING: - DumpString(rawtsvalue(o),D); - break; - default: lua_assert(0); - } - } - n=f->sizep; - DumpInt(n,D); - for (i=0; ip[i],D); -} - -static void DumpUpvalues(const Proto* f, DumpState* D) -{ - int i,n=f->sizeupvalues; - DumpInt(n,D); - for (i=0; iupvalues[i].instack,D); - DumpChar(f->upvalues[i].idx,D); - } -} - -static void DumpDebug(const Proto* f, DumpState* D) -{ - int i,n; - DumpString((D->strip) ? NULL : f->source,D); - n= (D->strip) ? 0 : f->sizelineinfo; - DumpVector(f->lineinfo,n,sizeof(int),D); - n= (D->strip) ? 0 : f->sizelocvars; - DumpInt(n,D); - for (i=0; ilocvars[i].varname,D); - DumpInt(f->locvars[i].startpc,D); - DumpInt(f->locvars[i].endpc,D); - } - n= (D->strip) ? 0 : f->sizeupvalues; - DumpInt(n,D); - for (i=0; iupvalues[i].name,D); -} - -static void DumpFunction(const Proto* f, DumpState* D) -{ - DumpInt(f->linedefined,D); - DumpInt(f->lastlinedefined,D); - DumpChar(f->numparams,D); - DumpChar(f->is_vararg,D); - DumpChar(f->maxstacksize,D); - DumpCode(f,D); - DumpConstants(f,D); - DumpUpvalues(f,D); - DumpDebug(f,D); -} - -static void DumpHeader(DumpState* D) -{ - lu_byte h[LUAC_HEADERSIZE]; - luaU_header(h); - DumpBlock(h,LUAC_HEADERSIZE,D); -} - -/* -** dump Lua function as precompiled chunk -*/ -int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip) -{ - DumpState D; - D.L=L; - D.writer=w; - D.data=data; - D.strip=strip; - D.status=0; - DumpHeader(&D); - DumpFunction(f,&D); - return D.status; -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.h +++ /dev/null @@ -1,33 +0,0 @@ -/* -** $Id: lfunc.h,v 2.8.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions to manipulate prototypes and closures -** See Copyright Notice in lua.h -*/ - -#ifndef lfunc_h -#define lfunc_h - - -#include "lobject.h" - - -#define sizeCclosure(n) (cast(int, sizeof(CClosure)) + \ - cast(int, sizeof(TValue)*((n)-1))) - -#define sizeLclosure(n) (cast(int, sizeof(LClosure)) + \ - cast(int, sizeof(TValue *)*((n)-1))) - - -LUAI_FUNC Proto *luaF_newproto (lua_State *L); -LUAI_FUNC Closure *luaF_newCclosure (lua_State *L, int nelems); -LUAI_FUNC Closure *luaF_newLclosure (lua_State *L, int nelems); -LUAI_FUNC UpVal *luaF_newupval (lua_State *L); -LUAI_FUNC UpVal *luaF_findupval (lua_State *L, StkId level); -LUAI_FUNC void luaF_close (lua_State *L, StkId level); -LUAI_FUNC void luaF_freeproto (lua_State *L, Proto *f); -LUAI_FUNC void luaF_freeupval (lua_State *L, UpVal *uv); -LUAI_FUNC const char *luaF_getlocalname (const Proto *func, int local_number, - int pc); - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c +++ /dev/null @@ -1,161 +0,0 @@ -/* -** $Id: lfunc.c,v 2.30.1.1 2013/04/12 18:48:47 roberto Exp $ -** Auxiliary functions to manipulate 
prototypes and closures -** See Copyright Notice in lua.h -*/ - - -#include - -#define lfunc_c -#define LUA_CORE - -#include "lua.h" - -#include "lfunc.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" - - - -Closure *luaF_newCclosure (lua_State *L, int n) { - Closure *c = &luaC_newobj(L, LUA_TCCL, sizeCclosure(n), NULL, 0)->cl; - c->c.nupvalues = cast_byte(n); - return c; -} - - -Closure *luaF_newLclosure (lua_State *L, int n) { - Closure *c = &luaC_newobj(L, LUA_TLCL, sizeLclosure(n), NULL, 0)->cl; - c->l.p = NULL; - c->l.nupvalues = cast_byte(n); - while (n--) c->l.upvals[n] = NULL; - return c; -} - - -UpVal *luaF_newupval (lua_State *L) { - UpVal *uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), NULL, 0)->uv; - uv->v = &uv->u.value; - setnilvalue(uv->v); - return uv; -} - - -UpVal *luaF_findupval (lua_State *L, StkId level) { - global_State *g = G(L); - GCObject **pp = &L->openupval; - UpVal *p; - UpVal *uv; - while (*pp != NULL && (p = gco2uv(*pp))->v >= level) { - GCObject *o = obj2gco(p); - lua_assert(p->v != &p->u.value); - lua_assert(!isold(o) || isold(obj2gco(L))); - if (p->v == level) { /* found a corresponding upvalue? */ - if (isdead(g, o)) /* is it dead? */ - changewhite(o); /* resurrect it */ - return p; - } - pp = &p->next; - } - /* not found: create a new one */ - uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), pp, 0)->uv; - uv->v = level; /* current value lives in the stack */ - uv->u.l.prev = &g->uvhead; /* double link it in `uvhead' list */ - uv->u.l.next = g->uvhead.u.l.next; - uv->u.l.next->u.l.prev = uv; - g->uvhead.u.l.next = uv; - lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv); - return uv; -} - - -static void unlinkupval (UpVal *uv) { - lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv); - uv->u.l.next->u.l.prev = uv->u.l.prev; /* remove from `uvhead' list */ - uv->u.l.prev->u.l.next = uv->u.l.next; -} - - -void luaF_freeupval (lua_State *L, UpVal *uv) { - if (uv->v != &uv->u.value) /* is it open? 
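/*
** Illustrative sketch of the open/closed upvalue idea behind the code
** above (standalone, plain ints instead of TValues): an upvalue either
** points at a live stack slot ("open") or at its own embedded copy
** ("closed"); closing it copies the value out and redirects the pointer.
*/
#include <stdio.h>

typedef struct UpvalSketch {
  int *v;        /* points to the current value */
  int value;     /* storage used once the upvalue is closed */
} UpvalSketch;

static int is_open(const UpvalSketch *uv) {
  return uv->v != &uv->value;
}

static void close_upval(UpvalSketch *uv) {
  if (is_open(uv)) {
    uv->value = *uv->v;   /* move the value out of the dying stack slot */
    uv->v = &uv->value;   /* from now on the upvalue owns its value */
  }
}

int main(void) {
  int stack[4] = { 10, 20, 30, 40 };
  UpvalSketch uv = { &stack[2], 0 };      /* open: aliases stack[2] */
  stack[2] = 99;                          /* visible through the upvalue */
  printf("open: %d (%d)\n", *uv.v, is_open(&uv));
  close_upval(&uv);                       /* e.g. the enclosing frame is popped */
  stack[2] = -1;                          /* no longer affects the upvalue */
  printf("closed: %d (%d)\n", *uv.v, is_open(&uv));
  return 0;
}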
*/ - unlinkupval(uv); /* remove from open list */ - luaM_free(L, uv); /* free upvalue */ -} - - -void luaF_close (lua_State *L, StkId level) { - UpVal *uv; - global_State *g = G(L); - while (L->openupval != NULL && (uv = gco2uv(L->openupval))->v >= level) { - GCObject *o = obj2gco(uv); - lua_assert(!isblack(o) && uv->v != &uv->u.value); - L->openupval = uv->next; /* remove from `open' list */ - if (isdead(g, o)) - luaF_freeupval(L, uv); /* free upvalue */ - else { - unlinkupval(uv); /* remove upvalue from 'uvhead' list */ - setobj(L, &uv->u.value, uv->v); /* move value to upvalue slot */ - uv->v = &uv->u.value; /* now current value lives here */ - gch(o)->next = g->allgc; /* link upvalue into 'allgc' list */ - g->allgc = o; - luaC_checkupvalcolor(g, uv); - } - } -} - - -Proto *luaF_newproto (lua_State *L) { - Proto *f = &luaC_newobj(L, LUA_TPROTO, sizeof(Proto), NULL, 0)->p; - f->k = NULL; - f->sizek = 0; - f->p = NULL; - f->sizep = 0; - f->code = NULL; - f->cache = NULL; - f->sizecode = 0; - f->lineinfo = NULL; - f->sizelineinfo = 0; - f->upvalues = NULL; - f->sizeupvalues = 0; - f->numparams = 0; - f->is_vararg = 0; - f->maxstacksize = 0; - f->locvars = NULL; - f->sizelocvars = 0; - f->linedefined = 0; - f->lastlinedefined = 0; - f->source = NULL; - return f; -} - - -void luaF_freeproto (lua_State *L, Proto *f) { - luaM_freearray(L, f->code, f->sizecode); - luaM_freearray(L, f->p, f->sizep); - luaM_freearray(L, f->k, f->sizek); - luaM_freearray(L, f->lineinfo, f->sizelineinfo); - luaM_freearray(L, f->locvars, f->sizelocvars); - luaM_freearray(L, f->upvalues, f->sizeupvalues); - luaM_free(L, f); -} - - -/* -** Look for n-th local variable at line `line' in function `func'. -** Returns NULL if not found. -*/ -const char *luaF_getlocalname (const Proto *f, int local_number, int pc) { - int i; - for (i = 0; isizelocvars && f->locvars[i].startpc <= pc; i++) { - if (pc < f->locvars[i].endpc) { /* is variable active? */ - local_number--; - if (local_number == 0) - return getstr(f->locvars[i].varname); - } - } - return NULL; /* not found */ -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.h +++ /dev/null @@ -1,157 +0,0 @@ -/* -** $Id: lgc.h,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $ -** Garbage Collector -** See Copyright Notice in lua.h -*/ - -#ifndef lgc_h -#define lgc_h - - -#include "lobject.h" -#include "lstate.h" - -/* -** Collectable objects may have one of three colors: white, which -** means the object is not marked; gray, which means the -** object is marked, but its references may be not marked; and -** black, which means that the object and all its references are marked. -** The main invariant of the garbage collector, while marking objects, -** is that a black object can never point to a white one. Moreover, -** any gray object must be in a "gray list" (gray, grayagain, weak, -** allweak, ephemeron) so that it can be visited again before finishing -** the collection cycle. These lists have no meaning when the invariant -** is not being enforced (e.g., sweep phase). 
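/*
** Illustrative sketch of the invariant stated above (standalone, one
** pointer field per object): a black object must never point to a white
** one, so a store either marks the new target (forward barrier) or turns
** the black holder gray again (backward barrier).
*/
#include <stdio.h>

typedef enum { WHITE, GRAY, BLACK } Color;

typedef struct Obj {
  Color color;
  struct Obj *ref;     /* the single outgoing reference of this object */
} Obj;

/* forward barrier: keep the invariant by marking the stored value */
static void barrier_forward(Obj *holder, Obj *v) {
  if (holder->color == BLACK && v->color == WHITE)
    v->color = GRAY;   /* v will be traversed before the cycle ends */
}

/* backward barrier: keep the invariant by re-scanning the holder */
static void barrier_back(Obj *holder, Obj *v) {
  if (holder->color == BLACK && v->color == WHITE)
    holder->color = GRAY;   /* holder goes back to a gray list */
}

static void store(Obj *holder, Obj *v, void (*barrier)(Obj *, Obj *)) {
  holder->ref = v;
  barrier(holder, v);
}

int main(void) {
  Obj a = { BLACK, NULL }, b = { WHITE, NULL }, c = { WHITE, NULL };
  store(&a, &b, barrier_forward);
  printf("forward:  a=%d b=%d\n", a.color, b.color);  /* a stays BLACK, b becomes GRAY */
  a.color = BLACK;
  store(&a, &c, barrier_back);
  printf("backward: a=%d c=%d\n", a.color, c.color);  /* a becomes GRAY, c stays WHITE */
  return 0;
}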
-*/ - - - -/* how much to allocate before next GC step */ -#if !defined(GCSTEPSIZE) -/* ~100 small strings */ -#define GCSTEPSIZE (cast_int(100 * sizeof(TString))) -#endif - - -/* -** Possible states of the Garbage Collector -*/ -#define GCSpropagate 0 -#define GCSatomic 1 -#define GCSsweepstring 2 -#define GCSsweepudata 3 -#define GCSsweep 4 -#define GCSpause 5 - - -#define issweepphase(g) \ - (GCSsweepstring <= (g)->gcstate && (g)->gcstate <= GCSsweep) - -#define isgenerational(g) ((g)->gckind == KGC_GEN) - -/* -** macros to tell when main invariant (white objects cannot point to black -** ones) must be kept. During a non-generational collection, the sweep -** phase may break the invariant, as objects turned white may point to -** still-black objects. The invariant is restored when sweep ends and -** all objects are white again. During a generational collection, the -** invariant must be kept all times. -*/ - -#define keepinvariant(g) (isgenerational(g) || g->gcstate <= GCSatomic) - - -/* -** Outside the collector, the state in generational mode is kept in -** 'propagate', so 'keepinvariant' is always true. -*/ -#define keepinvariantout(g) \ - check_exp(g->gcstate == GCSpropagate || !isgenerational(g), \ - g->gcstate <= GCSatomic) - - -/* -** some useful bit tricks -*/ -#define resetbits(x,m) ((x) &= cast(lu_byte, ~(m))) -#define setbits(x,m) ((x) |= (m)) -#define testbits(x,m) ((x) & (m)) -#define bitmask(b) (1<<(b)) -#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2)) -#define l_setbit(x,b) setbits(x, bitmask(b)) -#define resetbit(x,b) resetbits(x, bitmask(b)) -#define testbit(x,b) testbits(x, bitmask(b)) - - -/* Layout for bit use in `marked' field: */ -#define WHITE0BIT 0 /* object is white (type 0) */ -#define WHITE1BIT 1 /* object is white (type 1) */ -#define BLACKBIT 2 /* object is black */ -#define FINALIZEDBIT 3 /* object has been separated for finalization */ -#define SEPARATED 4 /* object is in 'finobj' list or in 'tobefnz' */ -#define FIXEDBIT 5 /* object is fixed (should not be collected) */ -#define OLDBIT 6 /* object is old (only in generational mode) */ -/* bit 7 is currently used by tests (luaL_checkmemory) */ - -#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT) - - -#define iswhite(x) testbits((x)->gch.marked, WHITEBITS) -#define isblack(x) testbit((x)->gch.marked, BLACKBIT) -#define isgray(x) /* neither white nor black */ \ - (!testbits((x)->gch.marked, WHITEBITS | bitmask(BLACKBIT))) - -#define isold(x) testbit((x)->gch.marked, OLDBIT) - -/* MOVE OLD rule: whenever an object is moved to the beginning of - a GC list, its old bit must be cleared */ -#define resetoldbit(o) resetbit((o)->gch.marked, OLDBIT) - -#define otherwhite(g) (g->currentwhite ^ WHITEBITS) -#define isdeadm(ow,m) (!(((m) ^ WHITEBITS) & (ow))) -#define isdead(g,v) isdeadm(otherwhite(g), (v)->gch.marked) - -#define changewhite(x) ((x)->gch.marked ^= WHITEBITS) -#define gray2black(x) l_setbit((x)->gch.marked, BLACKBIT) - -#define valiswhite(x) (iscollectable(x) && iswhite(gcvalue(x))) - -#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS) - - -#define luaC_condGC(L,c) \ - {if (G(L)->GCdebt > 0) {c;}; condchangemem(L);} -#define luaC_checkGC(L) luaC_condGC(L, luaC_step(L);) - - -#define luaC_barrier(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \ - luaC_barrier_(L,obj2gco(p),gcvalue(v)); } - -#define luaC_barrierback(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \ - luaC_barrierback_(L,p); } - -#define luaC_objbarrier(L,p,o) \ - { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \ 
- luaC_barrier_(L,obj2gco(p),obj2gco(o)); } - -#define luaC_objbarrierback(L,p,o) \ - { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) luaC_barrierback_(L,p); } - -#define luaC_barrierproto(L,p,c) \ - { if (isblack(obj2gco(p))) luaC_barrierproto_(L,p,c); } - -LUAI_FUNC void luaC_freeallobjects (lua_State *L); -LUAI_FUNC void luaC_step (lua_State *L); -LUAI_FUNC void luaC_forcestep (lua_State *L); -LUAI_FUNC void luaC_runtilstate (lua_State *L, int statesmask); -LUAI_FUNC void luaC_fullgc (lua_State *L, int isemergency); -LUAI_FUNC GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, - GCObject **list, int offset); -LUAI_FUNC void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v); -LUAI_FUNC void luaC_barrierback_ (lua_State *L, GCObject *o); -LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c); -LUAI_FUNC void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt); -LUAI_FUNC void luaC_checkupvalcolor (global_State *g, UpVal *uv); -LUAI_FUNC void luaC_changemode (lua_State *L, int mode); - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c +++ /dev/null @@ -1,1220 +0,0 @@ -/* -** $Id: lgc.c,v 2.140.1.3 2014/09/01 16:55:08 roberto Exp $ -** Garbage Collector -** See Copyright Notice in lua.h -*/ - -#include - -#define lgc_c -#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" - - - -/* -** cost of sweeping one element (the size of a small object divided -** by some adjust for the sweep speed) -*/ -#define GCSWEEPCOST ((sizeof(TString) + 4) / 4) - -/* maximum number of elements to sweep in each single step */ -#define GCSWEEPMAX (cast_int((GCSTEPSIZE / GCSWEEPCOST) / 4)) - -/* maximum number of finalizers to call in each GC step */ -#define GCFINALIZENUM 4 - - -/* -** macro to adjust 'stepmul': 'stepmul' is actually used like -** 'stepmul / STEPMULADJ' (value chosen by tests) -*/ -#define STEPMULADJ 200 - - -/* -** macro to adjust 'pause': 'pause' is actually used like -** 'pause / PAUSEADJ' (value chosen by tests) -*/ -#define PAUSEADJ 100 - - -/* -** 'makewhite' erases all color bits plus the old bit and then -** sets only the current white bit -*/ -#define maskcolors (~(bit2mask(BLACKBIT, OLDBIT) | WHITEBITS)) -#define makewhite(g,x) \ - (gch(x)->marked = cast_byte((gch(x)->marked & maskcolors) | luaC_white(g))) - -#define white2gray(x) resetbits(gch(x)->marked, WHITEBITS) -#define black2gray(x) resetbit(gch(x)->marked, BLACKBIT) - - -#define isfinalized(x) testbit(gch(x)->marked, FINALIZEDBIT) - -#define checkdeadkey(n) lua_assert(!ttisdeadkey(gkey(n)) || ttisnil(gval(n))) - - -#define checkconsistency(obj) \ - lua_longassert(!iscollectable(obj) || righttt(obj)) - - -#define markvalue(g,o) { checkconsistency(o); \ - if (valiswhite(o)) reallymarkobject(g,gcvalue(o)); } - -#define markobject(g,t) { if ((t) && iswhite(obj2gco(t))) \ - reallymarkobject(g, obj2gco(t)); } - -static void reallymarkobject (global_State *g, GCObject *o); - - -/* -** {====================================================== -** Generic functions -** ======================================================= -*/ - - -/* -** one after last element in a hash array -*/ -#define gnodelast(h) gnode(h, cast(size_t, sizenode(h))) - - -/* -** link table 'h' into 
list pointed by 'p' -*/ -#define linktable(h,p) ((h)->gclist = *(p), *(p) = obj2gco(h)) - - -/* -** if key is not marked, mark its entry as dead (therefore removing it -** from the table) -*/ -static void removeentry (Node *n) { - lua_assert(ttisnil(gval(n))); - if (valiswhite(gkey(n))) - setdeadvalue(gkey(n)); /* unused and unmarked key; remove it */ -} - - -/* -** tells whether a key or value can be cleared from a weak -** table. Non-collectable objects are never removed from weak -** tables. Strings behave as `values', so are never removed too. for -** other objects: if really collected, cannot keep them; for objects -** being finalized, keep them in keys, but not in values -*/ -static int iscleared (global_State *g, const TValue *o) { - if (!iscollectable(o)) return 0; - else if (ttisstring(o)) { - markobject(g, rawtsvalue(o)); /* strings are `values', so are never weak */ - return 0; - } - else return iswhite(gcvalue(o)); -} - - -/* -** barrier that moves collector forward, that is, mark the white object -** being pointed by a black object. -*/ -void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v) { - global_State *g = G(L); - lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o)); - lua_assert(g->gcstate != GCSpause); - lua_assert(gch(o)->tt != LUA_TTABLE); - if (keepinvariantout(g)) /* must keep invariant? */ - reallymarkobject(g, v); /* restore invariant */ - else { /* sweep phase */ - lua_assert(issweepphase(g)); - makewhite(g, o); /* mark main obj. as white to avoid other barriers */ - } -} - - -/* -** barrier that moves collector backward, that is, mark the black object -** pointing to a white object as gray again. (Current implementation -** only works for tables; access to 'gclist' is not uniform across -** different types.) -*/ -void luaC_barrierback_ (lua_State *L, GCObject *o) { - global_State *g = G(L); - lua_assert(isblack(o) && !isdead(g, o) && gch(o)->tt == LUA_TTABLE); - black2gray(o); /* make object gray (again) */ - gco2t(o)->gclist = g->grayagain; - g->grayagain = o; -} - - -/* -** barrier for prototypes. When creating first closure (cache is -** NULL), use a forward barrier; this may be the only closure of the -** prototype (if it is a "regular" function, with a single instance) -** and the prototype may be big, so it is better to avoid traversing -** it again. Otherwise, use a backward barrier, to avoid marking all -** possible instances. -*/ -LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c) { - global_State *g = G(L); - lua_assert(isblack(obj2gco(p))); - if (p->cache == NULL) { /* first time? */ - luaC_objbarrier(L, p, c); - } - else { /* use a backward barrier */ - black2gray(obj2gco(p)); /* make prototype gray (again) */ - p->gclist = g->grayagain; - g->grayagain = obj2gco(p); - } -} - - -/* -** check color (and invariants) for an upvalue that was closed, -** i.e., moved into the 'allgc' list -*/ -void luaC_checkupvalcolor (global_State *g, UpVal *uv) { - GCObject *o = obj2gco(uv); - lua_assert(!isblack(o)); /* open upvalues are never black */ - if (isgray(o)) { - if (keepinvariant(g)) { - resetoldbit(o); /* see MOVE OLD rule */ - gray2black(o); /* it is being visited now */ - markvalue(g, uv->v); - } - else { - lua_assert(issweepphase(g)); - makewhite(g, o); - } - } -} - - -/* -** create a new collectable object (with given type and size) and link -** it to '*list'. 'offset' tells how many bytes to allocate before the -** object itself (used only by states). 
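/*
** Illustrative sketch of what the allocator described above does
** (standalone, hypothetical GCNode type): allocate, stamp the object
** with the collector's current "white", and push it on an intrusive
** all-objects list.
*/
#include <stdio.h>
#include <stdlib.h>

typedef struct GCNode {
  struct GCNode *next;
  unsigned char marked;   /* current white at creation time */
  int tt;                 /* type tag */
} GCNode;

static GCNode *allgc = NULL;            /* list of every collectable object */
static unsigned char currentwhite = 1;

static GCNode *new_obj(int tt) {
  GCNode *o = malloc(sizeof(GCNode));
  if (o == NULL) { perror("malloc"); exit(1); }
  o->marked = currentwhite;             /* born white: not yet marked */
  o->tt = tt;
  o->next = allgc;                      /* link at the head of the list */
  allgc = o;
  return o;
}

int main(void) {
  new_obj(1);
  new_obj(2);
  for (GCNode *o = allgc; o != NULL; o = o->next)
    printf("obj tt=%d marked=%d\n", o->tt, o->marked);  /* prints tt=2 then tt=1 */
  while (allgc != NULL) { GCNode *n = allgc->next; free(allgc); allgc = n; }
  return 0;
}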
-*/ -GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list, - int offset) { - global_State *g = G(L); - char *raw = cast(char *, luaM_newobject(L, novariant(tt), sz)); - GCObject *o = obj2gco(raw + offset); - if (list == NULL) - list = &g->allgc; /* standard list for collectable objects */ - gch(o)->marked = luaC_white(g); - gch(o)->tt = tt; - gch(o)->next = *list; - *list = o; - return o; -} - -/* }====================================================== */ - - - -/* -** {====================================================== -** Mark functions -** ======================================================= -*/ - - -/* -** mark an object. Userdata, strings, and closed upvalues are visited -** and turned black here. Other objects are marked gray and added -** to appropriate list to be visited (and turned black) later. (Open -** upvalues are already linked in 'headuv' list.) -*/ -static void reallymarkobject (global_State *g, GCObject *o) { - lu_mem size; - white2gray(o); - switch (gch(o)->tt) { - case LUA_TSHRSTR: - case LUA_TLNGSTR: { - size = sizestring(gco2ts(o)); - break; /* nothing else to mark; make it black */ - } - case LUA_TUSERDATA: { - Table *mt = gco2u(o)->metatable; - markobject(g, mt); - markobject(g, gco2u(o)->env); - size = sizeudata(gco2u(o)); - break; - } - case LUA_TUPVAL: { - UpVal *uv = gco2uv(o); - markvalue(g, uv->v); - if (uv->v != &uv->u.value) /* open? */ - return; /* open upvalues remain gray */ - size = sizeof(UpVal); - break; - } - case LUA_TLCL: { - gco2lcl(o)->gclist = g->gray; - g->gray = o; - return; - } - case LUA_TCCL: { - gco2ccl(o)->gclist = g->gray; - g->gray = o; - return; - } - case LUA_TTABLE: { - linktable(gco2t(o), &g->gray); - return; - } - case LUA_TTHREAD: { - gco2th(o)->gclist = g->gray; - g->gray = o; - return; - } - case LUA_TPROTO: { - gco2p(o)->gclist = g->gray; - g->gray = o; - return; - } - default: lua_assert(0); return; - } - gray2black(o); - g->GCmemtrav += size; -} - - -/* -** mark metamethods for basic types -*/ -static void markmt (global_State *g) { - int i; - for (i=0; i < LUA_NUMTAGS; i++) - markobject(g, g->mt[i]); -} - - -/* -** mark all objects in list of being-finalized -*/ -static void markbeingfnz (global_State *g) { - GCObject *o; - for (o = g->tobefnz; o != NULL; o = gch(o)->next) { - makewhite(g, o); - reallymarkobject(g, o); - } -} - - -/* -** mark all values stored in marked open upvalues. (See comment in -** 'lstate.h'.) 
-*/ -static void remarkupvals (global_State *g) { - UpVal *uv; - for (uv = g->uvhead.u.l.next; uv != &g->uvhead; uv = uv->u.l.next) { - if (isgray(obj2gco(uv))) - markvalue(g, uv->v); - } -} - - -/* -** mark root set and reset all gray lists, to start a new -** incremental (or full) collection -*/ -static void restartcollection (global_State *g) { - g->gray = g->grayagain = NULL; - g->weak = g->allweak = g->ephemeron = NULL; - markobject(g, g->mainthread); - markvalue(g, &g->l_registry); - markmt(g); - markbeingfnz(g); /* mark any finalizing object left from previous cycle */ -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Traverse functions -** ======================================================= -*/ - -static void traverseweakvalue (global_State *g, Table *h) { - Node *n, *limit = gnodelast(h); - /* if there is array part, assume it may have white values (do not - traverse it just to check) */ - int hasclears = (h->sizearray > 0); - for (n = gnode(h, 0); n < limit; n++) { - checkdeadkey(n); - if (ttisnil(gval(n))) /* entry is empty? */ - removeentry(n); /* remove it */ - else { - lua_assert(!ttisnil(gkey(n))); - markvalue(g, gkey(n)); /* mark key */ - if (!hasclears && iscleared(g, gval(n))) /* is there a white value? */ - hasclears = 1; /* table will have to be cleared */ - } - } - if (hasclears) - linktable(h, &g->weak); /* has to be cleared later */ - else /* no white values */ - linktable(h, &g->grayagain); /* no need to clean */ -} - - -static int traverseephemeron (global_State *g, Table *h) { - int marked = 0; /* true if an object is marked in this traversal */ - int hasclears = 0; /* true if table has white keys */ - int prop = 0; /* true if table has entry "white-key -> white-value" */ - Node *n, *limit = gnodelast(h); - int i; - /* traverse array part (numeric keys are 'strong') */ - for (i = 0; i < h->sizearray; i++) { - if (valiswhite(&h->array[i])) { - marked = 1; - reallymarkobject(g, gcvalue(&h->array[i])); - } - } - /* traverse hash part */ - for (n = gnode(h, 0); n < limit; n++) { - checkdeadkey(n); - if (ttisnil(gval(n))) /* entry is empty? */ - removeentry(n); /* remove it */ - else if (iscleared(g, gkey(n))) { /* key is not marked (yet)? */ - hasclears = 1; /* table must be cleared */ - if (valiswhite(gval(n))) /* value not marked yet? */ - prop = 1; /* must propagate again */ - } - else if (valiswhite(gval(n))) { /* value not marked yet? */ - marked = 1; - reallymarkobject(g, gcvalue(gval(n))); /* mark it now */ - } - } - if (g->gcstate != GCSatomic || prop) - linktable(h, &g->ephemeron); /* have to propagate again */ - else if (hasclears) /* does table have white keys? */ - linktable(h, &g->allweak); /* may have to clean white keys */ - else /* no white keys */ - linktable(h, &g->grayagain); /* no need to clean */ - return marked; -} - - -static void traversestrongtable (global_State *g, Table *h) { - Node *n, *limit = gnodelast(h); - int i; - for (i = 0; i < h->sizearray; i++) /* traverse array part */ - markvalue(g, &h->array[i]); - for (n = gnode(h, 0); n < limit; n++) { /* traverse hash part */ - checkdeadkey(n); - if (ttisnil(gval(n))) /* entry is empty? 
*/ - removeentry(n); /* remove it */ - else { - lua_assert(!ttisnil(gkey(n))); - markvalue(g, gkey(n)); /* mark key */ - markvalue(g, gval(n)); /* mark value */ - } - } -} - - -static lu_mem traversetable (global_State *g, Table *h) { - const char *weakkey, *weakvalue; - const TValue *mode = gfasttm(g, h->metatable, TM_MODE); - markobject(g, h->metatable); - if (mode && ttisstring(mode) && /* is there a weak mode? */ - ((weakkey = strchr(svalue(mode), 'k')), - (weakvalue = strchr(svalue(mode), 'v')), - (weakkey || weakvalue))) { /* is really weak? */ - black2gray(obj2gco(h)); /* keep table gray */ - if (!weakkey) /* strong keys? */ - traverseweakvalue(g, h); - else if (!weakvalue) /* strong values? */ - traverseephemeron(g, h); - else /* all weak */ - linktable(h, &g->allweak); /* nothing to traverse now */ - } - else /* not weak */ - traversestrongtable(g, h); - return sizeof(Table) + sizeof(TValue) * h->sizearray + - sizeof(Node) * cast(size_t, sizenode(h)); -} - - -static int traverseproto (global_State *g, Proto *f) { - int i; - if (f->cache && iswhite(obj2gco(f->cache))) - f->cache = NULL; /* allow cache to be collected */ - markobject(g, f->source); - for (i = 0; i < f->sizek; i++) /* mark literals */ - markvalue(g, &f->k[i]); - for (i = 0; i < f->sizeupvalues; i++) /* mark upvalue names */ - markobject(g, f->upvalues[i].name); - for (i = 0; i < f->sizep; i++) /* mark nested protos */ - markobject(g, f->p[i]); - for (i = 0; i < f->sizelocvars; i++) /* mark local-variable names */ - markobject(g, f->locvars[i].varname); - return sizeof(Proto) + sizeof(Instruction) * f->sizecode + - sizeof(Proto *) * f->sizep + - sizeof(TValue) * f->sizek + - sizeof(int) * f->sizelineinfo + - sizeof(LocVar) * f->sizelocvars + - sizeof(Upvaldesc) * f->sizeupvalues; -} - - -static lu_mem traverseCclosure (global_State *g, CClosure *cl) { - int i; - for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */ - markvalue(g, &cl->upvalue[i]); - return sizeCclosure(cl->nupvalues); -} - -static lu_mem traverseLclosure (global_State *g, LClosure *cl) { - int i; - markobject(g, cl->p); /* mark its prototype */ - for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */ - markobject(g, cl->upvals[i]); - return sizeLclosure(cl->nupvalues); -} - - -static lu_mem traversestack (global_State *g, lua_State *th) { - int n = 0; - StkId o = th->stack; - if (o == NULL) - return 1; /* stack not completely built yet */ - for (; o < th->top; o++) /* mark live elements in the stack */ - markvalue(g, o); - if (g->gcstate == GCSatomic) { /* final traversal? */ - StkId lim = th->stack + th->stacksize; /* real end of stack */ - for (; o < lim; o++) /* clear not-marked stack slice */ - setnilvalue(o); - } - else { /* count call infos to compute size */ - CallInfo *ci; - for (ci = &th->base_ci; ci != th->ci; ci = ci->next) - n++; - } - return sizeof(lua_State) + sizeof(TValue) * th->stacksize + - sizeof(CallInfo) * n; -} - - -/* -** traverse one gray object, turning it to black (except for threads, -** which are always gray). 
-*/ -static void propagatemark (global_State *g) { - lu_mem size; - GCObject *o = g->gray; - lua_assert(isgray(o)); - gray2black(o); - switch (gch(o)->tt) { - case LUA_TTABLE: { - Table *h = gco2t(o); - g->gray = h->gclist; /* remove from 'gray' list */ - size = traversetable(g, h); - break; - } - case LUA_TLCL: { - LClosure *cl = gco2lcl(o); - g->gray = cl->gclist; /* remove from 'gray' list */ - size = traverseLclosure(g, cl); - break; - } - case LUA_TCCL: { - CClosure *cl = gco2ccl(o); - g->gray = cl->gclist; /* remove from 'gray' list */ - size = traverseCclosure(g, cl); - break; - } - case LUA_TTHREAD: { - lua_State *th = gco2th(o); - g->gray = th->gclist; /* remove from 'gray' list */ - th->gclist = g->grayagain; - g->grayagain = o; /* insert into 'grayagain' list */ - black2gray(o); - size = traversestack(g, th); - break; - } - case LUA_TPROTO: { - Proto *p = gco2p(o); - g->gray = p->gclist; /* remove from 'gray' list */ - size = traverseproto(g, p); - break; - } - default: lua_assert(0); return; - } - g->GCmemtrav += size; -} - - -static void propagateall (global_State *g) { - while (g->gray) propagatemark(g); -} - - -static void propagatelist (global_State *g, GCObject *l) { - lua_assert(g->gray == NULL); /* no grays left */ - g->gray = l; - propagateall(g); /* traverse all elements from 'l' */ -} - -/* -** retraverse all gray lists. Because tables may be reinserted in other -** lists when traversed, traverse the original lists to avoid traversing -** twice the same table (which is not wrong, but inefficient) -*/ -static void retraversegrays (global_State *g) { - GCObject *weak = g->weak; /* save original lists */ - GCObject *grayagain = g->grayagain; - GCObject *ephemeron = g->ephemeron; - g->weak = g->grayagain = g->ephemeron = NULL; - propagateall(g); /* traverse main gray list */ - propagatelist(g, grayagain); - propagatelist(g, weak); - propagatelist(g, ephemeron); -} - - -static void convergeephemerons (global_State *g) { - int changed; - do { - GCObject *w; - GCObject *next = g->ephemeron; /* get ephemeron list */ - g->ephemeron = NULL; /* tables will return to this list when traversed */ - changed = 0; - while ((w = next) != NULL) { - next = gco2t(w)->gclist; - if (traverseephemeron(g, gco2t(w))) { /* traverse marked some value? */ - propagateall(g); /* propagate changes */ - changed = 1; /* will have to revisit all ephemeron tables */ - } - } - } while (changed); -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Sweep Functions -** ======================================================= -*/ - - -/* -** clear entries with unmarked keys from all weaktables in list 'l' up -** to element 'f' -*/ -static void clearkeys (global_State *g, GCObject *l, GCObject *f) { - for (; l != f; l = gco2t(l)->gclist) { - Table *h = gco2t(l); - Node *n, *limit = gnodelast(h); - for (n = gnode(h, 0); n < limit; n++) { - if (!ttisnil(gval(n)) && (iscleared(g, gkey(n)))) { - setnilvalue(gval(n)); /* remove value ... */ - removeentry(n); /* and remove entry from table */ - } - } - } -} - - -/* -** clear entries with unmarked values from all weaktables in list 'l' up -** to element 'f' -*/ -static void clearvalues (global_State *g, GCObject *l, GCObject *f) { - for (; l != f; l = gco2t(l)->gclist) { - Table *h = gco2t(l); - Node *n, *limit = gnodelast(h); - int i; - for (i = 0; i < h->sizearray; i++) { - TValue *o = &h->array[i]; - if (iscleared(g, o)) /* value was collected? 
*/ - setnilvalue(o); /* remove value */ - } - for (n = gnode(h, 0); n < limit; n++) { - if (!ttisnil(gval(n)) && iscleared(g, gval(n))) { - setnilvalue(gval(n)); /* remove value ... */ - removeentry(n); /* and remove entry from table */ - } - } - } -} - - -static void freeobj (lua_State *L, GCObject *o) { - switch (gch(o)->tt) { - case LUA_TPROTO: luaF_freeproto(L, gco2p(o)); break; - case LUA_TLCL: { - luaM_freemem(L, o, sizeLclosure(gco2lcl(o)->nupvalues)); - break; - } - case LUA_TCCL: { - luaM_freemem(L, o, sizeCclosure(gco2ccl(o)->nupvalues)); - break; - } - case LUA_TUPVAL: luaF_freeupval(L, gco2uv(o)); break; - case LUA_TTABLE: luaH_free(L, gco2t(o)); break; - case LUA_TTHREAD: luaE_freethread(L, gco2th(o)); break; - case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break; - case LUA_TSHRSTR: - G(L)->strt.nuse--; - /* FALLTHROUGH */ - case LUA_TLNGSTR: { - luaM_freemem(L, o, sizestring(gco2ts(o))); - break; - } - default: lua_assert(0); - } -} - - -#define sweepwholelist(L,p) sweeplist(L,p,MAX_LUMEM) -static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count); - - -/* -** sweep the (open) upvalues of a thread and resize its stack and -** list of call-info structures. -*/ -static void sweepthread (lua_State *L, lua_State *L1) { - if (L1->stack == NULL) return; /* stack not completely built yet */ - sweepwholelist(L, &L1->openupval); /* sweep open upvalues */ - luaE_freeCI(L1); /* free extra CallInfo slots */ - /* should not change the stack during an emergency gc cycle */ - if (G(L)->gckind != KGC_EMERGENCY) - luaD_shrinkstack(L1); -} - - -/* -** sweep at most 'count' elements from a list of GCObjects erasing dead -** objects, where a dead (not alive) object is one marked with the "old" -** (non current) white and not fixed. -** In non-generational mode, change all non-dead objects back to white, -** preparing for next collection cycle. -** In generational mode, keep black objects black, and also mark them as -** old; stop when hitting an old object, as all objects after that -** one will be old too. -** When object is a thread, sweep its list of open upvalues too. -*/ -static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count) { - global_State *g = G(L); - int ow = otherwhite(g); - int toclear, toset; /* bits to clear and to set in all live objects */ - int tostop; /* stop sweep when this is true */ - if (isgenerational(g)) { /* generational mode? */ - toclear = ~0; /* clear nothing */ - toset = bitmask(OLDBIT); /* set the old bit of all surviving objects */ - tostop = bitmask(OLDBIT); /* do not sweep old generation */ - } - else { /* normal mode */ - toclear = maskcolors; /* clear all color bits + old bit */ - toset = luaC_white(g); /* make object white */ - tostop = 0; /* do not stop */ - } - while (*p != NULL && count-- > 0) { - GCObject *curr = *p; - int marked = gch(curr)->marked; - if (isdeadm(ow, marked)) { /* is 'curr' dead? */ - *p = gch(curr)->next; /* remove 'curr' from list */ - freeobj(L, curr); /* erase 'curr' */ - } - else { - if (testbits(marked, tostop)) - return NULL; /* stop sweeping this list */ - if (gch(curr)->tt == LUA_TTHREAD) - sweepthread(L, gco2th(curr)); /* sweep thread's upvalues */ - /* update marks */ - gch(curr)->marked = cast_byte((marked & toclear) | toset); - p = &gch(curr)->next; /* go to next element */ - } - } - return (*p == NULL) ? 
NULL : p; -} - - -/* -** sweep a list until a live object (or end of list) -*/ -static GCObject **sweeptolive (lua_State *L, GCObject **p, int *n) { - GCObject ** old = p; - int i = 0; - do { - i++; - p = sweeplist(L, p, 1); - } while (p == old); - if (n) *n += i; - return p; -} - -/* }====================================================== */ - - -/* -** {====================================================== -** Finalization -** ======================================================= -*/ - -static void checkSizes (lua_State *L) { - global_State *g = G(L); - if (g->gckind != KGC_EMERGENCY) { /* do not change sizes in emergency */ - int hs = g->strt.size / 2; /* half the size of the string table */ - if (g->strt.nuse < cast(lu_int32, hs)) /* using less than that half? */ - luaS_resize(L, hs); /* halve its size */ - luaZ_freebuffer(L, &g->buff); /* free concatenation buffer */ - } -} - - -static GCObject *udata2finalize (global_State *g) { - GCObject *o = g->tobefnz; /* get first element */ - lua_assert(isfinalized(o)); - g->tobefnz = gch(o)->next; /* remove it from 'tobefnz' list */ - gch(o)->next = g->allgc; /* return it to 'allgc' list */ - g->allgc = o; - resetbit(gch(o)->marked, SEPARATED); /* mark that it is not in 'tobefnz' */ - lua_assert(!isold(o)); /* see MOVE OLD rule */ - if (!keepinvariantout(g)) /* not keeping invariant? */ - makewhite(g, o); /* "sweep" object */ - return o; -} - - -static void dothecall (lua_State *L, void *ud) { - UNUSED(ud); - luaD_call(L, L->top - 2, 0, 0); -} - - -static void GCTM (lua_State *L, int propagateerrors) { - global_State *g = G(L); - const TValue *tm; - TValue v; - setgcovalue(L, &v, udata2finalize(g)); - tm = luaT_gettmbyobj(L, &v, TM_GC); - if (tm != NULL && ttisfunction(tm)) { /* is there a finalizer? */ - int status; - lu_byte oldah = L->allowhook; - int running = g->gcrunning; - L->allowhook = 0; /* stop debug hooks during GC metamethod */ - g->gcrunning = 0; /* avoid GC steps */ - setobj2s(L, L->top, tm); /* push finalizer... */ - setobj2s(L, L->top + 1, &v); /* ... and its argument */ - L->top += 2; /* and (next line) call the finalizer */ - status = luaD_pcall(L, dothecall, NULL, savestack(L, L->top - 2), 0); - L->allowhook = oldah; /* restore hooks */ - g->gcrunning = running; /* restore state */ - if (status != LUA_OK && propagateerrors) { /* error while running __gc? */ - if (status == LUA_ERRRUN) { /* is there an error object? */ - const char *msg = (ttisstring(L->top - 1)) - ? svalue(L->top - 1) - : "no message"; - luaO_pushfstring(L, "error in __gc metamethod (%s)", msg); - status = LUA_ERRGCMM; /* error in __gc metamethod */ - } - luaD_throw(L, status); /* re-throw error */ - } - } -} - - -/* -** move all unreachable objects (or 'all' objects) that need -** finalization from list 'finobj' to list 'tobefnz' (to be finalized) -*/ -static void separatetobefnz (lua_State *L, int all) { - global_State *g = G(L); - GCObject **p = &g->finobj; - GCObject *curr; - GCObject **lastnext = &g->tobefnz; - /* find last 'next' field in 'tobefnz' list (to add elements in its end) */ - while (*lastnext != NULL) - lastnext = &gch(*lastnext)->next; - while ((curr = *p) != NULL) { /* traverse all finalizable objects */ - lua_assert(!isfinalized(curr)); - lua_assert(testbit(gch(curr)->marked, SEPARATED)); - if (!(iswhite(curr) || all)) /* not being collected? 
*/ - p = &gch(curr)->next; /* don't bother with it */ - else { - l_setbit(gch(curr)->marked, FINALIZEDBIT); /* won't be finalized again */ - *p = gch(curr)->next; /* remove 'curr' from 'finobj' list */ - gch(curr)->next = *lastnext; /* link at the end of 'tobefnz' list */ - *lastnext = curr; - lastnext = &gch(curr)->next; - } - } -} - - -/* -** if object 'o' has a finalizer, remove it from 'allgc' list (must -** search the list to find it) and link it in 'finobj' list. -*/ -void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt) { - global_State *g = G(L); - if (testbit(gch(o)->marked, SEPARATED) || /* obj. is already separated... */ - isfinalized(o) || /* ... or is finalized... */ - gfasttm(g, mt, TM_GC) == NULL) /* or has no finalizer? */ - return; /* nothing to be done */ - else { /* move 'o' to 'finobj' list */ - GCObject **p; - GCheader *ho = gch(o); - if (g->sweepgc == &ho->next) { /* avoid removing current sweep object */ - lua_assert(issweepphase(g)); - g->sweepgc = sweeptolive(L, g->sweepgc, NULL); - } - /* search for pointer pointing to 'o' */ - for (p = &g->allgc; *p != o; p = &gch(*p)->next) { /* empty */ } - *p = ho->next; /* remove 'o' from root list */ - ho->next = g->finobj; /* link it in list 'finobj' */ - g->finobj = o; - l_setbit(ho->marked, SEPARATED); /* mark it as such */ - if (!keepinvariantout(g)) /* not keeping invariant? */ - makewhite(g, o); /* "sweep" object */ - else - resetoldbit(o); /* see MOVE OLD rule */ - } -} - -/* }====================================================== */ - - -/* -** {====================================================== -** GC control -** ======================================================= -*/ - - -/* -** set a reasonable "time" to wait before starting a new GC cycle; -** cycle will start when memory use hits threshold -*/ -static void setpause (global_State *g, l_mem estimate) { - l_mem debt, threshold; - estimate = estimate / PAUSEADJ; /* adjust 'estimate' */ - threshold = (g->gcpause < MAX_LMEM / estimate) /* overflow? */ - ? estimate * g->gcpause /* no overflow */ - : MAX_LMEM; /* overflow; truncate to maximum */ - debt = -cast(l_mem, threshold - gettotalbytes(g)); - luaE_setdebt(g, debt); -} - - -#define sweepphases \ - (bitmask(GCSsweepstring) | bitmask(GCSsweepudata) | bitmask(GCSsweep)) - - -/* -** enter first sweep phase (strings) and prepare pointers for other -** sweep phases. The calls to 'sweeptolive' make pointers point to an -** object inside the list (instead of to the header), so that the real -** sweep do not need to skip objects created between "now" and the start -** of the real sweep. -** Returns how many objects it swept. 
-*/ -static int entersweep (lua_State *L) { - global_State *g = G(L); - int n = 0; - g->gcstate = GCSsweepstring; - lua_assert(g->sweepgc == NULL && g->sweepfin == NULL); - /* prepare to sweep strings, finalizable objects, and regular objects */ - g->sweepstrgc = 0; - g->sweepfin = sweeptolive(L, &g->finobj, &n); - g->sweepgc = sweeptolive(L, &g->allgc, &n); - return n; -} - - -/* -** change GC mode -*/ -void luaC_changemode (lua_State *L, int mode) { - global_State *g = G(L); - if (mode == g->gckind) return; /* nothing to change */ - if (mode == KGC_GEN) { /* change to generational mode */ - /* make sure gray lists are consistent */ - luaC_runtilstate(L, bitmask(GCSpropagate)); - g->GCestimate = gettotalbytes(g); - g->gckind = KGC_GEN; - } - else { /* change to incremental mode */ - /* sweep all objects to turn them back to white - (as white has not changed, nothing extra will be collected) */ - g->gckind = KGC_NORMAL; - entersweep(L); - luaC_runtilstate(L, ~sweepphases); - } -} - - -/* -** call all pending finalizers -*/ -static void callallpendingfinalizers (lua_State *L, int propagateerrors) { - global_State *g = G(L); - while (g->tobefnz) { - resetoldbit(g->tobefnz); - GCTM(L, propagateerrors); - } -} - - -void luaC_freeallobjects (lua_State *L) { - global_State *g = G(L); - int i; - separatetobefnz(L, 1); /* separate all objects with finalizers */ - lua_assert(g->finobj == NULL); - callallpendingfinalizers(L, 0); - g->currentwhite = WHITEBITS; /* this "white" makes all objects look dead */ - g->gckind = KGC_NORMAL; - sweepwholelist(L, &g->finobj); /* finalizers can create objs. in 'finobj' */ - sweepwholelist(L, &g->allgc); - for (i = 0; i < g->strt.size; i++) /* free all string lists */ - sweepwholelist(L, &g->strt.hash[i]); - lua_assert(g->strt.nuse == 0); -} - - -static l_mem atomic (lua_State *L) { - global_State *g = G(L); - l_mem work = -cast(l_mem, g->GCmemtrav); /* start counting work */ - GCObject *origweak, *origall; - lua_assert(!iswhite(obj2gco(g->mainthread))); - markobject(g, L); /* mark running thread */ - /* registry and global metatables may be changed by API */ - markvalue(g, &g->l_registry); - markmt(g); /* mark basic metatables */ - /* remark occasional upvalues of (maybe) dead threads */ - remarkupvals(g); - propagateall(g); /* propagate changes */ - work += g->GCmemtrav; /* stop counting (do not (re)count grays) */ - /* traverse objects caught by write barrier and by 'remarkupvals' */ - retraversegrays(g); - work -= g->GCmemtrav; /* restart counting */ - convergeephemerons(g); - /* at this point, all strongly accessible objects are marked. */ - /* clear values from weak tables, before checking finalizers */ - clearvalues(g, g->weak, NULL); - clearvalues(g, g->allweak, NULL); - origweak = g->weak; origall = g->allweak; - work += g->GCmemtrav; /* stop counting (objects being finalized) */ - separatetobefnz(L, 0); /* separate objects to be finalized */ - markbeingfnz(g); /* mark objects that will be finalized */ - propagateall(g); /* remark, to propagate `preserveness' */ - work -= g->GCmemtrav; /* restart counting */ - convergeephemerons(g); - /* at this point, all resurrected objects are marked. 
*/ - /* remove dead objects from weak tables */ - clearkeys(g, g->ephemeron, NULL); /* clear keys from all ephemeron tables */ - clearkeys(g, g->allweak, NULL); /* clear keys from all allweak tables */ - /* clear values from resurrected weak tables */ - clearvalues(g, g->weak, origweak); - clearvalues(g, g->allweak, origall); - g->currentwhite = cast_byte(otherwhite(g)); /* flip current white */ - work += g->GCmemtrav; /* complete counting */ - return work; /* estimate of memory marked by 'atomic' */ -} - - -static lu_mem singlestep (lua_State *L) { - global_State *g = G(L); - switch (g->gcstate) { - case GCSpause: { - /* start to count memory traversed */ - g->GCmemtrav = g->strt.size * sizeof(GCObject*); - lua_assert(!isgenerational(g)); - restartcollection(g); - g->gcstate = GCSpropagate; - return g->GCmemtrav; - } - case GCSpropagate: { - if (g->gray) { - lu_mem oldtrav = g->GCmemtrav; - propagatemark(g); - return g->GCmemtrav - oldtrav; /* memory traversed in this step */ - } - else { /* no more `gray' objects */ - lu_mem work; - int sw; - g->gcstate = GCSatomic; /* finish mark phase */ - g->GCestimate = g->GCmemtrav; /* save what was counted */; - work = atomic(L); /* add what was traversed by 'atomic' */ - g->GCestimate += work; /* estimate of total memory traversed */ - sw = entersweep(L); - return work + sw * GCSWEEPCOST; - } - } - case GCSsweepstring: { - int i; - for (i = 0; i < GCSWEEPMAX && g->sweepstrgc + i < g->strt.size; i++) - sweepwholelist(L, &g->strt.hash[g->sweepstrgc + i]); - g->sweepstrgc += i; - if (g->sweepstrgc >= g->strt.size) /* no more strings to sweep? */ - g->gcstate = GCSsweepudata; - return i * GCSWEEPCOST; - } - case GCSsweepudata: { - if (g->sweepfin) { - g->sweepfin = sweeplist(L, g->sweepfin, GCSWEEPMAX); - return GCSWEEPMAX*GCSWEEPCOST; - } - else { - g->gcstate = GCSsweep; - return 0; - } - } - case GCSsweep: { - if (g->sweepgc) { - g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX); - return GCSWEEPMAX*GCSWEEPCOST; - } - else { - /* sweep main thread */ - GCObject *mt = obj2gco(g->mainthread); - sweeplist(L, &mt, 1); - checkSizes(L); - g->gcstate = GCSpause; /* finish collection */ - return GCSWEEPCOST; - } - } - default: lua_assert(0); return 0; - } -} - - -/* -** advances the garbage collector until it reaches a state allowed -** by 'statemask' -*/ -void luaC_runtilstate (lua_State *L, int statesmask) { - global_State *g = G(L); - while (!testbit(statesmask, g->gcstate)) - singlestep(L); -} - - -static void generationalcollection (lua_State *L) { - global_State *g = G(L); - lua_assert(g->gcstate == GCSpropagate); - if (g->GCestimate == 0) { /* signal for another major collection? */ - luaC_fullgc(L, 0); /* perform a full regular collection */ - g->GCestimate = gettotalbytes(g); /* update control */ - } - else { - lu_mem estimate = g->GCestimate; - luaC_runtilstate(L, bitmask(GCSpause)); /* run complete (minor) cycle */ - g->gcstate = GCSpropagate; /* skip restart */ - if (gettotalbytes(g) > (estimate / 100) * g->gcmajorinc) - g->GCestimate = 0; /* signal for a major collection */ - else - g->GCestimate = estimate; /* keep estimate from last major coll. 
*/ - - } - setpause(g, gettotalbytes(g)); - lua_assert(g->gcstate == GCSpropagate); -} - - -static void incstep (lua_State *L) { - global_State *g = G(L); - l_mem debt = g->GCdebt; - int stepmul = g->gcstepmul; - if (stepmul < 40) stepmul = 40; /* avoid ridiculous low values (and 0) */ - /* convert debt from Kb to 'work units' (avoid zero debt and overflows) */ - debt = (debt / STEPMULADJ) + 1; - debt = (debt < MAX_LMEM / stepmul) ? debt * stepmul : MAX_LMEM; - do { /* always perform at least one single step */ - lu_mem work = singlestep(L); /* do some work */ - debt -= work; - } while (debt > -GCSTEPSIZE && g->gcstate != GCSpause); - if (g->gcstate == GCSpause) - setpause(g, g->GCestimate); /* pause until next cycle */ - else { - debt = (debt / stepmul) * STEPMULADJ; /* convert 'work units' to Kb */ - luaE_setdebt(g, debt); - } -} - - -/* -** performs a basic GC step -*/ -void luaC_forcestep (lua_State *L) { - global_State *g = G(L); - int i; - if (isgenerational(g)) generationalcollection(L); - else incstep(L); - /* run a few finalizers (or all of them at the end of a collect cycle) */ - for (i = 0; g->tobefnz && (i < GCFINALIZENUM || g->gcstate == GCSpause); i++) - GCTM(L, 1); /* call one finalizer */ -} - - -/* -** performs a basic GC step only if collector is running -*/ -void luaC_step (lua_State *L) { - global_State *g = G(L); - if (g->gcrunning) luaC_forcestep(L); - else luaE_setdebt(g, -GCSTEPSIZE); /* avoid being called too often */ -} - - - -/* -** performs a full GC cycle; if "isemergency", does not call -** finalizers (which could change stack positions) -*/ -void luaC_fullgc (lua_State *L, int isemergency) { - global_State *g = G(L); - int origkind = g->gckind; - lua_assert(origkind != KGC_EMERGENCY); - if (isemergency) /* do not run finalizers during emergency GC */ - g->gckind = KGC_EMERGENCY; - else { - g->gckind = KGC_NORMAL; - callallpendingfinalizers(L, 1); - } - if (keepinvariant(g)) { /* may there be some black objects? */ - /* must sweep all objects to turn them back to white - (as white has not changed, nothing will be collected) */ - entersweep(L); - } - /* finish any pending sweep phase to start a new cycle */ - luaC_runtilstate(L, bitmask(GCSpause)); - luaC_runtilstate(L, ~bitmask(GCSpause)); /* start new collection */ - luaC_runtilstate(L, bitmask(GCSpause)); /* run entire collection */ - if (origkind == KGC_GEN) { /* generational mode? 
*/ - /* generational mode must be kept in propagate phase */ - luaC_runtilstate(L, bitmask(GCSpropagate)); - } - g->gckind = origkind; - setpause(g, gettotalbytes(g)); - if (!isemergency) /* do not run finalizers during emergency GC */ - callallpendingfinalizers(L, 1); -} - -/* }====================================================== */ - - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.h +++ /dev/null @@ -1,78 +0,0 @@ -/* -** $Id: llex.h,v 1.72.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lexical Analyzer -** See Copyright Notice in lua.h -*/ - -#ifndef llex_h -#define llex_h - -#include "lobject.h" -#include "lzio.h" - - -#define FIRST_RESERVED 257 - - - -/* -* WARNING: if you change the order of this enumeration, -* grep "ORDER RESERVED" -*/ -enum RESERVED { - /* terminal symbols denoted by reserved words */ - TK_AND = FIRST_RESERVED, TK_BREAK, - TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION, - TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT, - TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE, - /* other terminal symbols */ - TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_DBCOLON, TK_EOS, - TK_NUMBER, TK_NAME, TK_STRING -}; - -/* number of reserved words */ -#define NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1)) - - -typedef union { - lua_Number r; - TString *ts; -} SemInfo; /* semantics information */ - - -typedef struct Token { - int token; - SemInfo seminfo; -} Token; - - -/* state of the lexer plus state of the parser when shared by all - functions */ -typedef struct LexState { - int current; /* current character (charint) */ - int linenumber; /* input line counter */ - int lastline; /* line of last token `consumed' */ - Token t; /* current token */ - Token lookahead; /* look ahead token */ - struct FuncState *fs; /* current function (parser) */ - struct lua_State *L; - ZIO *z; /* input stream */ - Mbuffer *buff; /* buffer for tokens */ - struct Dyndata *dyd; /* dynamic structures used by the parser */ - TString *source; /* current source name */ - TString *envn; /* environment variable name */ - char decpoint; /* locale decimal point */ -} LexState; - - -LUAI_FUNC void luaX_init (lua_State *L); -LUAI_FUNC void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, - TString *source, int firstchar); -LUAI_FUNC TString *luaX_newstring (LexState *ls, const char *str, size_t l); -LUAI_FUNC void luaX_next (LexState *ls); -LUAI_FUNC int luaX_lookahead (LexState *ls); -LUAI_FUNC l_noret luaX_syntaxerror (LexState *ls, const char *s); -LUAI_FUNC const char *luaX_token2str (LexState *ls, int token); - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c +++ /dev/null @@ -1,529 +0,0 @@ -/* -** $Id: llex.c,v 2.63.1.3 2015/02/09 17:56:34 roberto Exp $ -** Lexical Analyzer -** See Copyright Notice in lua.h -*/ - -#include - -#define llex_c -#define LUA_CORE - -#include "lua.h" - -#include "lctype.h" -#include "ldo.h" -#include "llex.h" -#include "lobject.h" -#include "lparser.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "lzio.h" - - - -#define next(ls) (ls->current = zgetc(ls->z)) - - - -#define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r') - - -/* ORDER RESERVED */ -static const char *const luaX_tokens [] = { - 
"and", "break", "do", "else", "elseif", - "end", "false", "for", "function", "goto", "if", - "in", "local", "nil", "not", "or", "repeat", - "return", "then", "true", "until", "while", - "..", "...", "==", ">=", "<=", "~=", "::", "", - "", "", "" -}; - - -#define save_and_next(ls) (save(ls, ls->current), next(ls)) - - -static l_noret lexerror (LexState *ls, const char *msg, int token); - - -static void save (LexState *ls, int c) { - Mbuffer *b = ls->buff; - if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) { - size_t newsize; - if (luaZ_sizebuffer(b) >= MAX_SIZET/2) - lexerror(ls, "lexical element too long", 0); - newsize = luaZ_sizebuffer(b) * 2; - luaZ_resizebuffer(ls->L, b, newsize); - } - b->buffer[luaZ_bufflen(b)++] = cast(char, c); -} - - -void luaX_init (lua_State *L) { - int i; - for (i=0; itsv.extra = cast_byte(i+1); /* reserved word */ - } -} - - -const char *luaX_token2str (LexState *ls, int token) { - if (token < FIRST_RESERVED) { /* single-byte symbols? */ - lua_assert(token == cast(unsigned char, token)); - return (lisprint(token)) ? luaO_pushfstring(ls->L, LUA_QL("%c"), token) : - luaO_pushfstring(ls->L, "char(%d)", token); - } - else { - const char *s = luaX_tokens[token - FIRST_RESERVED]; - if (token < TK_EOS) /* fixed format (symbols and reserved words)? */ - return luaO_pushfstring(ls->L, LUA_QS, s); - else /* names, strings, and numerals */ - return s; - } -} - - -static const char *txtToken (LexState *ls, int token) { - switch (token) { - case TK_NAME: - case TK_STRING: - case TK_NUMBER: - save(ls, '\0'); - return luaO_pushfstring(ls->L, LUA_QS, luaZ_buffer(ls->buff)); - default: - return luaX_token2str(ls, token); - } -} - - -static l_noret lexerror (LexState *ls, const char *msg, int token) { - char buff[LUA_IDSIZE]; - luaO_chunkid(buff, getstr(ls->source), LUA_IDSIZE); - msg = luaO_pushfstring(ls->L, "%s:%d: %s", buff, ls->linenumber, msg); - if (token) - luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token)); - luaD_throw(ls->L, LUA_ERRSYNTAX); -} - - -l_noret luaX_syntaxerror (LexState *ls, const char *msg) { - lexerror(ls, msg, ls->t.token); -} - - -/* -** creates a new string and anchors it in function's table so that -** it will not be collected until the end of the function's compilation -** (by that time it should be anchored in function's prototype) -*/ -TString *luaX_newstring (LexState *ls, const char *str, size_t l) { - lua_State *L = ls->L; - TValue *o; /* entry for `str' */ - TString *ts = luaS_newlstr(L, str, l); /* create new string */ - setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */ - o = luaH_set(L, ls->fs->h, L->top - 1); - if (ttisnil(o)) { /* not in use yet? 
(see 'addK') */ - /* boolean value does not need GC barrier; - table has no metatable, so it does not need to invalidate cache */ - setbvalue(o, 1); /* t[string] = true */ - luaC_checkGC(L); - } - else { /* string already present */ - ts = rawtsvalue(keyfromval(o)); /* re-use value previously stored */ - } - L->top--; /* remove string from stack */ - return ts; -} - - -/* -** increment line number and skips newline sequence (any of -** \n, \r, \n\r, or \r\n) -*/ -static void inclinenumber (LexState *ls) { - int old = ls->current; - lua_assert(currIsNewline(ls)); - next(ls); /* skip `\n' or `\r' */ - if (currIsNewline(ls) && ls->current != old) - next(ls); /* skip `\n\r' or `\r\n' */ - if (++ls->linenumber >= MAX_INT) - lexerror(ls, "chunk has too many lines", 0); -} - - -void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source, - int firstchar) { - ls->decpoint = '.'; - ls->L = L; - ls->current = firstchar; - ls->lookahead.token = TK_EOS; /* no look-ahead token */ - ls->z = z; - ls->fs = NULL; - ls->linenumber = 1; - ls->lastline = 1; - ls->source = source; - ls->envn = luaS_new(L, LUA_ENV); /* create env name */ - luaS_fix(ls->envn); /* never collect this name */ - luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */ -} - - - -/* -** ======================================================= -** LEXICAL ANALYZER -** ======================================================= -*/ - - - -static int check_next (LexState *ls, const char *set) { - if (ls->current == '\0' || !strchr(set, ls->current)) - return 0; - save_and_next(ls); - return 1; -} - - -/* -** change all characters 'from' in buffer to 'to' -*/ -static void buffreplace (LexState *ls, char from, char to) { - size_t n = luaZ_bufflen(ls->buff); - char *p = luaZ_buffer(ls->buff); - while (n--) - if (p[n] == from) p[n] = to; -} - - -#if !defined(getlocaledecpoint) -#define getlocaledecpoint() (localeconv()->decimal_point[0]) -#endif - - -#define buff2d(b,e) luaO_str2d(luaZ_buffer(b), luaZ_bufflen(b) - 1, e) - -/* -** in case of format error, try to change decimal point separator to -** the one defined in the current locale and check again -*/ -static void trydecpoint (LexState *ls, SemInfo *seminfo) { - char old = ls->decpoint; - ls->decpoint = getlocaledecpoint(); - buffreplace(ls, old, ls->decpoint); /* try new decimal separator */ - if (!buff2d(ls->buff, &seminfo->r)) { - /* format error with correct decimal point: no more options */ - buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */ - lexerror(ls, "malformed number", TK_NUMBER); - } -} - - -/* LUA_NUMBER */ -/* -** this function is quite liberal in what it accepts, as 'luaO_str2d' -** will reject ill-formed numerals. -*/ -static void read_numeral (LexState *ls, SemInfo *seminfo) { - const char *expo = "Ee"; - int first = ls->current; - lua_assert(lisdigit(ls->current)); - save_and_next(ls); - if (first == '0' && check_next(ls, "Xx")) /* hexadecimal? */ - expo = "Pp"; - for (;;) { - if (check_next(ls, expo)) /* exponent part? */ - check_next(ls, "+-"); /* optional exponent sign */ - if (lisxdigit(ls->current) || ls->current == '.') - save_and_next(ls); - else break; - } - save(ls, '\0'); - buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */ - if (!buff2d(ls->buff, &seminfo->r)) /* format error? 
*/ - trydecpoint(ls, seminfo); /* try to update decimal point separator */ -} - - -/* -** skip a sequence '[=*[' or ']=*]' and return its number of '='s or -** -1 if sequence is malformed -*/ -static int skip_sep (LexState *ls) { - int count = 0; - int s = ls->current; - lua_assert(s == '[' || s == ']'); - save_and_next(ls); - while (ls->current == '=') { - save_and_next(ls); - count++; - } - return (ls->current == s) ? count : (-count) - 1; -} - - -static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) { - save_and_next(ls); /* skip 2nd `[' */ - if (currIsNewline(ls)) /* string starts with a newline? */ - inclinenumber(ls); /* skip it */ - for (;;) { - switch (ls->current) { - case EOZ: - lexerror(ls, (seminfo) ? "unfinished long string" : - "unfinished long comment", TK_EOS); - break; /* to avoid warnings */ - case ']': { - if (skip_sep(ls) == sep) { - save_and_next(ls); /* skip 2nd `]' */ - goto endloop; - } - break; - } - case '\n': case '\r': { - save(ls, '\n'); - inclinenumber(ls); - if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */ - break; - } - default: { - if (seminfo) save_and_next(ls); - else next(ls); - } - } - } endloop: - if (seminfo) - seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep), - luaZ_bufflen(ls->buff) - 2*(2 + sep)); -} - - -static void escerror (LexState *ls, int *c, int n, const char *msg) { - int i; - luaZ_resetbuffer(ls->buff); /* prepare error message */ - save(ls, '\\'); - for (i = 0; i < n && c[i] != EOZ; i++) - save(ls, c[i]); - lexerror(ls, msg, TK_STRING); -} - - -static int readhexaesc (LexState *ls) { - int c[3], i; /* keep input for error message */ - int r = 0; /* result accumulator */ - c[0] = 'x'; /* for error message */ - for (i = 1; i < 3; i++) { /* read two hexadecimal digits */ - c[i] = next(ls); - if (!lisxdigit(c[i])) - escerror(ls, c, i + 1, "hexadecimal digit expected"); - r = (r << 4) + luaO_hexavalue(c[i]); - } - return r; -} - - -static int readdecesc (LexState *ls) { - int c[3], i; - int r = 0; /* result accumulator */ - for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */ - c[i] = ls->current; - r = 10*r + c[i] - '0'; - next(ls); - } - if (r > UCHAR_MAX) - escerror(ls, c, i, "decimal escape too large"); - return r; -} - - -static void read_string (LexState *ls, int del, SemInfo *seminfo) { - save_and_next(ls); /* keep delimiter (for error messages) */ - while (ls->current != del) { - switch (ls->current) { - case EOZ: - lexerror(ls, "unfinished string", TK_EOS); - break; /* to avoid warnings */ - case '\n': - case '\r': - lexerror(ls, "unfinished string", TK_STRING); - break; /* to avoid warnings */ - case '\\': { /* escape sequences */ - int c; /* final character to be saved */ - next(ls); /* do not save the `\' */ - switch (ls->current) { - case 'a': c = '\a'; goto read_save; - case 'b': c = '\b'; goto read_save; - case 'f': c = '\f'; goto read_save; - case 'n': c = '\n'; goto read_save; - case 'r': c = '\r'; goto read_save; - case 't': c = '\t'; goto read_save; - case 'v': c = '\v'; goto read_save; - case 'x': c = readhexaesc(ls); goto read_save; - case '\n': case '\r': - inclinenumber(ls); c = '\n'; goto only_save; - case '\\': case '\"': case '\'': - c = ls->current; goto read_save; - case EOZ: goto no_save; /* will raise an error next loop */ - case 'z': { /* zap following span of spaces */ - next(ls); /* skip the 'z' */ - while (lisspace(ls->current)) { - if (currIsNewline(ls)) inclinenumber(ls); - else next(ls); - } - goto no_save; - } - default: { - if 
(!lisdigit(ls->current)) - escerror(ls, &ls->current, 1, "invalid escape sequence"); - /* digital escape \ddd */ - c = readdecesc(ls); - goto only_save; - } - } - read_save: next(ls); /* read next character */ - only_save: save(ls, c); /* save 'c' */ - no_save: break; - } - default: - save_and_next(ls); - } - } - save_and_next(ls); /* skip delimiter */ - seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1, - luaZ_bufflen(ls->buff) - 2); -} - - -static int llex (LexState *ls, SemInfo *seminfo) { - luaZ_resetbuffer(ls->buff); - for (;;) { - switch (ls->current) { - case '\n': case '\r': { /* line breaks */ - inclinenumber(ls); - break; - } - case ' ': case '\f': case '\t': case '\v': { /* spaces */ - next(ls); - break; - } - case '-': { /* '-' or '--' (comment) */ - next(ls); - if (ls->current != '-') return '-'; - /* else is a comment */ - next(ls); - if (ls->current == '[') { /* long comment? */ - int sep = skip_sep(ls); - luaZ_resetbuffer(ls->buff); /* `skip_sep' may dirty the buffer */ - if (sep >= 0) { - read_long_string(ls, NULL, sep); /* skip long comment */ - luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */ - break; - } - } - /* else short comment */ - while (!currIsNewline(ls) && ls->current != EOZ) - next(ls); /* skip until end of line (or end of file) */ - break; - } - case '[': { /* long string or simply '[' */ - int sep = skip_sep(ls); - if (sep >= 0) { - read_long_string(ls, seminfo, sep); - return TK_STRING; - } - else if (sep == -1) return '['; - else lexerror(ls, "invalid long string delimiter", TK_STRING); - } - case '=': { - next(ls); - if (ls->current != '=') return '='; - else { next(ls); return TK_EQ; } - } - case '<': { - next(ls); - if (ls->current != '=') return '<'; - else { next(ls); return TK_LE; } - } - case '>': { - next(ls); - if (ls->current != '=') return '>'; - else { next(ls); return TK_GE; } - } - case '~': { - next(ls); - if (ls->current != '=') return '~'; - else { next(ls); return TK_NE; } - } - case ':': { - next(ls); - if (ls->current != ':') return ':'; - else { next(ls); return TK_DBCOLON; } - } - case '"': case '\'': { /* short literal strings */ - read_string(ls, ls->current, seminfo); - return TK_STRING; - } - case '.': { /* '.', '..', '...', or number */ - save_and_next(ls); - if (check_next(ls, ".")) { - if (check_next(ls, ".")) - return TK_DOTS; /* '...' */ - else return TK_CONCAT; /* '..' */ - } - else if (!lisdigit(ls->current)) return '.'; - /* else go through */ - } - /* FALLTHROUGH */ - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': { - read_numeral(ls, seminfo); - return TK_NUMBER; - } - case EOZ: { - return TK_EOS; - } - default: { - if (lislalpha(ls->current)) { /* identifier or reserved word? */ - TString *ts; - do { - save_and_next(ls); - } while (lislalnum(ls->current)); - ts = luaX_newstring(ls, luaZ_buffer(ls->buff), - luaZ_bufflen(ls->buff)); - seminfo->ts = ts; - if (isreserved(ts)) /* reserved word? */ - return ts->tsv.extra - 1 + FIRST_RESERVED; - else { - return TK_NAME; - } - } - else { /* single-char tokens (+ - / ...) */ - int c = ls->current; - next(ls); - return c; - } - } - } - } -} - - -void luaX_next (LexState *ls) { - ls->lastline = ls->linenumber; - if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? 
*/ - ls->t = ls->lookahead; /* use this one */ - ls->lookahead.token = TK_EOS; /* and discharge it */ - } - else - ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */ -} - - -int luaX_lookahead (LexState *ls) { - lua_assert(ls->lookahead.token == TK_EOS); - ls->lookahead.token = llex(ls, &ls->lookahead.seminfo); - return ls->lookahead.token; -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llimits.h +++ /dev/null @@ -1,308 +0,0 @@ -/* -** $Id: llimits.h,v 1.103.1.1 2013/04/12 18:48:47 roberto Exp $ -** Limits, basic types, and some other `installation-dependent' definitions -** See Copyright Notice in lua.h -*/ - -#ifndef llimits_h -#define llimits_h - - -#include - -#include "lua.h" - - -typedef unsigned LUA_INT32 lu_int32; - -typedef LUAI_UMEM lu_mem; - -typedef LUAI_MEM l_mem; - - - -/* chars used as small naturals (so that `char' is reserved for characters) */ -typedef unsigned char lu_byte; - - -#define MAX_SIZET ((size_t)(~(size_t)0)-2) - -#define MAX_LUMEM ((lu_mem)(~(lu_mem)0)-2) - -#define MAX_LMEM ((l_mem) ((MAX_LUMEM >> 1) - 2)) - - -#define MAX_INT (INT_MAX-2) /* maximum value of an int (-2 for safety) */ - -/* -** conversion of pointer to integer -** this is for hashing only; there is no problem if the integer -** cannot hold the whole pointer value -*/ -#define IntPoint(p) ((unsigned int)(lu_mem)(p)) - - - -/* type to ensure maximum alignment */ -#if !defined(LUAI_USER_ALIGNMENT_T) -#define LUAI_USER_ALIGNMENT_T union { double u; void *s; long l; } -#endif - -typedef LUAI_USER_ALIGNMENT_T L_Umaxalign; - - -/* result of a `usual argument conversion' over lua_Number */ -typedef LUAI_UACNUMBER l_uacNumber; - - -/* internal assertions for in-house debugging */ -#if defined(lua_assert) -#define check_exp(c,e) (lua_assert(c), (e)) -/* to avoid problems with conditions too long */ -#define lua_longassert(c) { if (!(c)) lua_assert(0); } -#else -#define lua_assert(c) ((void)0) -#define check_exp(c,e) (e) -#define lua_longassert(c) ((void)0) -#endif - -/* -** assertion for checking API calls -*/ -#if !defined(luai_apicheck) - -#if defined(LUA_USE_APICHECK) -#include -#define luai_apicheck(L,e) assert(e) -#else -#define luai_apicheck(L,e) lua_assert(e) -#endif - -#endif - -#define api_check(l,e,msg) luai_apicheck(l,(e) && msg) - - -#if !defined(UNUSED) -#define UNUSED(x) ((void)(x)) /* to avoid warnings */ -#endif - - -#define cast(t, exp) ((t)(exp)) - -#define cast_byte(i) cast(lu_byte, (i)) -#define cast_num(i) cast(lua_Number, (i)) -#define cast_int(i) cast(int, (i)) -#define cast_uchar(i) cast(unsigned char, (i)) - - -/* -** non-return type -*/ -#if defined(__GNUC__) -#define l_noret void __attribute__((noreturn)) -#elif defined(_MSC_VER) -#define l_noret void __declspec(noreturn) -#else -#define l_noret void -#endif - - - -/* -** maximum depth for nested C calls and syntactical nested non-terminals -** in a program. (Value must fit in an unsigned short int.) -** -** Note: On amd64 platform, the limit has been measured to be 45. We set -** the maximum lower to give a margin for changing the amount of stack -** used by various functions involved in parsing and executing code. -*/ -#if !defined(LUAI_MAXCCALLS) -#define LUAI_MAXCCALLS 20 -#endif - -/* -** maximum number of upvalues in a closure (both C and Lua). (Value -** must fit in an unsigned char.) 
-*/ -#define MAXUPVAL UCHAR_MAX - - -/* -** type for virtual-machine instructions -** must be an unsigned with (at least) 4 bytes (see details in lopcodes.h) -*/ -typedef lu_int32 Instruction; - - - -/* maximum stack for a Lua function */ -#define MAXSTACK 250 - - - -/* minimum size for the string table (must be power of 2) */ -#if !defined(MINSTRTABSIZE) -#define MINSTRTABSIZE 32 -#endif - - -/* minimum size for string buffer */ -#if !defined(LUA_MINBUFFER) -#define LUA_MINBUFFER 32 -#endif - - -#if !defined(lua_lock) -#define lua_lock(L) ((void) 0) -#define lua_unlock(L) ((void) 0) -#endif - -#if !defined(luai_threadyield) -#define luai_threadyield(L) {lua_unlock(L); lua_lock(L);} -#endif - - -/* -** these macros allow user-specific actions on threads when you defined -** LUAI_EXTRASPACE and need to do something extra when a thread is -** created/deleted/resumed/yielded. -*/ -#if !defined(luai_userstateopen) -#define luai_userstateopen(L) ((void)L) -#endif - -#if !defined(luai_userstateclose) -#define luai_userstateclose(L) ((void)L) -#endif - -#if !defined(luai_userstatethread) -#define luai_userstatethread(L,L1) ((void)L) -#endif - -#if !defined(luai_userstatefree) -#define luai_userstatefree(L,L1) ((void)L) -#endif - -#if !defined(luai_userstateresume) -#define luai_userstateresume(L,n) ((void)L) -#endif - -#if !defined(luai_userstateyield) -#define luai_userstateyield(L,n) ((void)L) -#endif - -/* -** lua_number2int is a macro to convert lua_Number to int. -** lua_number2integer is a macro to convert lua_Number to lua_Integer. -** lua_number2unsigned is a macro to convert a lua_Number to a lua_Unsigned. -** lua_unsigned2number is a macro to convert a lua_Unsigned to a lua_Number. -** luai_hashnum is a macro to hash a lua_Number value into an integer. -** The hash must be deterministic and give reasonable values for -** both small and large values (outside the range of integers). 
-*/ - -#if defined(MS_ASMTRICK) || defined(LUA_MSASMTRICK) /* { */ -/* trick with Microsoft assembler for X86 */ - -#define lua_number2int(i,n) __asm {__asm fld n __asm fistp i} -#define lua_number2integer(i,n) lua_number2int(i, n) -#define lua_number2unsigned(i,n) \ - {__int64 l; __asm {__asm fld n __asm fistp l} i = (unsigned int)l;} - - -#elif defined(LUA_IEEE754TRICK) /* }{ */ -/* the next trick should work on any machine using IEEE754 with - a 32-bit int type */ - -union luai_Cast { double l_d; LUA_INT32 l_p[2]; }; - -#if !defined(LUA_IEEEENDIAN) /* { */ -#define LUAI_EXTRAIEEE \ - static const union luai_Cast ieeeendian = {-(33.0 + 6755399441055744.0)}; -#define LUA_IEEEENDIANLOC (ieeeendian.l_p[1] == 33) -#else -#define LUA_IEEEENDIANLOC LUA_IEEEENDIAN -#define LUAI_EXTRAIEEE /* empty */ -#endif /* } */ - -#define lua_number2int32(i,n,t) \ - { LUAI_EXTRAIEEE \ - volatile union luai_Cast u; u.l_d = (n) + 6755399441055744.0; \ - (i) = (t)u.l_p[LUA_IEEEENDIANLOC]; } - -#define luai_hashnum(i,n) \ - { volatile union luai_Cast u; u.l_d = (n) + 1.0; /* avoid -0 */ \ - (i) = u.l_p[0]; (i) += u.l_p[1]; } /* add double bits for his hash */ - -#define lua_number2int(i,n) lua_number2int32(i, n, int) -#define lua_number2unsigned(i,n) lua_number2int32(i, n, lua_Unsigned) - -/* the trick can be expanded to lua_Integer when it is a 32-bit value */ -#if defined(LUA_IEEELL) -#define lua_number2integer(i,n) lua_number2int32(i, n, lua_Integer) -#endif - -#endif /* } */ - - -/* the following definitions always work, but may be slow */ - -#if !defined(lua_number2int) -#define lua_number2int(i,n) ((i)=(int)(n)) -#endif - -#if !defined(lua_number2integer) -#define lua_number2integer(i,n) ((i)=(lua_Integer)(n)) -#endif - -#if !defined(lua_number2unsigned) /* { */ -/* the following definition assures proper modulo behavior */ -#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_NUMBER_FLOAT) -#include -#define SUPUNSIGNED ((lua_Number)(~(lua_Unsigned)0) + 1) -#define lua_number2unsigned(i,n) \ - ((i)=(lua_Unsigned)((n) - floor((n)/SUPUNSIGNED)*SUPUNSIGNED)) -#else -#define lua_number2unsigned(i,n) ((i)=(lua_Unsigned)(n)) -#endif -#endif /* } */ - - -#if !defined(lua_unsigned2number) -/* on several machines, coercion from unsigned to double is slow, - so it may be worth to avoid */ -#define lua_unsigned2number(u) \ - (((u) <= (lua_Unsigned)INT_MAX) ? 
(lua_Number)(int)(u) : (lua_Number)(u)) -#endif - - - -#if defined(ltable_c) && !defined(luai_hashnum) - -extern int lcompat_hashnum(int64_t); - -#define luai_hashnum(i,n) (i = lcompat_hashnum(n)) - -#endif - - - -/* -** macro to control inclusion of some hard tests on stack reallocation -*/ -#if !defined(HARDSTACKTESTS) -#define condmovestack(L) ((void)0) -#else -/* realloc stack keeping its size */ -#define condmovestack(L) luaD_reallocstack((L), (L)->stacksize) -#endif - -#if !defined(HARDMEMTESTS) -#define condchangemem(L) condmovestack(L) -#else -#define condchangemem(L) \ - ((void)(!(G(L)->gcrunning) || (luaC_fullgc(L, 0), 1))) -#endif - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.h +++ /dev/null @@ -1,57 +0,0 @@ -/* -** $Id: lmem.h,v 1.40.1.1 2013/04/12 18:48:47 roberto Exp $ -** Interface to Memory Manager -** See Copyright Notice in lua.h -*/ - -#ifndef lmem_h -#define lmem_h - - -#include - -#include "llimits.h" -#include "lua.h" - - -/* -** This macro avoids the runtime division MAX_SIZET/(e), as 'e' is -** always constant. -** The macro is somewhat complex to avoid warnings: -** +1 avoids warnings of "comparison has constant result"; -** cast to 'void' avoids warnings of "value unused". -*/ -#define luaM_reallocv(L,b,on,n,e) \ - (cast(void, \ - (cast(size_t, (n)+1) > MAX_SIZET/(e)) ? (luaM_toobig(L), 0) : 0), \ - luaM_realloc_(L, (b), (on)*(e), (n)*(e))) - -#define luaM_freemem(L, b, s) luaM_realloc_(L, (b), (s), 0) -#define luaM_free(L, b) luaM_realloc_(L, (b), sizeof(*(b)), 0) -#define luaM_freearray(L, b, n) luaM_reallocv(L, (b), n, 0, sizeof((b)[0])) - -#define luaM_malloc(L,s) luaM_realloc_(L, NULL, 0, (s)) -#define luaM_new(L,t) cast(t *, luaM_malloc(L, sizeof(t))) -#define luaM_newvector(L,n,t) \ - cast(t *, luaM_reallocv(L, NULL, 0, n, sizeof(t))) - -#define luaM_newobject(L,tag,s) luaM_realloc_(L, NULL, tag, (s)) - -#define luaM_growvector(L,v,nelems,size,t,limit,e) \ - if ((nelems)+1 > (size)) \ - ((v)=cast(t *, luaM_growaux_(L,v,&(size),sizeof(t),limit,e))) - -#define luaM_reallocvector(L, v,oldn,n,t) \ - ((v)=cast(t *, luaM_reallocv(L, v, oldn, n, sizeof(t)))) - -LUAI_FUNC l_noret luaM_toobig (lua_State *L); - -/* not to be called directly */ -LUAI_FUNC void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize, - size_t size); -LUAI_FUNC void *luaM_growaux_ (lua_State *L, void *block, int *size, - size_t size_elem, int limit, - const char *what); - -#endif - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c +++ /dev/null @@ -1,99 +0,0 @@ -/* -** $Id: lmem.c,v 1.84.1.1 2013/04/12 18:48:47 roberto Exp $ -** Interface to Memory Manager -** See Copyright Notice in lua.h -*/ - - -#include - -#define lmem_c -#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" - - - -/* -** About the realloc function: -** void * frealloc (void *ud, void *ptr, size_t osize, size_t nsize); -** (`osize' is the old size, `nsize' is the new size) -** -** * frealloc(ud, NULL, x, s) creates a new block of size `s' (no -** matter 'x'). 
-** -** * frealloc(ud, p, x, 0) frees the block `p' -** (in this specific case, frealloc must return NULL); -** particularly, frealloc(ud, NULL, 0, 0) does nothing -** (which is equivalent to free(NULL) in ANSI C) -** -** frealloc returns NULL if it cannot create or reallocate the area -** (any reallocation to an equal or smaller size cannot fail!) -*/ - - - -#define MINSIZEARRAY 4 - - -void *luaM_growaux_ (lua_State *L, void *block, int *size, size_t size_elems, - int limit, const char *what) { - void *newblock; - int newsize; - if (*size >= limit/2) { /* cannot double it? */ - if (*size >= limit) /* cannot grow even a little? */ - luaG_runerror(L, "too many %s (limit is %d)", what, limit); - newsize = limit; /* still have at least one free place */ - } - else { - newsize = (*size)*2; - if (newsize < MINSIZEARRAY) - newsize = MINSIZEARRAY; /* minimum size */ - } - newblock = luaM_reallocv(L, block, *size, newsize, size_elems); - *size = newsize; /* update only when everything else is OK */ - return newblock; -} - - -l_noret luaM_toobig (lua_State *L) { - luaG_runerror(L, "memory allocation error: block too big"); -} - - - -/* -** generic allocation routine. -*/ -void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) { - void *newblock; - global_State *g = G(L); - size_t realosize = (block) ? osize : 0; - lua_assert((realosize == 0) == (block == NULL)); -#if defined(HARDMEMTESTS) - if (nsize > realosize && g->gcrunning) - luaC_fullgc(L, 1); /* force a GC whenever possible */ -#endif - newblock = (*g->frealloc)(g->ud, block, osize, nsize); - if (newblock == NULL && nsize > 0) { - api_check(L, nsize > realosize, - "realloc cannot fail when shrinking a block"); - if (g->gcrunning) { - luaC_fullgc(L, 1); /* try to free some memory... 
*/ - newblock = (*g->frealloc)(g->ud, block, osize, nsize); /* try again */ - } - if (newblock == NULL) - luaD_throw(L, LUA_ERRMEM); - } - lua_assert((nsize == 0) == (newblock == NULL)); - g->GCdebt = (g->GCdebt + nsize) - realosize; - return newblock; -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.h +++ /dev/null @@ -1,606 +0,0 @@ -/* -** $Id: lobject.h,v 2.71.1.2 2014/05/07 14:14:58 roberto Exp $ -** Type definitions for Lua objects -** See Copyright Notice in lua.h -*/ - - -#ifndef lobject_h -#define lobject_h - - -#include - -#include "llimits.h" -#include "lua.h" - - -/* -** Extra tags for non-values -*/ -#define LUA_TPROTO LUA_NUMTAGS -#define LUA_TUPVAL (LUA_NUMTAGS+1) -#define LUA_TDEADKEY (LUA_NUMTAGS+2) - -/* -** number of all possible tags (including LUA_TNONE but excluding DEADKEY) -*/ -#define LUA_TOTALTAGS (LUA_TUPVAL+2) - - -/* -** tags for Tagged Values have the following use of bits: -** bits 0-3: actual tag (a LUA_T* value) -** bits 4-5: variant bits -** bit 6: whether value is collectable -*/ - -#define VARBITS (3 << 4) - - -/* -** LUA_TFUNCTION variants: -** 0 - Lua function -** 1 - light C function -** 2 - regular C function (closure) -*/ - -/* Variant tags for functions */ -#define LUA_TLCL (LUA_TFUNCTION | (0 << 4)) /* Lua closure */ -#define LUA_TLCF (LUA_TFUNCTION | (1 << 4)) /* light C function */ -#define LUA_TCCL (LUA_TFUNCTION | (2 << 4)) /* C closure */ - - -/* Variant tags for strings */ -#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings */ -#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings */ - - -/* Bit mark for collectable types */ -#define BIT_ISCOLLECTABLE (1 << 6) - -/* mark a tag as collectable */ -#define ctb(t) ((t) | BIT_ISCOLLECTABLE) - - -/* -** Union of all collectable objects -*/ -typedef union GCObject GCObject; - - -/* -** Common Header for all collectable objects (in macro form, to be -** included in other objects) -*/ -#define CommonHeader GCObject *next; lu_byte tt; lu_byte marked - - -/* -** Common header in struct form -*/ -typedef struct GCheader { - CommonHeader; -} GCheader; - - - -/* -** Union of all Lua values -*/ -typedef union Value Value; - - -#define numfield lua_Number n; /* numbers */ - - - -/* -** Tagged Values. This is the basic representation of values in Lua, -** an actual value plus a tag with its type. 
-*/ - -#define TValuefields Value value_; int tt_ - -typedef struct lua_TValue TValue; - - -/* macro defining a nil value */ -#define NILCONSTANT {NULL}, LUA_TNIL - - -#define val_(o) ((o)->value_) -#define num_(o) (val_(o).n) - - -/* raw type tag of a TValue */ -#define rttype(o) ((o)->tt_) - -/* tag with no variants (bits 0-3) */ -#define novariant(x) ((x) & 0x0F) - -/* type tag of a TValue (bits 0-3 for tags + variant bits 4-5) */ -#define ttype(o) (rttype(o) & 0x3F) - -/* type tag of a TValue with no variants (bits 0-3) */ -#define ttypenv(o) (novariant(rttype(o))) - - -/* Macros to test type */ -#define checktag(o,t) (rttype(o) == (t)) -#define checktype(o,t) (ttypenv(o) == (t)) -#define ttisnumber(o) checktag((o), LUA_TNUMBER) -#define ttisnil(o) checktag((o), LUA_TNIL) -#define ttisboolean(o) checktag((o), LUA_TBOOLEAN) -#define ttislightuserdata(o) checktag((o), LUA_TLIGHTUSERDATA) -#define ttisstring(o) checktype((o), LUA_TSTRING) -#define ttisshrstring(o) checktag((o), ctb(LUA_TSHRSTR)) -#define ttislngstring(o) checktag((o), ctb(LUA_TLNGSTR)) -#define ttistable(o) checktag((o), ctb(LUA_TTABLE)) -#define ttisfunction(o) checktype(o, LUA_TFUNCTION) -#define ttisclosure(o) ((rttype(o) & 0x1F) == LUA_TFUNCTION) -#define ttisCclosure(o) checktag((o), ctb(LUA_TCCL)) -#define ttisLclosure(o) checktag((o), ctb(LUA_TLCL)) -#define ttislcf(o) checktag((o), LUA_TLCF) -#define ttisuserdata(o) checktag((o), ctb(LUA_TUSERDATA)) -#define ttisthread(o) checktag((o), ctb(LUA_TTHREAD)) -#define ttisdeadkey(o) checktag((o), LUA_TDEADKEY) - -#define ttisequal(o1,o2) (rttype(o1) == rttype(o2)) - -/* Macros to access values */ -#define nvalue(o) check_exp(ttisnumber(o), num_(o)) -#define gcvalue(o) check_exp(iscollectable(o), val_(o).gc) -#define pvalue(o) check_exp(ttislightuserdata(o), val_(o).p) -#define rawtsvalue(o) check_exp(ttisstring(o), &val_(o).gc->ts) -#define tsvalue(o) (&rawtsvalue(o)->tsv) -#define rawuvalue(o) check_exp(ttisuserdata(o), &val_(o).gc->u) -#define uvalue(o) (&rawuvalue(o)->uv) -#define clvalue(o) check_exp(ttisclosure(o), &val_(o).gc->cl) -#define clLvalue(o) check_exp(ttisLclosure(o), &val_(o).gc->cl.l) -#define clCvalue(o) check_exp(ttisCclosure(o), &val_(o).gc->cl.c) -#define fvalue(o) check_exp(ttislcf(o), val_(o).f) -#define hvalue(o) check_exp(ttistable(o), &val_(o).gc->h) -#define bvalue(o) check_exp(ttisboolean(o), val_(o).b) -#define thvalue(o) check_exp(ttisthread(o), &val_(o).gc->th) -/* a dead value may get the 'gc' field, but cannot access its contents */ -#define deadvalue(o) check_exp(ttisdeadkey(o), cast(void *, val_(o).gc)) - -#define l_isfalse(o) (ttisnil(o) || (ttisboolean(o) && bvalue(o) == 0)) - - -#define iscollectable(o) (rttype(o) & BIT_ISCOLLECTABLE) - - -/* Macros for internal tests */ -#define righttt(obj) (ttype(obj) == gcvalue(obj)->gch.tt) - -#define checkliveness(g,obj) \ - lua_longassert(!iscollectable(obj) || \ - (righttt(obj) && !isdead(g,gcvalue(obj)))) - - -/* Macros to set values */ -#define settt_(o,t) ((o)->tt_=(t)) - -#define setnvalue(obj,x) \ - { TValue *io=(obj); num_(io)=(x); settt_(io, LUA_TNUMBER); } - -#define setnilvalue(obj) settt_(obj, LUA_TNIL) - -#define setfvalue(obj,x) \ - { TValue *io=(obj); val_(io).f=(x); settt_(io, LUA_TLCF); } - -#define setpvalue(obj,x) \ - { TValue *io=(obj); val_(io).p=(x); settt_(io, LUA_TLIGHTUSERDATA); } - -#define setbvalue(obj,x) \ - { TValue *io=(obj); val_(io).b=(x); settt_(io, LUA_TBOOLEAN); } - -#define setgcovalue(L,obj,x) \ - { TValue *io=(obj); GCObject *i_g=(x); \ - 
val_(io).gc=i_g; settt_(io, ctb(gch(i_g)->tt)); } - -#define setsvalue(L,obj,x) \ - { TValue *io=(obj); \ - TString *x_ = (x); \ - val_(io).gc=cast(GCObject *, x_); settt_(io, ctb(x_->tsv.tt)); \ - checkliveness(G(L),io); } - -#define setuvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TUSERDATA)); \ - checkliveness(G(L),io); } - -#define setthvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTHREAD)); \ - checkliveness(G(L),io); } - -#define setclLvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TLCL)); \ - checkliveness(G(L),io); } - -#define setclCvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TCCL)); \ - checkliveness(G(L),io); } - -#define sethvalue(L,obj,x) \ - { TValue *io=(obj); \ - val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTABLE)); \ - checkliveness(G(L),io); } - -#define setdeadvalue(obj) settt_(obj, LUA_TDEADKEY) - - - -#define setobj(L,obj1,obj2) \ - { const TValue *io2=(obj2); TValue *io1=(obj1); \ - io1->value_ = io2->value_; io1->tt_ = io2->tt_; \ - checkliveness(G(L),io1); } - - -/* -** different types of assignments, according to destination -*/ - -/* from stack to (same) stack */ -#define setobjs2s setobj -/* to stack (not from same stack) */ -#define setobj2s setobj -#define setsvalue2s setsvalue -#define sethvalue2s sethvalue -#define setptvalue2s setptvalue -/* from table to same table */ -#define setobjt2t setobj -/* to table */ -#define setobj2t setobj -/* to new object */ -#define setobj2n setobj -#define setsvalue2n setsvalue - - -/* check whether a number is valid (useful only for NaN trick) */ -#define luai_checknum(L,o,c) { /* empty */ } - - -/* -** {====================================================== -** NaN Trick -** ======================================================= -*/ -#if defined(LUA_NANTRICK) - -/* -** numbers are represented in the 'd_' field. All other values have the -** value (NNMARK | tag) in 'tt__'. 
A number with such pattern would be -** a "signaled NaN", which is never generated by regular operations by -** the CPU (nor by 'strtod') -*/ - -/* allows for external implementation for part of the trick */ -#if !defined(NNMARK) /* { */ - - -#if !defined(LUA_IEEEENDIAN) -#error option 'LUA_NANTRICK' needs 'LUA_IEEEENDIAN' -#endif - - -#define NNMARK 0x7FF7A500 -#define NNMASK 0x7FFFFF00 - -#undef TValuefields -#undef NILCONSTANT - -#if (LUA_IEEEENDIAN == 0) /* { */ - -/* little endian */ -#define TValuefields \ - union { struct { Value v__; int tt__; } i; double d__; } u -#define NILCONSTANT {{{NULL}, tag2tt(LUA_TNIL)}} -/* field-access macros */ -#define v_(o) ((o)->u.i.v__) -#define d_(o) ((o)->u.d__) -#define tt_(o) ((o)->u.i.tt__) - -#else /* }{ */ - -/* big endian */ -#define TValuefields \ - union { struct { int tt__; Value v__; } i; double d__; } u -#define NILCONSTANT {{tag2tt(LUA_TNIL), {NULL}}} -/* field-access macros */ -#define v_(o) ((o)->u.i.v__) -#define d_(o) ((o)->u.d__) -#define tt_(o) ((o)->u.i.tt__) - -#endif /* } */ - -#endif /* } */ - - -/* correspondence with standard representation */ -#undef val_ -#define val_(o) v_(o) -#undef num_ -#define num_(o) d_(o) - - -#undef numfield -#define numfield /* no such field; numbers are the entire struct */ - -/* basic check to distinguish numbers from non-numbers */ -#undef ttisnumber -#define ttisnumber(o) ((tt_(o) & NNMASK) != NNMARK) - -#define tag2tt(t) (NNMARK | (t)) - -#undef rttype -#define rttype(o) (ttisnumber(o) ? LUA_TNUMBER : tt_(o) & 0xff) - -#undef settt_ -#define settt_(o,t) (tt_(o) = tag2tt(t)) - -#undef setnvalue -#define setnvalue(obj,x) \ - { TValue *io_=(obj); num_(io_)=(x); lua_assert(ttisnumber(io_)); } - -#undef setobj -#define setobj(L,obj1,obj2) \ - { const TValue *o2_=(obj2); TValue *o1_=(obj1); \ - o1_->u = o2_->u; \ - checkliveness(G(L),o1_); } - - -/* -** these redefinitions are not mandatory, but these forms are more efficient -*/ - -#undef checktag -#undef checktype -#define checktag(o,t) (tt_(o) == tag2tt(t)) -#define checktype(o,t) (ctb(tt_(o) | VARBITS) == ctb(tag2tt(t) | VARBITS)) - -#undef ttisequal -#define ttisequal(o1,o2) \ - (ttisnumber(o1) ? 
ttisnumber(o2) : (tt_(o1) == tt_(o2))) - - -#undef luai_checknum -#define luai_checknum(L,o,c) { if (!ttisnumber(o)) c; } - -#endif -/* }====================================================== */ - - - -/* -** {====================================================== -** types and prototypes -** ======================================================= -*/ - - -union Value { - GCObject *gc; /* collectable objects */ - void *p; /* light userdata */ - int b; /* booleans */ - lua_CFunction f; /* light C functions */ - numfield /* numbers */ -}; - - -struct lua_TValue { - TValuefields; -}; - - -typedef TValue *StkId; /* index to stack elements */ - - - - -/* -** Header for string value; string bytes follow the end of this structure -*/ -typedef union TString { - L_Umaxalign dummy; /* ensures maximum alignment for strings */ - struct { - CommonHeader; - lu_byte extra; /* reserved words for short strings; "has hash" for longs */ - unsigned int hash; - size_t len; /* number of characters in string */ - } tsv; -} TString; - - -/* get the actual string (array of bytes) from a TString */ -#define getstr(ts) cast(const char *, (ts) + 1) - -/* get the actual string (array of bytes) from a Lua value */ -#define svalue(o) getstr(rawtsvalue(o)) - - -/* -** Header for userdata; memory area follows the end of this structure -*/ -typedef union Udata { - L_Umaxalign dummy; /* ensures maximum alignment for `local' udata */ - struct { - CommonHeader; - struct Table *metatable; - struct Table *env; - size_t len; /* number of bytes */ - } uv; -} Udata; - - - -/* -** Description of an upvalue for function prototypes -*/ -typedef struct Upvaldesc { - TString *name; /* upvalue name (for debug information) */ - lu_byte instack; /* whether it is in stack */ - lu_byte idx; /* index of upvalue (in stack or in outer function's list) */ -} Upvaldesc; - - -/* -** Description of a local variable for function prototypes -** (used for debug information) -*/ -typedef struct LocVar { - TString *varname; - int startpc; /* first point where variable is active */ - int endpc; /* first point where variable is dead */ -} LocVar; - - -/* -** Function Prototypes -*/ -typedef struct Proto { - CommonHeader; - TValue *k; /* constants used by the function */ - Instruction *code; - struct Proto **p; /* functions defined inside the function */ - int *lineinfo; /* map from opcodes to source lines (debug information) */ - LocVar *locvars; /* information about local variables (debug information) */ - Upvaldesc *upvalues; /* upvalue information */ - union Closure *cache; /* last created closure with this prototype */ - TString *source; /* used for debug information */ - int sizeupvalues; /* size of 'upvalues' */ - int sizek; /* size of `k' */ - int sizecode; - int sizelineinfo; - int sizep; /* size of `p' */ - int sizelocvars; - int linedefined; - int lastlinedefined; - GCObject *gclist; - lu_byte numparams; /* number of fixed parameters */ - lu_byte is_vararg; - lu_byte maxstacksize; /* maximum stack used by this function */ -} Proto; - - - -/* -** Lua Upvalues -*/ -typedef struct UpVal { - CommonHeader; - TValue *v; /* points to stack or to its own value */ - union { - TValue value; /* the value (when closed) */ - struct { /* double linked list (when open) */ - struct UpVal *prev; - struct UpVal *next; - } l; - } u; -} UpVal; - - -/* -** Closures -*/ - -#define ClosureHeader \ - CommonHeader; lu_byte nupvalues; GCObject *gclist - -typedef struct CClosure { - ClosureHeader; - lua_CFunction f; - TValue upvalue[1]; /* list of upvalues */ -} 
CClosure; - - -typedef struct LClosure { - ClosureHeader; - struct Proto *p; - UpVal *upvals[1]; /* list of upvalues */ -} LClosure; - - -typedef union Closure { - CClosure c; - LClosure l; -} Closure; - - -#define isLfunction(o) ttisLclosure(o) - -#define getproto(o) (clLvalue(o)->p) - - -/* -** Tables -*/ - -typedef union TKey { - struct { - TValuefields; - struct Node *next; /* for chaining */ - } nk; - TValue tvk; -} TKey; - - -typedef struct Node { - TValue i_val; - TKey i_key; -} Node; - - -typedef struct Table { - CommonHeader; - lu_byte flags; /* 1<
<p means tagmethod(p) is not present */ - lu_byte lsizenode; /* log2 of size of `node' array */ - struct Table *metatable; - TValue *array; /* array part */ - Node *node; - Node *lastfree; /* any free position is before this position */ - GCObject *gclist; - int sizearray; /* size of `array' array */ -} Table; - - -/* -** `module' operation for hashing (size is always a power of 2) -*/ -#define lmod(s,size) \ - (check_exp((size&(size-1))==0, (cast(int, (s) & ((size)-1))))) - - -#define twoto(x) (1<<(x)) -#define sizenode(t) (twoto((t)->
lsizenode)) - - -/* -** (address of) a fixed nil value -*/ -#define luaO_nilobject (&luaO_nilobject_) - - -LUAI_DDEC const TValue luaO_nilobject_; - - -LUAI_FUNC int luaO_int2fb (unsigned int x); -LUAI_FUNC int luaO_fb2int (int x); -LUAI_FUNC int luaO_ceillog2 (unsigned int x); -LUAI_FUNC lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2); -LUAI_FUNC int luaO_str2d (const char *s, size_t len, lua_Number *result); -LUAI_FUNC int luaO_hexavalue (int c); -LUAI_FUNC const char *luaO_pushvfstring (lua_State *L, const char *fmt, - va_list argp); -LUAI_FUNC const char *luaO_pushfstring (lua_State *L, const char *fmt, ...); -LUAI_FUNC void luaO_chunkid (char *out, const char *source, size_t len); - - -#endif - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c +++ /dev/null @@ -1,283 +0,0 @@ -/* -** $Id: lobject.c,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $ -** Some generic functions over Lua objects -** See Copyright Notice in lua.h -*/ - -#include - -#define lobject_c -#define LUA_CORE - -#include "lua.h" - -#include "lctype.h" -#include "ldebug.h" -#include "ldo.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "lvm.h" - - - -LUAI_DDEF const TValue luaO_nilobject_ = {NILCONSTANT}; - - -/* -** converts an integer to a "floating point byte", represented as -** (eeeeexxx), where the real value is (1xxx) * 2^(eeeee - 1) if -** eeeee != 0 and (xxx) otherwise. -*/ -int luaO_int2fb (unsigned int x) { - int e = 0; /* exponent */ - if (x < 8) return x; - while (x >= 0x10) { - x = (x+1) >> 1; - e++; - } - return ((e+1) << 3) | (cast_int(x) - 8); -} - - -/* converts back */ -int luaO_fb2int (int x) { - int e = (x >> 3) & 0x1f; - if (e == 0) return x; - else return ((x & 7) + 8) << (e - 1); -} - - -int luaO_ceillog2 (unsigned int x) { - static const lu_byte log_2[256] = { - 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 - }; - int l = 0; - x--; - while (x >= 256) { l += 8; x >>= 8; } - return l + log_2[x]; -} - - -lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2) { - switch (op) { - case LUA_OPADD: return luai_numadd(NULL, v1, v2); - case LUA_OPSUB: return luai_numsub(NULL, v1, v2); - case LUA_OPMUL: return luai_nummul(NULL, v1, v2); - case LUA_OPDIV: return luai_numdiv(NULL, v1, v2); - case LUA_OPMOD: return luai_nummod(NULL, v1, v2); - case LUA_OPPOW: return luai_numpow(NULL, v1, v2); - case LUA_OPUNM: return luai_numunm(NULL, v1); - default: lua_assert(0); return 0; - } -} - - -int luaO_hexavalue (int c) { - if (lisdigit(c)) return c - '0'; - else return ltolower(c) - 'a' + 10; -} - - -#if !defined(lua_strx2number) - - - -static int isneg (const char **s) { - if (**s == '-') { (*s)++; return 1; } - else if (**s == '+') (*s)++; - return 0; -} - - -static lua_Number readhexa (const char **s, lua_Number r, int *count) { - for (; lisxdigit(cast_uchar(**s)); (*s)++) { /* read integer part */ - r = (r * 
cast_num(16.0)) + cast_num(luaO_hexavalue(cast_uchar(**s))); - (*count)++; - } - return r; -} - - -/* -** convert an hexadecimal numeric string to a number, following -** C99 specification for 'strtod' -*/ -static lua_Number lua_strx2number (const char *s, char **endptr) { - lua_Number r = 0.0; - int e = 0, i = 0; - int neg = 0; /* 1 if number is negative */ - *endptr = cast(char *, s); /* nothing is valid yet */ - while (lisspace(cast_uchar(*s))) s++; /* skip initial spaces */ - neg = isneg(&s); /* check signal */ - if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X'))) /* check '0x' */ - return 0.0; /* invalid format (no '0x') */ - s += 2; /* skip '0x' */ - r = readhexa(&s, r, &i); /* read integer part */ - if (*s == '.') { - s++; /* skip dot */ - r = readhexa(&s, r, &e); /* read fractional part */ - } - if (i == 0 && e == 0) - return 0.0; /* invalid format (no digit) */ - e *= -4; /* each fractional digit divides value by 2^-4 */ - *endptr = cast(char *, s); /* valid up to here */ - if (*s == 'p' || *s == 'P') { /* exponent part? */ - int exp1 = 0; - int neg1; - s++; /* skip 'p' */ - neg1 = isneg(&s); /* signal */ - if (!lisdigit(cast_uchar(*s))) - goto ret; /* must have at least one digit */ - while (lisdigit(cast_uchar(*s))) /* read exponent */ - exp1 = exp1 * 10 + *(s++) - '0'; - if (neg1) exp1 = -exp1; - e += exp1; - } - *endptr = cast(char *, s); /* valid up to here */ - ret: - if (neg) r = -r; - return (r * (1 << e)); -} - -#endif - - -int luaO_str2d (const char *s, size_t len, lua_Number *result) { - char *endptr; - if (strpbrk(s, "nN")) /* reject 'inf' and 'nan' */ - return 0; - else if (strpbrk(s, "xX")) /* hexa? */ - *result = lua_strx2number(s, &endptr); - else - *result = lua_str2number(s, &endptr); - if (endptr == s) return 0; /* nothing recognized */ - while (lisspace(cast_uchar(*endptr))) endptr++; - return (endptr == s + len); /* OK if no trailing characters */ -} - - - -static void pushstr (lua_State *L, const char *str, size_t l) { - setsvalue2s(L, L->top++, luaS_newlstr(L, str, l)); -} - - -/* this function handles only `%d', `%c', %f, %p, and `%s' formats */ -const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) { - int n = 0; - for (;;) { - const char *e = strchr(fmt, '%'); - if (e == NULL) break; - luaD_checkstack(L, 2); /* fmt + item */ - pushstr(L, fmt, e - fmt); - switch (*(e+1)) { - case 's': { - const char *s = va_arg(argp, char *); - if (s == NULL) s = "(null)"; - pushstr(L, s, strlen(s)); - break; - } - case 'c': { - char buff; - buff = cast(char, va_arg(argp, int)); - pushstr(L, &buff, 1); - break; - } - case 'd': { - setnvalue(L->top++, cast_num(va_arg(argp, int))); - break; - } - case 'f': { - setnvalue(L->top++, cast_num(va_arg(argp, l_uacNumber))); - break; - } - case 'p': { - char buff[4*sizeof(void *) + 8]; /* should be enough space for a `%p' */ - int l = lcompat_sprintf(buff, "%p", va_arg(argp, void *)); - pushstr(L, buff, l); - break; - } - case '%': { - pushstr(L, "%", 1); - break; - } - default: { - luaG_runerror(L, - "invalid option " LUA_QL("%%%c") " to " LUA_QL("lua_pushfstring"), - *(e + 1)); - } - } - n += 2; - fmt = e+2; - } - luaD_checkstack(L, 1); - pushstr(L, fmt, strlen(fmt)); - if (n > 0) luaV_concat(L, n + 1); - return svalue(L->top - 1); -} - - -const char *luaO_pushfstring (lua_State *L, const char *fmt, ...) 
{ - const char *msg; - va_list argp; - va_start(argp, fmt); - msg = luaO_pushvfstring(L, fmt, argp); - va_end(argp); - return msg; -} - - -/* number of chars of a literal string without the ending \0 */ -#define LL(x) (sizeof(x)/sizeof(char) - 1) - -#define RETS "..." -#define PRE "[string \"" -#define POS "\"]" - -#define addstr(a,b,l) ( memcpy(a,b,(l) * sizeof(char)), a += (l) ) - -void luaO_chunkid (char *out, const char *source, size_t bufflen) { - size_t l = strlen(source); - if (*source == '=') { /* 'literal' source */ - if (l <= bufflen) /* small enough? */ - memcpy(out, source + 1, l * sizeof(char)); - else { /* truncate it */ - addstr(out, source + 1, bufflen - 1); - *out = '\0'; - } - } - else if (*source == '@') { /* file name */ - if (l <= bufflen) /* small enough? */ - memcpy(out, source + 1, l * sizeof(char)); - else { /* add '...' before rest of name */ - addstr(out, RETS, LL(RETS)); - bufflen -= LL(RETS); - memcpy(out, source + 1 + l - bufflen, bufflen * sizeof(char)); - } - } - else { /* string; format as [string "source"] */ - const char *nl = strchr(source, '\n'); /* find first new line (if any) */ - addstr(out, PRE, LL(PRE)); /* add prefix */ - bufflen -= LL(PRE RETS POS) + 1; /* save space for prefix+suffix+'\0' */ - if (l < bufflen && nl == NULL) { /* small one-line source? */ - addstr(out, source, l); /* keep it */ - } - else { - if (nl != NULL) l = nl - source; /* stop at first newline */ - if (l > bufflen) l = bufflen; - addstr(out, source, l); - addstr(out, RETS, LL(RETS)); - } - memcpy(out, POS, (LL(POS) + 1) * sizeof(char)); - } -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.h +++ /dev/null @@ -1,288 +0,0 @@ -/* -** $Id: lopcodes.h,v 1.142.1.2 2014/10/20 18:32:09 roberto Exp $ -** Opcodes for Lua virtual machine -** See Copyright Notice in lua.h -*/ - -#ifndef lopcodes_h -#define lopcodes_h - -#include "llimits.h" - - -/*=========================================================================== - We assume that instructions are unsigned numbers. - All instructions have an opcode in the first 6 bits. - Instructions can have the following fields: - `A' : 8 bits - `B' : 9 bits - `C' : 9 bits - 'Ax' : 26 bits ('A', 'B', and 'C' together) - `Bx' : 18 bits (`B' and `C' together) - `sBx' : signed Bx - - A signed argument is represented in excess K; that is, the number - value is the unsigned value minus K. K is exactly the maximum value - for that argument (so that -max is represented by 0, and +max is - represented by 2*max), which is half the maximum for the corresponding - unsigned argument. -===========================================================================*/ - - -enum OpMode {iABC, iABx, iAsBx, iAx}; /* basic instruction format */ - - -/* -** size and position of opcode arguments. -*/ -#define SIZE_C 9 -#define SIZE_B 9 -#define SIZE_Bx (SIZE_C + SIZE_B) -#define SIZE_A 8 -#define SIZE_Ax (SIZE_C + SIZE_B + SIZE_A) - -#define SIZE_OP 6 - -#define POS_OP 0 -#define POS_A (POS_OP + SIZE_OP) -#define POS_C (POS_A + SIZE_A) -#define POS_B (POS_C + SIZE_C) -#define POS_Bx POS_C -#define POS_Ax POS_A - - -/* -** limits for opcode arguments. 
-** we use (signed) int to manipulate most arguments, -** so they must fit in LUAI_BITSINT-1 bits (-1 for sign) -*/ -#if SIZE_Bx < LUAI_BITSINT-1 -#define MAXARG_Bx ((1<>1) /* `sBx' is signed */ -#else -#define MAXARG_Bx MAX_INT -#define MAXARG_sBx MAX_INT -#endif - -#if SIZE_Ax < LUAI_BITSINT-1 -#define MAXARG_Ax ((1<>POS_OP) & MASK1(SIZE_OP,0))) -#define SET_OPCODE(i,o) ((i) = (((i)&MASK0(SIZE_OP,POS_OP)) | \ - ((cast(Instruction, o)<>pos) & MASK1(size,0))) -#define setarg(i,v,pos,size) ((i) = (((i)&MASK0(size,pos)) | \ - ((cast(Instruction, v)<= R(A - 1) */ -OP_EQ,/* A B C if ((RK(B) == RK(C)) ~= A) then pc++ */ -OP_LT,/* A B C if ((RK(B) < RK(C)) ~= A) then pc++ */ -OP_LE,/* A B C if ((RK(B) <= RK(C)) ~= A) then pc++ */ - -OP_TEST,/* A C if not (R(A) <=> C) then pc++ */ -OP_TESTSET,/* A B C if (R(B) <=> C) then R(A) := R(B) else pc++ */ - -OP_CALL,/* A B C R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */ -OP_TAILCALL,/* A B C return R(A)(R(A+1), ... ,R(A+B-1)) */ -OP_RETURN,/* A B return R(A), ... ,R(A+B-2) (see note) */ - -OP_FORLOOP,/* A sBx R(A)+=R(A+2); - if R(A) > 4) & 3)) -#define getCMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 2) & 3)) -#define testAMode(m) (luaP_opmodes[m] & (1 << 6)) -#define testTMode(m) (luaP_opmodes[m] & (1 << 7)) - - -LUAI_DDEC const char *const luaP_opnames[NUM_OPCODES+1]; /* opcode names */ - - -/* number of list items to accumulate before a SETLIST instruction */ -#define LFIELDS_PER_FLUSH 50 - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c +++ /dev/null @@ -1,107 +0,0 @@ -/* -** $Id: lopcodes.c,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $ -** Opcodes for Lua virtual machine -** See Copyright Notice in lua.h -*/ - - -#define lopcodes_c -#define LUA_CORE - - -#include "lopcodes.h" - - -/* ORDER OP */ - -LUAI_DDEF const char *const luaP_opnames[NUM_OPCODES+1] = { - "MOVE", - "LOADK", - "LOADKX", - "LOADBOOL", - "LOADNIL", - "GETUPVAL", - "GETTABUP", - "GETTABLE", - "SETTABUP", - "SETUPVAL", - "SETTABLE", - "NEWTABLE", - "SELF", - "ADD", - "SUB", - "MUL", - "DIV", - "MOD", - "POW", - "UNM", - "NOT", - "LEN", - "CONCAT", - "JMP", - "EQ", - "LT", - "LE", - "TEST", - "TESTSET", - "CALL", - "TAILCALL", - "RETURN", - "FORLOOP", - "FORPREP", - "TFORCALL", - "TFORLOOP", - "SETLIST", - "CLOSURE", - "VARARG", - "EXTRAARG", - NULL -}; - - -#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m)) - -LUAI_DDEF const lu_byte luaP_opmodes[NUM_OPCODES] = { -/* T A B C mode opcode */ - opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_MOVE */ - ,opmode(0, 1, OpArgK, OpArgN, iABx) /* OP_LOADK */ - ,opmode(0, 1, OpArgN, OpArgN, iABx) /* OP_LOADKX */ - ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_LOADBOOL */ - ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_LOADNIL */ - ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_GETUPVAL */ - ,opmode(0, 1, OpArgU, OpArgK, iABC) /* OP_GETTABUP */ - ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_GETTABLE */ - ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABUP */ - ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_SETUPVAL */ - ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABLE */ - ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_NEWTABLE */ - ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_SELF */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_ADD */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_SUB */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MUL */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_DIV 
*/ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MOD */ - ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_POW */ - ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_UNM */ - ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_NOT */ - ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_LEN */ - ,opmode(0, 1, OpArgR, OpArgR, iABC) /* OP_CONCAT */ - ,opmode(0, 0, OpArgR, OpArgN, iAsBx) /* OP_JMP */ - ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_EQ */ - ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LT */ - ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LE */ - ,opmode(1, 0, OpArgN, OpArgU, iABC) /* OP_TEST */ - ,opmode(1, 1, OpArgR, OpArgU, iABC) /* OP_TESTSET */ - ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_CALL */ - ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_TAILCALL */ - ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_RETURN */ - ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORLOOP */ - ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORPREP */ - ,opmode(0, 0, OpArgN, OpArgU, iABC) /* OP_TFORCALL */ - ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_TFORLOOP */ - ,opmode(0, 0, OpArgU, OpArgU, iABC) /* OP_SETLIST */ - ,opmode(0, 1, OpArgU, OpArgN, iABx) /* OP_CLOSURE */ - ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_VARARG */ - ,opmode(0, 0, OpArgU, OpArgU, iAx) /* OP_EXTRAARG */ -}; - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.h +++ /dev/null @@ -1,119 +0,0 @@ -/* -** $Id: lparser.h,v 1.70.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua Parser -** See Copyright Notice in lua.h -*/ - -#ifndef lparser_h -#define lparser_h - -#include "llimits.h" -#include "lobject.h" -#include "lzio.h" - - -/* -** Expression descriptor -*/ - -typedef enum { - VVOID, /* no value */ - VNIL, - VTRUE, - VFALSE, - VK, /* info = index of constant in `k' */ - VKNUM, /* nval = numerical value */ - VNONRELOC, /* info = result register */ - VLOCAL, /* info = local register */ - VUPVAL, /* info = index of upvalue in 'upvalues' */ - VINDEXED, /* t = table register/upvalue; idx = index R/K */ - VJMP, /* info = instruction pc */ - VRELOCABLE, /* info = instruction pc */ - VCALL, /* info = instruction pc */ - VVARARG /* info = instruction pc */ -} expkind; - - -#define vkisvar(k) (VLOCAL <= (k) && (k) <= VINDEXED) -#define vkisinreg(k) ((k) == VNONRELOC || (k) == VLOCAL) - -typedef struct expdesc { - expkind k; - union { - struct { /* for indexed variables (VINDEXED) */ - short idx; /* index (R/K) */ - lu_byte t; /* table (register or upvalue) */ - lu_byte vt; /* whether 't' is register (VLOCAL) or upvalue (VUPVAL) */ - } ind; - int info; /* for generic use */ - lua_Number nval; /* for VKNUM */ - } u; - int t; /* patch list of `exit when true' */ - int f; /* patch list of `exit when false' */ -} expdesc; - - -/* description of active local variable */ -typedef struct Vardesc { - short idx; /* variable index in stack */ -} Vardesc; - - -/* description of pending goto statements and label statements */ -typedef struct Labeldesc { - TString *name; /* label identifier */ - int pc; /* position in code */ - int line; /* line where it appeared */ - lu_byte nactvar; /* local level where it appears in current block */ -} Labeldesc; - - -/* list of labels or gotos */ -typedef struct Labellist { - Labeldesc *arr; /* array */ - int n; /* number of entries in use */ - int size; /* array size */ -} Labellist; - - -/* dynamic structures used by the parser */ -typedef struct Dyndata { - struct { /* list of active local variables */ - Vardesc *arr; - int n; - 
int size; - } actvar; - Labellist gt; /* list of pending gotos */ - Labellist label; /* list of active labels */ -} Dyndata; - - -/* control of blocks */ -struct BlockCnt; /* defined in lparser.c */ - - -/* state needed to generate code for a given function */ -typedef struct FuncState { - Proto *f; /* current function header */ - Table *h; /* table to find (and reuse) elements in `k' */ - struct FuncState *prev; /* enclosing function */ - struct LexState *ls; /* lexical state */ - struct BlockCnt *bl; /* chain of current blocks */ - int pc; /* next position to code (equivalent to `ncode') */ - int lasttarget; /* 'label' of last 'jump label' */ - int jpc; /* list of pending jumps to `pc' */ - int nk; /* number of elements in `k' */ - int np; /* number of elements in `p' */ - int firstlocal; /* index of first local var (in Dyndata array) */ - short nlocvars; /* number of elements in 'f->locvars' */ - lu_byte nactvar; /* number of active local variables */ - lu_byte nups; /* number of upvalues */ - lu_byte freereg; /* first free register */ -} FuncState; - - -LUAI_FUNC Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff, - Dyndata *dyd, const char *name, int firstchar); - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c +++ /dev/null @@ -1,1637 +0,0 @@ -/* -** $Id: lparser.c,v 2.130.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua Parser -** See Copyright Notice in lua.h -*/ - -#include - -#define lparser_c -#define LUA_CORE - -#include "lua.h" - -#include "lcode.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "llex.h" -#include "lmem.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lparser.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" - - - -/* maximum number of local variables per function (must be smaller - than 250, due to the bytecode format) */ -#define MAXVARS 200 - - -#define hasmultret(k) ((k) == VCALL || (k) == VVARARG) - - - -/* -** nodes for block list (list of active blocks) -*/ -typedef struct BlockCnt { - struct BlockCnt *previous; /* chain */ - short firstlabel; /* index of first label in this block */ - short firstgoto; /* index of first pending goto in this block */ - lu_byte nactvar; /* # active locals outside the block */ - lu_byte upval; /* true if some variable in the block is an upvalue */ - lu_byte isloop; /* true if `block' is a loop */ -} BlockCnt; - - - -/* -** prototypes for recursive non-terminal functions -*/ -static void statement (LexState *ls); -static void expr (LexState *ls, expdesc *v); - - -static void anchor_token (LexState *ls) { - /* last token from outer function must be EOS */ - lua_assert(ls->fs != NULL || ls->t.token == TK_EOS); - if (ls->t.token == TK_NAME || ls->t.token == TK_STRING) { - TString *ts = ls->t.seminfo.ts; - luaX_newstring(ls, getstr(ts), ts->tsv.len); - } -} - - -/* semantic error */ -static l_noret semerror (LexState *ls, const char *msg) { - ls->t.token = 0; /* remove 'near to' from final message */ - luaX_syntaxerror(ls, msg); -} - - -static l_noret error_expected (LexState *ls, int token) { - luaX_syntaxerror(ls, - luaO_pushfstring(ls->L, "%s expected", luaX_token2str(ls, token))); -} - - -static l_noret errorlimit (FuncState *fs, int limit, const char *what) { - lua_State *L = fs->ls->L; - const char *msg; - int line = fs->f->linedefined; - const char *where = (line == 0) - ? 
"main function" - : luaO_pushfstring(L, "function at line %d", line); - msg = luaO_pushfstring(L, "too many %s (limit is %d) in %s", - what, limit, where); - luaX_syntaxerror(fs->ls, msg); -} - - -static void checklimit (FuncState *fs, int v, int l, const char *what) { - if (v > l) errorlimit(fs, l, what); -} - - -static int testnext (LexState *ls, int c) { - if (ls->t.token == c) { - luaX_next(ls); - return 1; - } - else return 0; -} - - -static void check (LexState *ls, int c) { - if (ls->t.token != c) - error_expected(ls, c); -} - - -static void checknext (LexState *ls, int c) { - check(ls, c); - luaX_next(ls); -} - - -#define check_condition(ls,c,msg) { if (!(c)) luaX_syntaxerror(ls, msg); } - - - -static void check_match (LexState *ls, int what, int who, int where) { - if (!testnext(ls, what)) { - if (where == ls->linenumber) - error_expected(ls, what); - else { - luaX_syntaxerror(ls, luaO_pushfstring(ls->L, - "%s expected (to close %s at line %d)", - luaX_token2str(ls, what), luaX_token2str(ls, who), where)); - } - } -} - - -static TString *str_checkname (LexState *ls) { - TString *ts; - check(ls, TK_NAME); - ts = ls->t.seminfo.ts; - luaX_next(ls); - return ts; -} - - -static void init_exp (expdesc *e, expkind k, int i) { - e->f = e->t = NO_JUMP; - e->k = k; - e->u.info = i; -} - - -static void codestring (LexState *ls, expdesc *e, TString *s) { - init_exp(e, VK, luaK_stringK(ls->fs, s)); -} - - -static void checkname (LexState *ls, expdesc *e) { - codestring(ls, e, str_checkname(ls)); -} - - -static int registerlocalvar (LexState *ls, TString *varname) { - FuncState *fs = ls->fs; - Proto *f = fs->f; - int oldsize = f->sizelocvars; - luaM_growvector(ls->L, f->locvars, fs->nlocvars, f->sizelocvars, - LocVar, SHRT_MAX, "local variables"); - while (oldsize < f->sizelocvars) f->locvars[oldsize++].varname = NULL; - f->locvars[fs->nlocvars].varname = varname; - luaC_objbarrier(ls->L, f, varname); - return fs->nlocvars++; -} - - -static void new_localvar (LexState *ls, TString *name) { - FuncState *fs = ls->fs; - Dyndata *dyd = ls->dyd; - int reg = registerlocalvar(ls, name); - checklimit(fs, dyd->actvar.n + 1 - fs->firstlocal, - MAXVARS, "local variables"); - luaM_growvector(ls->L, dyd->actvar.arr, dyd->actvar.n + 1, - dyd->actvar.size, Vardesc, MAX_INT, "local variables"); - dyd->actvar.arr[dyd->actvar.n++].idx = cast(short, reg); -} - - -static void new_localvarliteral_ (LexState *ls, const char *name, size_t sz) { - new_localvar(ls, luaX_newstring(ls, name, sz)); -} - -#define new_localvarliteral(ls,v) \ - new_localvarliteral_(ls, "" v, (sizeof(v)/sizeof(char))-1) - - -static LocVar *getlocvar (FuncState *fs, int i) { - int idx = fs->ls->dyd->actvar.arr[fs->firstlocal + i].idx; - lua_assert(idx < fs->nlocvars); - return &fs->f->locvars[idx]; -} - - -static void adjustlocalvars (LexState *ls, int nvars) { - FuncState *fs = ls->fs; - fs->nactvar = cast_byte(fs->nactvar + nvars); - for (; nvars; nvars--) { - getlocvar(fs, fs->nactvar - nvars)->startpc = fs->pc; - } -} - - -static void removevars (FuncState *fs, int tolevel) { - fs->ls->dyd->actvar.n -= (fs->nactvar - tolevel); - while (fs->nactvar > tolevel) - getlocvar(fs, --fs->nactvar)->endpc = fs->pc; -} - - -static int searchupvalue (FuncState *fs, TString *name) { - int i; - Upvaldesc *up = fs->f->upvalues; - for (i = 0; i < fs->nups; i++) { - if (luaS_eqstr(up[i].name, name)) return i; - } - return -1; /* not found */ -} - - -static int newupvalue (FuncState *fs, TString *name, expdesc *v) { - Proto *f = fs->f; - int oldsize = 
f->sizeupvalues; - checklimit(fs, fs->nups + 1, MAXUPVAL, "upvalues"); - luaM_growvector(fs->ls->L, f->upvalues, fs->nups, f->sizeupvalues, - Upvaldesc, MAXUPVAL, "upvalues"); - while (oldsize < f->sizeupvalues) f->upvalues[oldsize++].name = NULL; - f->upvalues[fs->nups].instack = (v->k == VLOCAL); - f->upvalues[fs->nups].idx = cast_byte(v->u.info); - f->upvalues[fs->nups].name = name; - luaC_objbarrier(fs->ls->L, f, name); - return fs->nups++; -} - - -static int searchvar (FuncState *fs, TString *n) { - int i; - for (i = cast_int(fs->nactvar) - 1; i >= 0; i--) { - if (luaS_eqstr(n, getlocvar(fs, i)->varname)) - return i; - } - return -1; /* not found */ -} - - -/* - Mark block where variable at given level was defined - (to emit close instructions later). -*/ -static void markupval (FuncState *fs, int level) { - BlockCnt *bl = fs->bl; - while (bl->nactvar > level) bl = bl->previous; - bl->upval = 1; -} - - -/* - Find variable with given name 'n'. If it is an upvalue, add this - upvalue into all intermediate functions. -*/ -static int singlevaraux (FuncState *fs, TString *n, expdesc *var, int base) { - if (fs == NULL) /* no more levels? */ - return VVOID; /* default is global */ - else { - int v = searchvar(fs, n); /* look up locals at current level */ - if (v >= 0) { /* found? */ - init_exp(var, VLOCAL, v); /* variable is local */ - if (!base) - markupval(fs, v); /* local will be used as an upval */ - return VLOCAL; - } - else { /* not found as local at current level; try upvalues */ - int idx = searchupvalue(fs, n); /* try existing upvalues */ - if (idx < 0) { /* not found? */ - if (singlevaraux(fs->prev, n, var, 0) == VVOID) /* try upper levels */ - return VVOID; /* not found; is a global */ - /* else was LOCAL or UPVAL */ - idx = newupvalue(fs, n, var); /* will be a new upvalue */ - } - init_exp(var, VUPVAL, idx); - return VUPVAL; - } - } -} - - -static void singlevar (LexState *ls, expdesc *var) { - TString *varname = str_checkname(ls); - FuncState *fs = ls->fs; - if (singlevaraux(fs, varname, var, 1) == VVOID) { /* global name? */ - expdesc key; - singlevaraux(fs, ls->envn, var, 1); /* get environment variable */ - lua_assert(var->k == VLOCAL || var->k == VUPVAL); - codestring(ls, &key, varname); /* key is variable name */ - luaK_indexed(fs, var, &key); /* env[varname] */ - } -} - - -static void adjust_assign (LexState *ls, int nvars, int nexps, expdesc *e) { - FuncState *fs = ls->fs; - int extra = nvars - nexps; - if (hasmultret(e->k)) { - extra++; /* includes call itself */ - if (extra < 0) extra = 0; - luaK_setreturns(fs, e, extra); /* last exp. 
provides the difference */ - if (extra > 1) luaK_reserveregs(fs, extra-1); - } - else { - if (e->k != VVOID) luaK_exp2nextreg(fs, e); /* close last expression */ - if (extra > 0) { - int reg = fs->freereg; - luaK_reserveregs(fs, extra); - luaK_nil(fs, reg, extra); - } - } -} - - -static void enterlevel (LexState *ls) { - lua_State *L = ls->L; - ++L->nCcalls; - checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels"); -} - - -#define leavelevel(ls) ((ls)->L->nCcalls--) - - -static void closegoto (LexState *ls, int g, Labeldesc *label) { - int i; - FuncState *fs = ls->fs; - Labellist *gl = &ls->dyd->gt; - Labeldesc *gt = &gl->arr[g]; - lua_assert(luaS_eqstr(gt->name, label->name)); - if (gt->nactvar < label->nactvar) { - TString *vname = getlocvar(fs, gt->nactvar)->varname; - const char *msg = luaO_pushfstring(ls->L, - " at line %d jumps into the scope of local " LUA_QS, - getstr(gt->name), gt->line, getstr(vname)); - semerror(ls, msg); - } - luaK_patchlist(fs, gt->pc, label->pc); - /* remove goto from pending list */ - for (i = g; i < gl->n - 1; i++) - gl->arr[i] = gl->arr[i + 1]; - gl->n--; -} - - -/* -** try to close a goto with existing labels; this solves backward jumps -*/ -static int findlabel (LexState *ls, int g) { - int i; - BlockCnt *bl = ls->fs->bl; - Dyndata *dyd = ls->dyd; - Labeldesc *gt = &dyd->gt.arr[g]; - /* check labels in current block for a match */ - for (i = bl->firstlabel; i < dyd->label.n; i++) { - Labeldesc *lb = &dyd->label.arr[i]; - if (luaS_eqstr(lb->name, gt->name)) { /* correct label? */ - if (gt->nactvar > lb->nactvar && - (bl->upval || dyd->label.n > bl->firstlabel)) - luaK_patchclose(ls->fs, gt->pc, lb->nactvar); - closegoto(ls, g, lb); /* close it */ - return 1; - } - } - return 0; /* label not found; cannot close goto */ -} - - -static int newlabelentry (LexState *ls, Labellist *l, TString *name, - int line, int pc) { - int n = l->n; - luaM_growvector(ls->L, l->arr, n, l->size, - Labeldesc, SHRT_MAX, "labels/gotos"); - l->arr[n].name = name; - l->arr[n].line = line; - l->arr[n].nactvar = ls->fs->nactvar; - l->arr[n].pc = pc; - l->n++; - return n; -} - - -/* -** check whether new label 'lb' matches any pending gotos in current -** block; solves forward jumps -*/ -static void findgotos (LexState *ls, Labeldesc *lb) { - Labellist *gl = &ls->dyd->gt; - int i = ls->fs->bl->firstgoto; - while (i < gl->n) { - if (luaS_eqstr(gl->arr[i].name, lb->name)) - closegoto(ls, i, lb); - else - i++; - } -} - - -/* -** "export" pending gotos to outer level, to check them against -** outer labels; if the block being exited has upvalues, and -** the goto exits the scope of any variable (which can be the -** upvalue), close those variables being exited. 
-*/ -static void movegotosout (FuncState *fs, BlockCnt *bl) { - int i = bl->firstgoto; - Labellist *gl = &fs->ls->dyd->gt; - /* correct pending gotos to current block and try to close it - with visible labels */ - while (i < gl->n) { - Labeldesc *gt = &gl->arr[i]; - if (gt->nactvar > bl->nactvar) { - if (bl->upval) - luaK_patchclose(fs, gt->pc, bl->nactvar); - gt->nactvar = bl->nactvar; - } - if (!findlabel(fs->ls, i)) - i++; /* move to next one */ - } -} - - -static void enterblock (FuncState *fs, BlockCnt *bl, lu_byte isloop) { - bl->isloop = isloop; - bl->nactvar = fs->nactvar; - bl->firstlabel = fs->ls->dyd->label.n; - bl->firstgoto = fs->ls->dyd->gt.n; - bl->upval = 0; - bl->previous = fs->bl; - fs->bl = bl; - lua_assert(fs->freereg == fs->nactvar); -} - - -/* -** create a label named "break" to resolve break statements -*/ -static void breaklabel (LexState *ls) { - TString *n = luaS_new(ls->L, "break"); - int l = newlabelentry(ls, &ls->dyd->label, n, 0, ls->fs->pc); - findgotos(ls, &ls->dyd->label.arr[l]); -} - -/* -** generates an error for an undefined 'goto'; choose appropriate -** message when label name is a reserved word (which can only be 'break') -*/ -static l_noret undefgoto (LexState *ls, Labeldesc *gt) { - const char *msg = isreserved(gt->name) - ? "<%s> at line %d not inside a loop" - : "no visible label " LUA_QS " for at line %d"; - msg = luaO_pushfstring(ls->L, msg, getstr(gt->name), gt->line); - semerror(ls, msg); -} - - -static void leaveblock (FuncState *fs) { - BlockCnt *bl = fs->bl; - LexState *ls = fs->ls; - if (bl->previous && bl->upval) { - /* create a 'jump to here' to close upvalues */ - int j = luaK_jump(fs); - luaK_patchclose(fs, j, bl->nactvar); - luaK_patchtohere(fs, j); - } - if (bl->isloop) - breaklabel(ls); /* close pending breaks */ - fs->bl = bl->previous; - removevars(fs, bl->nactvar); - lua_assert(bl->nactvar == fs->nactvar); - fs->freereg = fs->nactvar; /* free registers */ - ls->dyd->label.n = bl->firstlabel; /* remove local labels */ - if (bl->previous) /* inner block? */ - movegotosout(fs, bl); /* update pending gotos to outer block */ - else if (bl->firstgoto < ls->dyd->gt.n) /* pending gotos in outer block? */ - undefgoto(ls, &ls->dyd->gt.arr[bl->firstgoto]); /* error */ -} - - -/* -** adds a new prototype into list of prototypes -*/ -static Proto *addprototype (LexState *ls) { - Proto *clp; - lua_State *L = ls->L; - FuncState *fs = ls->fs; - Proto *f = fs->f; /* prototype of current function */ - if (fs->np >= f->sizep) { - int oldsize = f->sizep; - luaM_growvector(L, f->p, fs->np, f->sizep, Proto *, MAXARG_Bx, "functions"); - while (oldsize < f->sizep) f->p[oldsize++] = NULL; - } - f->p[fs->np++] = clp = luaF_newproto(L); - luaC_objbarrier(L, f, clp); - return clp; -} - - -/* -** codes instruction to create new closure in parent function. -** The OP_CLOSURE instruction must use the last available register, -** so that, if it invokes the GC, the GC knows which registers -** are in use at that time. 
-*/ -static void codeclosure (LexState *ls, expdesc *v) { - FuncState *fs = ls->fs->prev; - init_exp(v, VRELOCABLE, luaK_codeABx(fs, OP_CLOSURE, 0, fs->np - 1)); - luaK_exp2nextreg(fs, v); /* fix it at the last register */ -} - - -static void open_func (LexState *ls, FuncState *fs, BlockCnt *bl) { - lua_State *L = ls->L; - Proto *f; - fs->prev = ls->fs; /* linked list of funcstates */ - fs->ls = ls; - ls->fs = fs; - fs->pc = 0; - fs->lasttarget = 0; - fs->jpc = NO_JUMP; - fs->freereg = 0; - fs->nk = 0; - fs->np = 0; - fs->nups = 0; - fs->nlocvars = 0; - fs->nactvar = 0; - fs->firstlocal = ls->dyd->actvar.n; - fs->bl = NULL; - f = fs->f; - f->source = ls->source; - f->maxstacksize = 2; /* registers 0/1 are always valid */ - fs->h = luaH_new(L); - /* anchor table of constants (to avoid being collected) */ - sethvalue2s(L, L->top, fs->h); - incr_top(L); - enterblock(fs, bl, 0); -} - - -static void close_func (LexState *ls) { - lua_State *L = ls->L; - FuncState *fs = ls->fs; - Proto *f = fs->f; - luaK_ret(fs, 0, 0); /* final return */ - leaveblock(fs); - luaM_reallocvector(L, f->code, f->sizecode, fs->pc, Instruction); - f->sizecode = fs->pc; - luaM_reallocvector(L, f->lineinfo, f->sizelineinfo, fs->pc, int); - f->sizelineinfo = fs->pc; - luaM_reallocvector(L, f->k, f->sizek, fs->nk, TValue); - f->sizek = fs->nk; - luaM_reallocvector(L, f->p, f->sizep, fs->np, Proto *); - f->sizep = fs->np; - luaM_reallocvector(L, f->locvars, f->sizelocvars, fs->nlocvars, LocVar); - f->sizelocvars = fs->nlocvars; - luaM_reallocvector(L, f->upvalues, f->sizeupvalues, fs->nups, Upvaldesc); - f->sizeupvalues = fs->nups; - lua_assert(fs->bl == NULL); - ls->fs = fs->prev; - /* last token read was anchored in defunct function; must re-anchor it */ - anchor_token(ls); - L->top--; /* pop table of constants */ - luaC_checkGC(L); -} - - - -/*============================================================*/ -/* GRAMMAR RULES */ -/*============================================================*/ - - -/* -** check whether current token is in the follow set of a block. -** 'until' closes syntactical blocks, but do not close scope, -** so it handled in separate. -*/ -static int block_follow (LexState *ls, int withuntil) { - switch (ls->t.token) { - case TK_ELSE: case TK_ELSEIF: - case TK_END: case TK_EOS: - return 1; - case TK_UNTIL: return withuntil; - default: return 0; - } -} - - -static void statlist (LexState *ls) { - /* statlist -> { stat [`;'] } */ - while (!block_follow(ls, 1)) { - if (ls->t.token == TK_RETURN) { - statement(ls); - return; /* 'return' must be last statement */ - } - statement(ls); - } -} - - -static void fieldsel (LexState *ls, expdesc *v) { - /* fieldsel -> ['.' 
| ':'] NAME */ - FuncState *fs = ls->fs; - expdesc key; - luaK_exp2anyregup(fs, v); - luaX_next(ls); /* skip the dot or colon */ - checkname(ls, &key); - luaK_indexed(fs, v, &key); -} - - -static void yindex (LexState *ls, expdesc *v) { - /* index -> '[' expr ']' */ - luaX_next(ls); /* skip the '[' */ - expr(ls, v); - luaK_exp2val(ls->fs, v); - checknext(ls, ']'); -} - - -/* -** {====================================================================== -** Rules for Constructors -** ======================================================================= -*/ - - -struct ConsControl { - expdesc v; /* last list item read */ - expdesc *t; /* table descriptor */ - int nh; /* total number of `record' elements */ - int na; /* total number of array elements */ - int tostore; /* number of array elements pending to be stored */ -}; - - -static void recfield (LexState *ls, struct ConsControl *cc) { - /* recfield -> (NAME | `['exp1`]') = exp1 */ - FuncState *fs = ls->fs; - int reg = ls->fs->freereg; - expdesc key, val; - int rkkey; - if (ls->t.token == TK_NAME) { - checklimit(fs, cc->nh, MAX_INT, "items in a constructor"); - checkname(ls, &key); - } - else /* ls->t.token == '[' */ - yindex(ls, &key); - cc->nh++; - checknext(ls, '='); - rkkey = luaK_exp2RK(fs, &key); - expr(ls, &val); - luaK_codeABC(fs, OP_SETTABLE, cc->t->u.info, rkkey, luaK_exp2RK(fs, &val)); - fs->freereg = reg; /* free registers */ -} - - -static void closelistfield (FuncState *fs, struct ConsControl *cc) { - if (cc->v.k == VVOID) return; /* there is no list item */ - luaK_exp2nextreg(fs, &cc->v); - cc->v.k = VVOID; - if (cc->tostore == LFIELDS_PER_FLUSH) { - luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); /* flush */ - cc->tostore = 0; /* no more items pending */ - } -} - - -static void lastlistfield (FuncState *fs, struct ConsControl *cc) { - if (cc->tostore == 0) return; - if (hasmultret(cc->v.k)) { - luaK_setmultret(fs, &cc->v); - luaK_setlist(fs, cc->t->u.info, cc->na, LUA_MULTRET); - cc->na--; /* do not count last expression (unknown number of elements) */ - } - else { - if (cc->v.k != VVOID) - luaK_exp2nextreg(fs, &cc->v); - luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); - } -} - - -static void listfield (LexState *ls, struct ConsControl *cc) { - /* listfield -> exp */ - expr(ls, &cc->v); - checklimit(ls->fs, cc->na, MAX_INT, "items in a constructor"); - cc->na++; - cc->tostore++; -} - - -static void field (LexState *ls, struct ConsControl *cc) { - /* field -> listfield | recfield */ - switch(ls->t.token) { - case TK_NAME: { /* may be 'listfield' or 'recfield' */ - if (luaX_lookahead(ls) != '=') /* expression? 
*/ - listfield(ls, cc); - else - recfield(ls, cc); - break; - } - case '[': { - recfield(ls, cc); - break; - } - default: { - listfield(ls, cc); - break; - } - } -} - - -static void constructor (LexState *ls, expdesc *t) { - /* constructor -> '{' [ field { sep field } [sep] ] '}' - sep -> ',' | ';' */ - FuncState *fs = ls->fs; - int line = ls->linenumber; - int pc = luaK_codeABC(fs, OP_NEWTABLE, 0, 0, 0); - struct ConsControl cc; - cc.na = cc.nh = cc.tostore = 0; - cc.t = t; - init_exp(t, VRELOCABLE, pc); - init_exp(&cc.v, VVOID, 0); /* no value (yet) */ - luaK_exp2nextreg(ls->fs, t); /* fix it at stack top */ - checknext(ls, '{'); - do { - lua_assert(cc.v.k == VVOID || cc.tostore > 0); - if (ls->t.token == '}') break; - closelistfield(fs, &cc); - field(ls, &cc); - } while (testnext(ls, ',') || testnext(ls, ';')); - check_match(ls, '}', '{', line); - lastlistfield(fs, &cc); - SETARG_B(fs->f->code[pc], luaO_int2fb(cc.na)); /* set initial array size */ - SETARG_C(fs->f->code[pc], luaO_int2fb(cc.nh)); /* set initial table size */ -} - -/* }====================================================================== */ - - - -static void parlist (LexState *ls) { - /* parlist -> [ param { `,' param } ] */ - FuncState *fs = ls->fs; - Proto *f = fs->f; - int nparams = 0; - f->is_vararg = 0; - if (ls->t.token != ')') { /* is `parlist' not empty? */ - do { - switch (ls->t.token) { - case TK_NAME: { /* param -> NAME */ - new_localvar(ls, str_checkname(ls)); - nparams++; - break; - } - case TK_DOTS: { /* param -> `...' */ - luaX_next(ls); - f->is_vararg = 1; - break; - } - default: luaX_syntaxerror(ls, " or " LUA_QL("...") " expected"); - } - } while (!f->is_vararg && testnext(ls, ',')); - } - adjustlocalvars(ls, nparams); - f->numparams = cast_byte(fs->nactvar); - luaK_reserveregs(fs, fs->nactvar); /* reserve register for parameters */ -} - - -static void body (LexState *ls, expdesc *e, int ismethod, int line) { - /* body -> `(' parlist `)' block END */ - FuncState new_fs; - BlockCnt bl; - new_fs.f = addprototype(ls); - new_fs.f->linedefined = line; - open_func(ls, &new_fs, &bl); - checknext(ls, '('); - if (ismethod) { - new_localvarliteral(ls, "self"); /* create 'self' parameter */ - adjustlocalvars(ls, 1); - } - parlist(ls); - checknext(ls, ')'); - statlist(ls); - new_fs.f->lastlinedefined = ls->linenumber; - check_match(ls, TK_END, TK_FUNCTION, line); - codeclosure(ls, e); - close_func(ls); -} - - -static int explist (LexState *ls, expdesc *v) { - /* explist -> expr { `,' expr } */ - int n = 1; /* at least one expression */ - expr(ls, v); - while (testnext(ls, ',')) { - luaK_exp2nextreg(ls->fs, v); - expr(ls, v); - n++; - } - return n; -} - - -static void funcargs (LexState *ls, expdesc *f, int line) { - FuncState *fs = ls->fs; - expdesc args; - int base, nparams; - switch (ls->t.token) { - case '(': { /* funcargs -> `(' [ explist ] `)' */ - luaX_next(ls); - if (ls->t.token == ')') /* arg list is empty? 
*/ - args.k = VVOID; - else { - explist(ls, &args); - luaK_setmultret(fs, &args); - } - check_match(ls, ')', '(', line); - break; - } - case '{': { /* funcargs -> constructor */ - constructor(ls, &args); - break; - } - case TK_STRING: { /* funcargs -> STRING */ - codestring(ls, &args, ls->t.seminfo.ts); - luaX_next(ls); /* must use `seminfo' before `next' */ - break; - } - default: { - luaX_syntaxerror(ls, "function arguments expected"); - } - } - lua_assert(f->k == VNONRELOC); - base = f->u.info; /* base register for call */ - if (hasmultret(args.k)) - nparams = LUA_MULTRET; /* open call */ - else { - if (args.k != VVOID) - luaK_exp2nextreg(fs, &args); /* close last argument */ - nparams = fs->freereg - (base+1); - } - init_exp(f, VCALL, luaK_codeABC(fs, OP_CALL, base, nparams+1, 2)); - luaK_fixline(fs, line); - fs->freereg = base+1; /* call remove function and arguments and leaves - (unless changed) one result */ -} - - - - -/* -** {====================================================================== -** Expression parsing -** ======================================================================= -*/ - - -static void primaryexp (LexState *ls, expdesc *v) { - /* primaryexp -> NAME | '(' expr ')' */ - switch (ls->t.token) { - case '(': { - int line = ls->linenumber; - luaX_next(ls); - expr(ls, v); - check_match(ls, ')', '(', line); - luaK_dischargevars(ls->fs, v); - return; - } - case TK_NAME: { - singlevar(ls, v); - return; - } - default: { - luaX_syntaxerror(ls, "unexpected symbol"); - } - } -} - - -static void suffixedexp (LexState *ls, expdesc *v) { - /* suffixedexp -> - primaryexp { '.' NAME | '[' exp ']' | ':' NAME funcargs | funcargs } */ - FuncState *fs = ls->fs; - int line = ls->linenumber; - primaryexp(ls, v); - for (;;) { - switch (ls->t.token) { - case '.': { /* fieldsel */ - fieldsel(ls, v); - break; - } - case '[': { /* `[' exp1 `]' */ - expdesc key; - luaK_exp2anyregup(fs, v); - yindex(ls, &key); - luaK_indexed(fs, v, &key); - break; - } - case ':': { /* `:' NAME funcargs */ - expdesc key; - luaX_next(ls); - checkname(ls, &key); - luaK_self(fs, v, &key); - funcargs(ls, v, line); - break; - } - case '(': case TK_STRING: case '{': { /* funcargs */ - luaK_exp2nextreg(fs, v); - funcargs(ls, v, line); - break; - } - default: return; - } - } -} - - -static void simpleexp (LexState *ls, expdesc *v) { - /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... 
| - constructor | FUNCTION body | suffixedexp */ - switch (ls->t.token) { - case TK_NUMBER: { - init_exp(v, VKNUM, 0); - v->u.nval = ls->t.seminfo.r; - break; - } - case TK_STRING: { - codestring(ls, v, ls->t.seminfo.ts); - break; - } - case TK_NIL: { - init_exp(v, VNIL, 0); - break; - } - case TK_TRUE: { - init_exp(v, VTRUE, 0); - break; - } - case TK_FALSE: { - init_exp(v, VFALSE, 0); - break; - } - case TK_DOTS: { /* vararg */ - FuncState *fs = ls->fs; - check_condition(ls, fs->f->is_vararg, - "cannot use " LUA_QL("...") " outside a vararg function"); - init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0)); - break; - } - case '{': { /* constructor */ - constructor(ls, v); - return; - } - case TK_FUNCTION: { - luaX_next(ls); - body(ls, v, 0, ls->linenumber); - return; - } - default: { - suffixedexp(ls, v); - return; - } - } - luaX_next(ls); -} - - -static UnOpr getunopr (int op) { - switch (op) { - case TK_NOT: return OPR_NOT; - case '-': return OPR_MINUS; - case '#': return OPR_LEN; - default: return OPR_NOUNOPR; - } -} - - -static BinOpr getbinopr (int op) { - switch (op) { - case '+': return OPR_ADD; - case '-': return OPR_SUB; - case '*': return OPR_MUL; - case '/': return OPR_DIV; - case '%': return OPR_MOD; - case '^': return OPR_POW; - case TK_CONCAT: return OPR_CONCAT; - case TK_NE: return OPR_NE; - case TK_EQ: return OPR_EQ; - case '<': return OPR_LT; - case TK_LE: return OPR_LE; - case '>': return OPR_GT; - case TK_GE: return OPR_GE; - case TK_AND: return OPR_AND; - case TK_OR: return OPR_OR; - default: return OPR_NOBINOPR; - } -} - - -static const struct { - lu_byte left; /* left priority for each binary operator */ - lu_byte right; /* right priority */ -} priority[] = { /* ORDER OPR */ - {6, 6}, {6, 6}, {7, 7}, {7, 7}, {7, 7}, /* `+' `-' `*' `/' `%' */ - {10, 9}, {5, 4}, /* ^, .. 
(right associative) */ - {3, 3}, {3, 3}, {3, 3}, /* ==, <, <= */ - {3, 3}, {3, 3}, {3, 3}, /* ~=, >, >= */ - {2, 2}, {1, 1} /* and, or */ -}; - -#define UNARY_PRIORITY 8 /* priority for unary operators */ - - -/* -** subexpr -> (simpleexp | unop subexpr) { binop subexpr } -** where `binop' is any binary operator with a priority higher than `limit' -*/ -static BinOpr subexpr (LexState *ls, expdesc *v, int limit) { - BinOpr op; - UnOpr uop; - enterlevel(ls); - uop = getunopr(ls->t.token); - if (uop != OPR_NOUNOPR) { - int line = ls->linenumber; - luaX_next(ls); - subexpr(ls, v, UNARY_PRIORITY); - luaK_prefix(ls->fs, uop, v, line); - } - else simpleexp(ls, v); - /* expand while operators have priorities higher than `limit' */ - op = getbinopr(ls->t.token); - while (op != OPR_NOBINOPR && priority[op].left > limit) { - expdesc v2; - BinOpr nextop; - int line = ls->linenumber; - luaX_next(ls); - luaK_infix(ls->fs, op, v); - /* read sub-expression with higher priority */ - nextop = subexpr(ls, &v2, priority[op].right); - luaK_posfix(ls->fs, op, v, &v2, line); - op = nextop; - } - leavelevel(ls); - return op; /* return first untreated operator */ -} - - -static void expr (LexState *ls, expdesc *v) { - subexpr(ls, v, 0); -} - -/* }==================================================================== */ - - - -/* -** {====================================================================== -** Rules for Statements -** ======================================================================= -*/ - - -static void block (LexState *ls) { - /* block -> statlist */ - FuncState *fs = ls->fs; - BlockCnt bl; - enterblock(fs, &bl, 0); - statlist(ls); - leaveblock(fs); -} - - -/* -** structure to chain all variables in the left-hand side of an -** assignment -*/ -struct LHS_assign { - struct LHS_assign *prev; - expdesc v; /* variable (global, local, upvalue, or indexed) */ -}; - - -/* -** check whether, in an assignment to an upvalue/local variable, the -** upvalue/local variable is begin used in a previous assignment to a -** table. If so, save original upvalue/local value in a safe place and -** use this safe copy in the previous assignment. -*/ -static void check_conflict (LexState *ls, struct LHS_assign *lh, expdesc *v) { - FuncState *fs = ls->fs; - int extra = fs->freereg; /* eventual position to save local variable */ - int conflict = 0; - for (; lh; lh = lh->prev) { /* check all previous assignments */ - if (lh->v.k == VINDEXED) { /* assigning to a table? */ - /* table is the upvalue/local being assigned now? */ - if (lh->v.u.ind.vt == v->k && lh->v.u.ind.t == v->u.info) { - conflict = 1; - lh->v.u.ind.vt = VLOCAL; - lh->v.u.ind.t = extra; /* previous assignment will use safe copy */ - } - /* index is the local being assigned? (index cannot be upvalue) */ - if (v->k == VLOCAL && lh->v.u.ind.idx == v->u.info) { - conflict = 1; - lh->v.u.ind.idx = extra; /* previous assignment will use safe copy */ - } - } - } - if (conflict) { - /* copy upvalue/local value to a temporary (in position 'extra') */ - OpCode op = (v->k == VLOCAL) ? 
OP_MOVE : OP_GETUPVAL; - luaK_codeABC(fs, op, extra, v->u.info, 0); - luaK_reserveregs(fs, 1); - } -} - - -static void assignment (LexState *ls, struct LHS_assign *lh, int nvars) { - expdesc e; - check_condition(ls, vkisvar(lh->v.k), "syntax error"); - if (testnext(ls, ',')) { /* assignment -> ',' suffixedexp assignment */ - struct LHS_assign nv; - nv.prev = lh; - suffixedexp(ls, &nv.v); - if (nv.v.k != VINDEXED) - check_conflict(ls, lh, &nv.v); - checklimit(ls->fs, nvars + ls->L->nCcalls, LUAI_MAXCCALLS, - "C levels"); - assignment(ls, &nv, nvars+1); - } - else { /* assignment -> `=' explist */ - int nexps; - checknext(ls, '='); - nexps = explist(ls, &e); - if (nexps != nvars) { - adjust_assign(ls, nvars, nexps, &e); - if (nexps > nvars) - ls->fs->freereg -= nexps - nvars; /* remove extra values */ - } - else { - luaK_setoneret(ls->fs, &e); /* close last expression */ - luaK_storevar(ls->fs, &lh->v, &e); - return; /* avoid default */ - } - } - init_exp(&e, VNONRELOC, ls->fs->freereg-1); /* default assignment */ - luaK_storevar(ls->fs, &lh->v, &e); -} - - -static int cond (LexState *ls) { - /* cond -> exp */ - expdesc v; - expr(ls, &v); /* read condition */ - if (v.k == VNIL) v.k = VFALSE; /* `falses' are all equal here */ - luaK_goiftrue(ls->fs, &v); - return v.f; -} - - -static void gotostat (LexState *ls, int pc) { - int line = ls->linenumber; - TString *label; - int g; - if (testnext(ls, TK_GOTO)) - label = str_checkname(ls); - else { - luaX_next(ls); /* skip break */ - label = luaS_new(ls->L, "break"); - } - g = newlabelentry(ls, &ls->dyd->gt, label, line, pc); - findlabel(ls, g); /* close it if label already defined */ -} - - -/* check for repeated labels on the same block */ -static void checkrepeated (FuncState *fs, Labellist *ll, TString *label) { - int i; - for (i = fs->bl->firstlabel; i < ll->n; i++) { - if (luaS_eqstr(label, ll->arr[i].name)) { - const char *msg = luaO_pushfstring(fs->ls->L, - "label " LUA_QS " already defined on line %d", - getstr(label), ll->arr[i].line); - semerror(fs->ls, msg); - } - } -} - - -/* skip no-op statements */ -static void skipnoopstat (LexState *ls) { - while (ls->t.token == ';' || ls->t.token == TK_DBCOLON) - statement(ls); -} - - -static void labelstat (LexState *ls, TString *label, int line) { - /* label -> '::' NAME '::' */ - FuncState *fs = ls->fs; - Labellist *ll = &ls->dyd->label; - int l; /* index of new label being created */ - checkrepeated(fs, ll, label); /* check for repeated labels */ - checknext(ls, TK_DBCOLON); /* skip double colon */ - /* create new entry for this label */ - l = newlabelentry(ls, ll, label, line, fs->pc); - skipnoopstat(ls); /* skip other no-op statements */ - if (block_follow(ls, 0)) { /* label is last no-op statement in the block? 
*/ - /* assume that locals are already out of scope */ - ll->arr[l].nactvar = fs->bl->nactvar; - } - findgotos(ls, &ll->arr[l]); -} - - -static void whilestat (LexState *ls, int line) { - /* whilestat -> WHILE cond DO block END */ - FuncState *fs = ls->fs; - int whileinit; - int condexit; - BlockCnt bl; - luaX_next(ls); /* skip WHILE */ - whileinit = luaK_getlabel(fs); - condexit = cond(ls); - enterblock(fs, &bl, 1); - checknext(ls, TK_DO); - block(ls); - luaK_jumpto(fs, whileinit); - check_match(ls, TK_END, TK_WHILE, line); - leaveblock(fs); - luaK_patchtohere(fs, condexit); /* false conditions finish the loop */ -} - - -static void repeatstat (LexState *ls, int line) { - /* repeatstat -> REPEAT block UNTIL cond */ - int condexit; - FuncState *fs = ls->fs; - int repeat_init = luaK_getlabel(fs); - BlockCnt bl1, bl2; - enterblock(fs, &bl1, 1); /* loop block */ - enterblock(fs, &bl2, 0); /* scope block */ - luaX_next(ls); /* skip REPEAT */ - statlist(ls); - check_match(ls, TK_UNTIL, TK_REPEAT, line); - condexit = cond(ls); /* read condition (inside scope block) */ - if (bl2.upval) /* upvalues? */ - luaK_patchclose(fs, condexit, bl2.nactvar); - leaveblock(fs); /* finish scope */ - luaK_patchlist(fs, condexit, repeat_init); /* close the loop */ - leaveblock(fs); /* finish loop */ -} - - -static int exp1 (LexState *ls) { - expdesc e; - int reg; - expr(ls, &e); - luaK_exp2nextreg(ls->fs, &e); - lua_assert(e.k == VNONRELOC); - reg = e.u.info; - return reg; -} - - -static void forbody (LexState *ls, int base, int line, int nvars, int isnum) { - /* forbody -> DO block */ - BlockCnt bl; - FuncState *fs = ls->fs; - int prep, endfor; - adjustlocalvars(ls, 3); /* control variables */ - checknext(ls, TK_DO); - prep = isnum ? luaK_codeAsBx(fs, OP_FORPREP, base, NO_JUMP) : luaK_jump(fs); - enterblock(fs, &bl, 0); /* scope for declared variables */ - adjustlocalvars(ls, nvars); - luaK_reserveregs(fs, nvars); - block(ls); - leaveblock(fs); /* end of scope for declared variables */ - luaK_patchtohere(fs, prep); - if (isnum) /* numeric for? 
*/ - endfor = luaK_codeAsBx(fs, OP_FORLOOP, base, NO_JUMP); - else { /* generic for */ - luaK_codeABC(fs, OP_TFORCALL, base, 0, nvars); - luaK_fixline(fs, line); - endfor = luaK_codeAsBx(fs, OP_TFORLOOP, base + 2, NO_JUMP); - } - luaK_patchlist(fs, endfor, prep + 1); - luaK_fixline(fs, line); -} - - -static void fornum (LexState *ls, TString *varname, int line) { - /* fornum -> NAME = exp1,exp1[,exp1] forbody */ - FuncState *fs = ls->fs; - int base = fs->freereg; - new_localvarliteral(ls, "(for index)"); - new_localvarliteral(ls, "(for limit)"); - new_localvarliteral(ls, "(for step)"); - new_localvar(ls, varname); - checknext(ls, '='); - exp1(ls); /* initial value */ - checknext(ls, ','); - exp1(ls); /* limit */ - if (testnext(ls, ',')) - exp1(ls); /* optional step */ - else { /* default step = 1 */ - luaK_codek(fs, fs->freereg, luaK_numberK(fs, 1)); - luaK_reserveregs(fs, 1); - } - forbody(ls, base, line, 1, 1); -} - - -static void forlist (LexState *ls, TString *indexname) { - /* forlist -> NAME {,NAME} IN explist forbody */ - FuncState *fs = ls->fs; - expdesc e; - int nvars = 4; /* gen, state, control, plus at least one declared var */ - int line; - int base = fs->freereg; - /* create control variables */ - new_localvarliteral(ls, "(for generator)"); - new_localvarliteral(ls, "(for state)"); - new_localvarliteral(ls, "(for control)"); - /* create declared variables */ - new_localvar(ls, indexname); - while (testnext(ls, ',')) { - new_localvar(ls, str_checkname(ls)); - nvars++; - } - checknext(ls, TK_IN); - line = ls->linenumber; - adjust_assign(ls, 3, explist(ls, &e), &e); - luaK_checkstack(fs, 3); /* extra space to call generator */ - forbody(ls, base, line, nvars - 3, 0); -} - - -static void forstat (LexState *ls, int line) { - /* forstat -> FOR (fornum | forlist) END */ - FuncState *fs = ls->fs; - TString *varname; - BlockCnt bl; - enterblock(fs, &bl, 1); /* scope for loop and control variables */ - luaX_next(ls); /* skip `for' */ - varname = str_checkname(ls); /* first variable name */ - switch (ls->t.token) { - case '=': fornum(ls, varname, line); break; - case ',': case TK_IN: forlist(ls, varname); break; - default: luaX_syntaxerror(ls, LUA_QL("=") " or " LUA_QL("in") " expected"); - } - check_match(ls, TK_END, TK_FOR, line); - leaveblock(fs); /* loop scope (`break' jumps to this point) */ -} - - -static void test_then_block (LexState *ls, int *escapelist) { - /* test_then_block -> [IF | ELSEIF] cond THEN block */ - BlockCnt bl; - FuncState *fs = ls->fs; - expdesc v; - int jf; /* instruction to skip 'then' code (if condition is false) */ - luaX_next(ls); /* skip IF or ELSEIF */ - expr(ls, &v); /* read condition */ - checknext(ls, TK_THEN); - if (ls->t.token == TK_GOTO || ls->t.token == TK_BREAK) { - luaK_goiffalse(ls->fs, &v); /* will jump to label if condition is true */ - enterblock(fs, &bl, 0); /* must enter block before 'goto' */ - gotostat(ls, v.t); /* handle goto/break */ - skipnoopstat(ls); /* skip other no-op statements */ - if (block_follow(ls, 0)) { /* 'goto' is the entire block? */ - leaveblock(fs); - return; /* and that is it */ - } - else /* must skip over 'then' part if condition is false */ - jf = luaK_jump(fs); - } - else { /* regular case (not goto/break) */ - luaK_goiftrue(ls->fs, &v); /* skip over block if condition is false */ - enterblock(fs, &bl, 0); - jf = v.f; - } - statlist(ls); /* `then' part */ - leaveblock(fs); - if (ls->t.token == TK_ELSE || - ls->t.token == TK_ELSEIF) /* followed by 'else'/'elseif'? 
*/ - luaK_concat(fs, escapelist, luaK_jump(fs)); /* must jump over it */ - luaK_patchtohere(fs, jf); -} - - -static void ifstat (LexState *ls, int line) { - /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */ - FuncState *fs = ls->fs; - int escapelist = NO_JUMP; /* exit list for finished parts */ - test_then_block(ls, &escapelist); /* IF cond THEN block */ - while (ls->t.token == TK_ELSEIF) - test_then_block(ls, &escapelist); /* ELSEIF cond THEN block */ - if (testnext(ls, TK_ELSE)) - block(ls); /* `else' part */ - check_match(ls, TK_END, TK_IF, line); - luaK_patchtohere(fs, escapelist); /* patch escape list to 'if' end */ -} - - -static void localfunc (LexState *ls) { - expdesc b; - FuncState *fs = ls->fs; - new_localvar(ls, str_checkname(ls)); /* new local variable */ - adjustlocalvars(ls, 1); /* enter its scope */ - body(ls, &b, 0, ls->linenumber); /* function created in next register */ - /* debug information will only see the variable after this point! */ - getlocvar(fs, b.u.info)->startpc = fs->pc; -} - - -static void localstat (LexState *ls) { - /* stat -> LOCAL NAME {`,' NAME} [`=' explist] */ - int nvars = 0; - int nexps; - expdesc e; - do { - new_localvar(ls, str_checkname(ls)); - nvars++; - } while (testnext(ls, ',')); - if (testnext(ls, '=')) - nexps = explist(ls, &e); - else { - e.k = VVOID; - nexps = 0; - } - adjust_assign(ls, nvars, nexps, &e); - adjustlocalvars(ls, nvars); -} - - -static int funcname (LexState *ls, expdesc *v) { - /* funcname -> NAME {fieldsel} [`:' NAME] */ - int ismethod = 0; - singlevar(ls, v); - while (ls->t.token == '.') - fieldsel(ls, v); - if (ls->t.token == ':') { - ismethod = 1; - fieldsel(ls, v); - } - return ismethod; -} - - -static void funcstat (LexState *ls, int line) { - /* funcstat -> FUNCTION funcname body */ - int ismethod; - expdesc v, b; - luaX_next(ls); /* skip FUNCTION */ - ismethod = funcname(ls, &v); - body(ls, &b, ismethod, line); - luaK_storevar(ls->fs, &v, &b); - luaK_fixline(ls->fs, line); /* definition `happens' in the first line */ -} - - -static void exprstat (LexState *ls) { - /* stat -> func | assignment */ - FuncState *fs = ls->fs; - struct LHS_assign v; - suffixedexp(ls, &v.v); - if (ls->t.token == '=' || ls->t.token == ',') { /* stat -> assignment ? */ - v.prev = NULL; - assignment(ls, &v, 1); - } - else { /* stat -> func */ - check_condition(ls, v.v.k == VCALL, "syntax error"); - SETARG_C(getcode(fs, &v.v), 1); /* call statement uses no results */ - } -} - - -static void retstat (LexState *ls) { - /* stat -> RETURN [explist] [';'] */ - FuncState *fs = ls->fs; - expdesc e; - int first, nret; /* registers with returned values */ - if (block_follow(ls, 1) || ls->t.token == ';') - first = nret = 0; /* return no values */ - else { - nret = explist(ls, &e); /* optional return values */ - if (hasmultret(e.k)) { - luaK_setmultret(fs, &e); - if (e.k == VCALL && nret == 1) { /* tail call? */ - SET_OPCODE(getcode(fs,&e), OP_TAILCALL); - lua_assert(GETARG_A(getcode(fs,&e)) == fs->nactvar); - } - first = fs->nactvar; - nret = LUA_MULTRET; /* return all values */ - } - else { - if (nret == 1) /* only one single value? 
*/ - first = luaK_exp2anyreg(fs, &e); - else { - luaK_exp2nextreg(fs, &e); /* values must go to the `stack' */ - first = fs->nactvar; /* return all `active' values */ - lua_assert(nret == fs->freereg - first); - } - } - } - luaK_ret(fs, first, nret); - testnext(ls, ';'); /* skip optional semicolon */ -} - - -static void statement (LexState *ls) { - int line = ls->linenumber; /* may be needed for error messages */ - enterlevel(ls); - switch (ls->t.token) { - case ';': { /* stat -> ';' (empty statement) */ - luaX_next(ls); /* skip ';' */ - break; - } - case TK_IF: { /* stat -> ifstat */ - ifstat(ls, line); - break; - } - case TK_WHILE: { /* stat -> whilestat */ - whilestat(ls, line); - break; - } - case TK_DO: { /* stat -> DO block END */ - luaX_next(ls); /* skip DO */ - block(ls); - check_match(ls, TK_END, TK_DO, line); - break; - } - case TK_FOR: { /* stat -> forstat */ - forstat(ls, line); - break; - } - case TK_REPEAT: { /* stat -> repeatstat */ - repeatstat(ls, line); - break; - } - case TK_FUNCTION: { /* stat -> funcstat */ - funcstat(ls, line); - break; - } - case TK_LOCAL: { /* stat -> localstat */ - luaX_next(ls); /* skip LOCAL */ - if (testnext(ls, TK_FUNCTION)) /* local function? */ - localfunc(ls); - else - localstat(ls); - break; - } - case TK_DBCOLON: { /* stat -> label */ - luaX_next(ls); /* skip double colon */ - labelstat(ls, str_checkname(ls), line); - break; - } - case TK_RETURN: { /* stat -> retstat */ - luaX_next(ls); /* skip RETURN */ - retstat(ls); - break; - } - case TK_BREAK: /* stat -> breakstat */ - case TK_GOTO: { /* stat -> 'goto' NAME */ - gotostat(ls, luaK_jump(ls->fs)); - break; - } - default: { /* stat -> func | assignment */ - exprstat(ls); - break; - } - } - lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg && - ls->fs->freereg >= ls->fs->nactvar); - ls->fs->freereg = ls->fs->nactvar; /* free registers */ - leavelevel(ls); -} - -/* }====================================================================== */ - - -/* -** compiles the main function, which is a regular vararg function with an -** upvalue named LUA_ENV -*/ -static void mainfunc (LexState *ls, FuncState *fs) { - BlockCnt bl; - expdesc v; - open_func(ls, fs, &bl); - fs->f->is_vararg = 1; /* main function is always vararg */ - init_exp(&v, VLOCAL, 0); /* create and... 
*/ - newupvalue(fs, ls->envn, &v); /* ...set environment upvalue */ - luaX_next(ls); /* read first token */ - statlist(ls); /* parse main body */ - check(ls, TK_EOS); - close_func(ls); -} - - -Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff, - Dyndata *dyd, const char *name, int firstchar) { - LexState lexstate; - FuncState funcstate; - Closure *cl = luaF_newLclosure(L, 1); /* create main closure */ - /* anchor closure (to avoid being collected) */ - setclLvalue(L, L->top, cl); - incr_top(L); - funcstate.f = cl->l.p = luaF_newproto(L); - funcstate.f->source = luaS_new(L, name); /* create and anchor TString */ - lexstate.buff = buff; - lexstate.dyd = dyd; - dyd->actvar.n = dyd->gt.n = dyd->label.n = 0; - luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar); - mainfunc(&lexstate, &funcstate); - lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs); - /* all scopes should be correctly finished */ - lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0); - return cl; /* it's on the stack too */ -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.h +++ /dev/null @@ -1,228 +0,0 @@ -/* -** $Id: lstate.h,v 2.82.1.1 2013/04/12 18:48:47 roberto Exp $ -** Global State -** See Copyright Notice in lua.h -*/ - -#ifndef lstate_h -#define lstate_h - -#include "lua.h" - -#include "lobject.h" -#include "ltm.h" -#include "lzio.h" - - -/* - -** Some notes about garbage-collected objects: All objects in Lua must -** be kept somehow accessible until being freed. -** -** Lua keeps most objects linked in list g->allgc. The link uses field -** 'next' of the CommonHeader. -** -** Strings are kept in several lists headed by the array g->strt.hash. -** -** Open upvalues are not subject to independent garbage collection. They -** are collected together with their respective threads. Lua keeps a -** double-linked list with all open upvalues (g->uvhead) so that it can -** mark objects referred by them. (They are always gray, so they must -** be remarked in the atomic step. Usually their contents would be marked -** when traversing the respective threads, but the thread may already be -** dead, while the upvalue is still accessible through closures.) -** -** Objects with finalizers are kept in the list g->finobj. -** -** The list g->tobefnz links all objects being finalized. - -*/ - - -struct lua_longjmp; /* defined in ldo.c */ - - - -/* extra stack space to handle TM calls and some other extras */ -#define EXTRA_STACK 5 - - -#define BASIC_STACK_SIZE (2*LUA_MINSTACK) - - -/* kinds of Garbage Collection */ -#define KGC_NORMAL 0 -#define KGC_EMERGENCY 1 /* gc was forced by an allocation failure */ -#define KGC_GEN 2 /* generational collection */ - - -typedef struct stringtable { - GCObject **hash; - lu_int32 nuse; /* number of elements */ - int size; -} stringtable; - - -/* -** information about a call -*/ -typedef struct CallInfo { - StkId func; /* function index in the stack */ - StkId top; /* top for this function */ - struct CallInfo *previous, *next; /* dynamic call link */ - short nresults; /* expected number of results from this function */ - lu_byte callstatus; - ptrdiff_t extra; - union { - struct { /* only for Lua functions */ - StkId base; /* base for this function */ - const Instruction *savedpc; - } l; - struct { /* only for C functions */ - int ctx; /* context info. 
in case of yields */ - lua_CFunction k; /* continuation in case of yields */ - ptrdiff_t old_errfunc; - lu_byte old_allowhook; - lu_byte status; - } c; - } u; -} CallInfo; - - -/* -** Bits in CallInfo status -*/ -#define CIST_LUA (1<<0) /* call is running a Lua function */ -#define CIST_HOOKED (1<<1) /* call is running a debug hook */ -#define CIST_REENTRY (1<<2) /* call is running on same invocation of - luaV_execute of previous call */ -#define CIST_YIELDED (1<<3) /* call reentered after suspension */ -#define CIST_YPCALL (1<<4) /* call is a yieldable protected call */ -#define CIST_STAT (1<<5) /* call has an error status (pcall) */ -#define CIST_TAIL (1<<6) /* call was tail called */ -#define CIST_HOOKYIELD (1<<7) /* last hook called yielded */ - - -#define isLua(ci) ((ci)->callstatus & CIST_LUA) - - -/* -** `global state', shared by all threads of this state -*/ -typedef struct global_State { - lua_Alloc frealloc; /* function to reallocate memory */ - void *ud; /* auxiliary data to `frealloc' */ - lu_mem totalbytes; /* number of bytes currently allocated - GCdebt */ - l_mem GCdebt; /* bytes allocated not yet compensated by the collector */ - lu_mem GCmemtrav; /* memory traversed by the GC */ - lu_mem GCestimate; /* an estimate of the non-garbage memory in use */ - stringtable strt; /* hash table for strings */ - TValue l_registry; - unsigned int seed; /* randomized seed for hashes */ - lu_byte currentwhite; - lu_byte gcstate; /* state of garbage collector */ - lu_byte gckind; /* kind of GC running */ - lu_byte gcrunning; /* true if GC is running */ - int sweepstrgc; /* position of sweep in `strt' */ - GCObject *allgc; /* list of all collectable objects */ - GCObject *finobj; /* list of collectable objects with finalizers */ - GCObject **sweepgc; /* current position of sweep in list 'allgc' */ - GCObject **sweepfin; /* current position of sweep in list 'finobj' */ - GCObject *gray; /* list of gray objects */ - GCObject *grayagain; /* list of objects to be traversed atomically */ - GCObject *weak; /* list of tables with weak values */ - GCObject *ephemeron; /* list of ephemeron tables (weak keys) */ - GCObject *allweak; /* list of all-weak tables */ - GCObject *tobefnz; /* list of userdata to be GC */ - UpVal uvhead; /* head of double-linked list of all open upvalues */ - Mbuffer buff; /* temporary buffer for string concatenation */ - int gcpause; /* size of pause between successive GCs */ - int gcmajorinc; /* pause between major collections (only in gen. 
mode) */ - int gcstepmul; /* GC `granularity' */ - lua_CFunction panic; /* to be called in unprotected errors */ - struct lua_State *mainthread; - const lua_Number *version; /* pointer to version number */ - TString *memerrmsg; /* memory-error message */ - TString *tmname[TM_N]; /* array with tag-method names */ - struct Table *mt[LUA_NUMTAGS]; /* metatables for basic types */ -} global_State; - - -/* -** `per thread' state -*/ -struct lua_State { - CommonHeader; - lu_byte status; - StkId top; /* first free slot in the stack */ - global_State *l_G; - CallInfo *ci; /* call info for current function */ - const Instruction *oldpc; /* last pc traced */ - StkId stack_last; /* last free slot in the stack */ - StkId stack; /* stack base */ - int stacksize; - unsigned short nny; /* number of non-yieldable calls in stack */ - unsigned short nCcalls; /* number of nested C calls */ - lu_byte hookmask; - lu_byte allowhook; - int basehookcount; - int hookcount; - lua_Hook hook; - GCObject *openupval; /* list of open upvalues in this stack */ - GCObject *gclist; - struct lua_longjmp *errorJmp; /* current error recover point */ - ptrdiff_t errfunc; /* current error handling function (stack index) */ - CallInfo base_ci; /* CallInfo for first level (C calling Lua) */ -}; - - -#define G(L) (L->l_G) - - -/* -** Union of all collectable objects -*/ -union GCObject { - GCheader gch; /* common header */ - union TString ts; - union Udata u; - union Closure cl; - struct Table h; - struct Proto p; - struct UpVal uv; - struct lua_State th; /* thread */ -}; - - -#define gch(o) (&(o)->gch) - -/* macros to convert a GCObject into a specific value */ -#define rawgco2ts(o) \ - check_exp(novariant((o)->gch.tt) == LUA_TSTRING, &((o)->ts)) -#define gco2ts(o) (&rawgco2ts(o)->tsv) -#define rawgco2u(o) check_exp((o)->gch.tt == LUA_TUSERDATA, &((o)->u)) -#define gco2u(o) (&rawgco2u(o)->uv) -#define gco2lcl(o) check_exp((o)->gch.tt == LUA_TLCL, &((o)->cl.l)) -#define gco2ccl(o) check_exp((o)->gch.tt == LUA_TCCL, &((o)->cl.c)) -#define gco2cl(o) \ - check_exp(novariant((o)->gch.tt) == LUA_TFUNCTION, &((o)->cl)) -#define gco2t(o) check_exp((o)->gch.tt == LUA_TTABLE, &((o)->h)) -#define gco2p(o) check_exp((o)->gch.tt == LUA_TPROTO, &((o)->p)) -#define gco2uv(o) check_exp((o)->gch.tt == LUA_TUPVAL, &((o)->uv)) -#define gco2th(o) check_exp((o)->gch.tt == LUA_TTHREAD, &((o)->th)) - -/* macro to convert any Lua object into a GCObject */ -#define obj2gco(v) (cast(GCObject *, (v))) - - -/* actual number of total bytes allocated */ -#define gettotalbytes(g) ((g)->totalbytes + (g)->GCdebt) - -LUAI_FUNC void luaE_setdebt (global_State *g, l_mem debt); -LUAI_FUNC void luaE_freethread (lua_State *L, lua_State *L1); -LUAI_FUNC CallInfo *luaE_extendCI (lua_State *L); -LUAI_FUNC void luaE_freeCI (lua_State *L); - - -#endif - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c +++ /dev/null @@ -1,321 +0,0 @@ -/* -** $Id: lstate.c,v 2.99.1.2 2013/11/08 17:45:31 roberto Exp $ -** Global State -** See Copyright Notice in lua.h -*/ - - -#include - -#define lstate_c -#define LUA_CORE - -#include "lua.h" - -#include "lapi.h" -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include "llex.h" -#include "lmem.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" - - -#if !defined(LUAI_GCPAUSE) -#define LUAI_GCPAUSE 200 /* 200% */ -#endif - -#if 
!defined(LUAI_GCMAJOR) -#define LUAI_GCMAJOR 200 /* 200% */ -#endif - -#if !defined(LUAI_GCMUL) -#define LUAI_GCMUL 200 /* GC runs 'twice the speed' of memory allocation */ -#endif - - -#define MEMERRMSG "not enough memory" - - -/* -** a macro to help the creation of a unique random seed when a state is -** created; the seed is used to randomize hashes. -*/ -#if !defined(luai_makeseed) -#define luai_makeseed() cast(unsigned int, gethrtime()) -#endif - - - -/* -** thread state + extra space -*/ -typedef struct LX { -#if defined(LUAI_EXTRASPACE) - char buff[LUAI_EXTRASPACE]; -#endif - lua_State l; -} LX; - - -/* -** Main thread combines a thread state and the global state -*/ -typedef struct LG { - LX l; - global_State g; -} LG; - - - -#define fromstate(L) (cast(LX *, cast(lu_byte *, (L)) - offsetof(LX, l))) - - -/* -** Compute an initial seed as random as possible. In ANSI, rely on -** Address Space Layout Randomization (if present) to increase -** randomness.. -*/ -#define addbuff(b,p,e) \ - { size_t t = cast(size_t, e); \ - memcpy(buff + p, &t, sizeof(t)); p += sizeof(t); } - -static unsigned int makeseed (lua_State *L) { - char buff[4 * sizeof(size_t)]; - unsigned int h = luai_makeseed(); - int p = 0; - addbuff(buff, p, L); /* heap variable */ - addbuff(buff, p, &h); /* local variable */ - addbuff(buff, p, luaO_nilobject); /* global variable */ - addbuff(buff, p, &lua_newstate); /* public function */ - lua_assert(p == sizeof(buff)); - return luaS_hash(buff, p, h); -} - - -/* -** set GCdebt to a new value keeping the value (totalbytes + GCdebt) -** invariant -*/ -void luaE_setdebt (global_State *g, l_mem debt) { - g->totalbytes -= (debt - g->GCdebt); - g->GCdebt = debt; -} - - -CallInfo *luaE_extendCI (lua_State *L) { - CallInfo *ci = luaM_new(L, CallInfo); - lua_assert(L->ci->next == NULL); - L->ci->next = ci; - ci->previous = L->ci; - ci->next = NULL; - return ci; -} - - -void luaE_freeCI (lua_State *L) { - CallInfo *ci = L->ci; - CallInfo *next = ci->next; - ci->next = NULL; - while ((ci = next) != NULL) { - next = ci->next; - luaM_free(L, ci); - } -} - - -static void stack_init (lua_State *L1, lua_State *L) { - int i; CallInfo *ci; - /* initialize stack array */ - L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue); - L1->stacksize = BASIC_STACK_SIZE; - for (i = 0; i < BASIC_STACK_SIZE; i++) - setnilvalue(L1->stack + i); /* erase new stack */ - L1->top = L1->stack; - L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK; - /* initialize first ci */ - ci = &L1->base_ci; - ci->next = ci->previous = NULL; - ci->callstatus = 0; - ci->func = L1->top; - setnilvalue(L1->top++); /* 'function' entry for this 'ci' */ - ci->top = L1->top + LUA_MINSTACK; - L1->ci = ci; -} - - -static void freestack (lua_State *L) { - if (L->stack == NULL) - return; /* stack not completely built yet */ - L->ci = &L->base_ci; /* free the entire 'ci' list */ - luaE_freeCI(L); - luaM_freearray(L, L->stack, L->stacksize); /* free stack array */ -} - - -/* -** Create registry table and its predefined values -*/ -static void init_registry (lua_State *L, global_State *g) { - TValue mt; - /* create registry */ - Table *registry = luaH_new(L); - sethvalue(L, &g->l_registry, registry); - luaH_resize(L, registry, LUA_RIDX_LAST, 0); - /* registry[LUA_RIDX_MAINTHREAD] = L */ - setthvalue(L, &mt, L); - luaH_setint(L, registry, LUA_RIDX_MAINTHREAD, &mt); - /* registry[LUA_RIDX_GLOBALS] = table of globals */ - sethvalue(L, &mt, luaH_new(L)); - luaH_setint(L, registry, LUA_RIDX_GLOBALS, &mt); -} - - -/* -** open parts of 
the state that may cause memory-allocation errors -*/ -static void f_luaopen (lua_State *L, void *ud) { - global_State *g = G(L); - UNUSED(ud); - stack_init(L, L); /* init stack */ - init_registry(L, g); - luaS_resize(L, MINSTRTABSIZE); /* initial size of string table */ - luaT_init(L); - luaX_init(L); - /* pre-create memory-error message */ - g->memerrmsg = luaS_newliteral(L, MEMERRMSG); - luaS_fix(g->memerrmsg); /* it should never be collected */ - g->gcrunning = 1; /* allow gc */ - g->version = lua_version(NULL); - luai_userstateopen(L); -} - - -/* -** preinitialize a state with consistent values without allocating -** any memory (to avoid errors) -*/ -static void preinit_state (lua_State *L, global_State *g) { - G(L) = g; - L->stack = NULL; - L->ci = NULL; - L->stacksize = 0; - L->errorJmp = NULL; - L->nCcalls = 0; - L->hook = NULL; - L->hookmask = 0; - L->basehookcount = 0; - L->allowhook = 1; - resethookcount(L); - L->openupval = NULL; - L->nny = 1; - L->status = LUA_OK; - L->errfunc = 0; -} - - -static void close_state (lua_State *L) { - global_State *g = G(L); - luaF_close(L, L->stack); /* close all upvalues for this thread */ - luaC_freeallobjects(L); /* collect all objects */ - if (g->version) /* closing a fully built state? */ - luai_userstateclose(L); - luaM_freearray(L, G(L)->strt.hash, G(L)->strt.size); - luaZ_freebuffer(L, &g->buff); - freestack(L); - lua_assert(gettotalbytes(g) == sizeof(LG)); - (*g->frealloc)(g->ud, fromstate(L), sizeof(LG), 0); /* free main block */ -} - - -LUA_API lua_State *lua_newthread (lua_State *L) { - lua_State *L1; - lua_lock(L); - luaC_checkGC(L); - L1 = &luaC_newobj(L, LUA_TTHREAD, sizeof(LX), NULL, offsetof(LX, l))->th; - setthvalue(L, L->top, L1); - api_incr_top(L); - preinit_state(L1, G(L)); - L1->hookmask = L->hookmask; - L1->basehookcount = L->basehookcount; - L1->hook = L->hook; - resethookcount(L1); - luai_userstatethread(L, L1); - stack_init(L1, L); /* init stack */ - lua_unlock(L); - return L1; -} - - -void luaE_freethread (lua_State *L, lua_State *L1) { - LX *l = fromstate(L1); - luaF_close(L1, L1->stack); /* close all upvalues for this thread */ - lua_assert(L1->openupval == NULL); - luai_userstatefree(L, L1); - freestack(L1); - luaM_free(L, l); -} - - -LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) { - int i; - lua_State *L; - global_State *g; - LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG))); - if (l == NULL) return NULL; - L = &l->l.l; - g = &l->g; - L->next = NULL; - L->tt = LUA_TTHREAD; - g->currentwhite = bit2mask(WHITE0BIT, FIXEDBIT); - L->marked = luaC_white(g); - g->gckind = KGC_NORMAL; - preinit_state(L, g); - g->frealloc = f; - g->ud = ud; - g->mainthread = L; - g->seed = makeseed(L); - g->uvhead.u.l.prev = &g->uvhead; - g->uvhead.u.l.next = &g->uvhead; - g->gcrunning = 0; /* no GC while building state */ - g->GCestimate = 0; - g->strt.size = 0; - g->strt.nuse = 0; - g->strt.hash = NULL; - setnilvalue(&g->l_registry); - luaZ_initbuffer(L, &g->buff); - g->panic = NULL; - g->version = NULL; - g->gcstate = GCSpause; - g->allgc = NULL; - g->finobj = NULL; - g->tobefnz = NULL; - g->sweepgc = g->sweepfin = NULL; - g->gray = g->grayagain = NULL; - g->weak = g->ephemeron = g->allweak = NULL; - g->totalbytes = sizeof(LG); - g->GCdebt = 0; - g->gcpause = LUAI_GCPAUSE; - g->gcmajorinc = LUAI_GCMAJOR; - g->gcstepmul = LUAI_GCMUL; - for (i=0; i < LUA_NUMTAGS; i++) g->mt[i] = NULL; - if (luaD_rawrunprotected(L, f_luaopen, NULL) != LUA_OK) { - /* memory allocation error: free partial state */ - close_state(L); - L = 
NULL; - } - return L; -} - - -LUA_API void lua_close (lua_State *L) { - L = G(L)->mainthread; /* only the main thread can be closed */ - lua_lock(L); - close_state(L); -} - - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.h +++ /dev/null @@ -1,46 +0,0 @@ -/* -** $Id: lstring.h,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $ -** String table (keep all strings handled by Lua) -** See Copyright Notice in lua.h -*/ - -#ifndef lstring_h -#define lstring_h - -#include "lgc.h" -#include "lobject.h" -#include "lstate.h" - - -#define sizestring(s) (sizeof(union TString)+((s)->len+1)*sizeof(char)) - -#define sizeudata(u) (sizeof(union Udata)+(u)->len) - -#define luaS_newliteral(L, s) (luaS_newlstr(L, "" s, \ - (sizeof(s)/sizeof(char))-1)) - -#define luaS_fix(s) l_setbit((s)->tsv.marked, FIXEDBIT) - - -/* -** test whether a string is a reserved word -*/ -#define isreserved(s) ((s)->tsv.tt == LUA_TSHRSTR && (s)->tsv.extra > 0) - - -/* -** equality for short strings, which are always internalized -*/ -#define eqshrstr(a,b) check_exp((a)->tsv.tt == LUA_TSHRSTR, (a) == (b)) - - -LUAI_FUNC unsigned int luaS_hash (const char *str, size_t l, unsigned int seed); -LUAI_FUNC int luaS_eqlngstr (TString *a, TString *b); -LUAI_FUNC int luaS_eqstr (TString *a, TString *b); -LUAI_FUNC void luaS_resize (lua_State *L, int newsize); -LUAI_FUNC Udata *luaS_newudata (lua_State *L, size_t s, Table *e); -LUAI_FUNC TString *luaS_newlstr (lua_State *L, const char *str, size_t l); -LUAI_FUNC TString *luaS_new (lua_State *L, const char *str); - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c +++ /dev/null @@ -1,185 +0,0 @@ -/* -** $Id: lstring.c,v 2.26.1.1 2013/04/12 18:48:47 roberto Exp $ -** String table (keeps all strings handled by Lua) -** See Copyright Notice in lua.h -*/ - - -#include - -#define lstring_c -#define LUA_CORE - -#include "lua.h" - -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" - - -/* -** Lua will use at most ~(2^LUAI_HASHLIMIT) bytes from a string to -** compute its hash -*/ -#if !defined(LUAI_HASHLIMIT) -#define LUAI_HASHLIMIT 5 -#endif - - -/* -** equality for long strings -*/ -int luaS_eqlngstr (TString *a, TString *b) { - size_t len = a->tsv.len; - lua_assert(a->tsv.tt == LUA_TLNGSTR && b->tsv.tt == LUA_TLNGSTR); - return (a == b) || /* same instance or... */ - ((len == b->tsv.len) && /* equal length and ... */ - (memcmp(getstr(a), getstr(b), len) == 0)); /* equal contents */ -} - - -/* -** equality for strings -*/ -int luaS_eqstr (TString *a, TString *b) { - return (a->tsv.tt == b->tsv.tt) && - (a->tsv.tt == LUA_TSHRSTR ? 
eqshrstr(a, b) : luaS_eqlngstr(a, b)); -} - - -unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) { - unsigned int h = seed ^ cast(unsigned int, l); - size_t l1; - size_t step = (l >> LUAI_HASHLIMIT) + 1; - for (l1 = l; l1 >= step; l1 -= step) - h = h ^ ((h<<5) + (h>>2) + cast_byte(str[l1 - 1])); - return h; -} - - -/* -** resizes the string table -*/ -void luaS_resize (lua_State *L, int newsize) { - int i; - stringtable *tb = &G(L)->strt; - /* cannot resize while GC is traversing strings */ - luaC_runtilstate(L, ~bitmask(GCSsweepstring)); - if (newsize > tb->size) { - luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *); - for (i = tb->size; i < newsize; i++) tb->hash[i] = NULL; - } - /* rehash */ - for (i=0; isize; i++) { - GCObject *p = tb->hash[i]; - tb->hash[i] = NULL; - while (p) { /* for each node in the list */ - GCObject *next = gch(p)->next; /* save next */ - unsigned int h = lmod(gco2ts(p)->hash, newsize); /* new position */ - gch(p)->next = tb->hash[h]; /* chain it */ - tb->hash[h] = p; - resetoldbit(p); /* see MOVE OLD rule */ - p = next; - } - } - if (newsize < tb->size) { - /* shrinking slice must be empty */ - lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL); - luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *); - } - tb->size = newsize; -} - - -/* -** creates a new string object -*/ -static TString *createstrobj (lua_State *L, const char *str, size_t l, - int tag, unsigned int h, GCObject **list) { - TString *ts; - size_t totalsize; /* total size of TString object */ - totalsize = sizeof(TString) + ((l + 1) * sizeof(char)); - ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts; - ts->tsv.len = l; - ts->tsv.hash = h; - ts->tsv.extra = 0; - memcpy(ts+1, str, l*sizeof(char)); - ((char *)(ts+1))[l] = '\0'; /* ending 0 */ - return ts; -} - - -/* -** creates a new short string, inserting it into string table -*/ -static TString *newshrstr (lua_State *L, const char *str, size_t l, - unsigned int h) { - GCObject **list; /* (pointer to) list where it will be inserted */ - stringtable *tb = &G(L)->strt; - TString *s; - if (tb->nuse >= cast(lu_int32, tb->size) && tb->size <= MAX_INT/2) - luaS_resize(L, tb->size*2); /* too crowded */ - list = &tb->hash[lmod(h, tb->size)]; - s = createstrobj(L, str, l, LUA_TSHRSTR, h, list); - tb->nuse++; - return s; -} - - -/* -** checks whether short string exists and reuses it or creates a new one -*/ -static TString *internshrstr (lua_State *L, const char *str, size_t l) { - GCObject *o; - global_State *g = G(L); - unsigned int h = luaS_hash(str, l, g->seed); - for (o = g->strt.hash[lmod(h, g->strt.size)]; - o != NULL; - o = gch(o)->next) { - TString *ts = rawgco2ts(o); - if (h == ts->tsv.hash && - l == ts->tsv.len && - (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) { - if (isdead(G(L), o)) /* string is dead (but was not collected yet)? */ - changewhite(o); /* resurrect it */ - return ts; - } - } - return newshrstr(L, str, l, h); /* not found; create a new string */ -} - - -/* -** new string (with explicit length) -*/ -TString *luaS_newlstr (lua_State *L, const char *str, size_t l) { - if (l <= LUAI_MAXSHORTLEN) /* short string? 
*/ - return internshrstr(L, str, l); - else { - if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char)) - luaM_toobig(L); - return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL); - } -} - - -/* -** new zero-terminated string -*/ -TString *luaS_new (lua_State *L, const char *str) { - return luaS_newlstr(L, str, strlen(str)); -} - - -Udata *luaS_newudata (lua_State *L, size_t s, Table *e) { - Udata *u; - if (s > MAX_SIZET - sizeof(Udata)) - luaM_toobig(L); - u = &luaC_newobj(L, LUA_TUSERDATA, sizeof(Udata) + s, NULL, 0)->u; - u->uv.len = s; - u->uv.metatable = NULL; - u->uv.env = e; - return u; -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c +++ /dev/null @@ -1,1050 +0,0 @@ -/* -** $Id: lstrlib.c,v 1.178.1.1 2013/04/12 18:48:47 roberto Exp $ -** Standard library for string operations and pattern-matching -** See Copyright Notice in lua.h -*/ - - -#include -#include - -#define lstrlib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - - -/* -** maximum number of captures that a pattern can do during -** pattern-matching. This limit is arbitrary. -*/ -#if !defined(LUA_MAXCAPTURES) -#define LUA_MAXCAPTURES 32 -#endif - - -/* macro to `unsign' a character */ -#define uchar(c) ((unsigned char)(c)) - -/* - * PATCHED: add missing character macros. - */ -#ifdef illumos -#define tolower(C) (((C) >= 'A' && (C) <= 'Z') ? (C) - 'A' + 'a' : (C)) -#define toupper(C) (((C) >= 'a' && (C) <= 'z') ? (C) - 'a' + 'A': (C)) -#define iscntrl(C) ((((C) >= 0) && ((C) <= 0x1f)) || ((C) == 0x7f)) -#else -#define isalnum(C) (isalpha(C) || isdigit(C)) -#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f) -#endif -#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E) -#define ispunct(C) (((C) >= 0x21 && (C) <= 0x2F) || \ - ((C) >= 0x3A && (C) <= 0x40) || \ - ((C) >= 0x5B && (C) <= 0x60) || \ - ((C) >= 0x7B && (C) <= 0x7E)) - -/* - * The provided version of sprintf returns a char *, but str_format expects - * it to return the number of characters printed. This version has the expected - * behavior. - */ -static size_t str_sprintf(char *buf, const char *fmt, ...) 
{ - va_list args; - size_t len; - - va_start(args, fmt); - len = vsnprintf(buf, INT_MAX, fmt, args); - va_end(args); - - return len; -} - - -static int str_len (lua_State *L) { - size_t l; - luaL_checklstring(L, 1, &l); - lua_pushinteger(L, (lua_Integer)l); - return 1; -} - - -/* translate a relative string position: negative means back from end */ -static size_t posrelat (ptrdiff_t pos, size_t len) { - if (pos >= 0) return (size_t)pos; - else if (0u - (size_t)pos > len) return 0; - else return len - ((size_t)-pos) + 1; -} - - -static int str_sub (lua_State *L) { - size_t l; - const char *s = luaL_checklstring(L, 1, &l); - size_t start = posrelat(luaL_checkinteger(L, 2), l); - size_t end = posrelat(luaL_optinteger(L, 3, -1), l); - if (start < 1) start = 1; - if (end > l) end = l; - if (start <= end) - lua_pushlstring(L, s + start - 1, end - start + 1); - else lua_pushliteral(L, ""); - return 1; -} - - -static int str_reverse (lua_State *L) { - size_t l, i; - luaL_Buffer b; - const char *s = luaL_checklstring(L, 1, &l); - char *p = luaL_buffinitsize(L, &b, l); - for (i = 0; i < l; i++) - p[i] = s[l - i - 1]; - luaL_pushresultsize(&b, l); - return 1; -} - - -static int str_lower (lua_State *L) { - size_t l; - size_t i; - luaL_Buffer b; - const char *s = luaL_checklstring(L, 1, &l); - char *p = luaL_buffinitsize(L, &b, l); - for (i=0; i> 1) - -static int str_rep (lua_State *L) { - size_t l, lsep; - const char *s = luaL_checklstring(L, 1, &l); - int n = luaL_checkint(L, 2); - const char *sep = luaL_optlstring(L, 3, "", &lsep); - if (n <= 0) lua_pushliteral(L, ""); - else if (l + lsep < l || l + lsep >= MAXSIZE / n) /* may overflow? */ - return luaL_error(L, "resulting string too large"); - else { - size_t totallen = n * l + (n - 1) * lsep; - luaL_Buffer b; - char *p = luaL_buffinitsize(L, &b, totallen); - while (n-- > 1) { /* first n-1 copies (followed by separator) */ - memcpy(p, s, l * sizeof(char)); p += l; - if (lsep > 0) { /* avoid empty 'memcpy' (may be expensive) */ - memcpy(p, sep, lsep * sizeof(char)); p += lsep; - } - } - memcpy(p, s, l * sizeof(char)); /* last copy (not followed by separator) */ - luaL_pushresultsize(&b, totallen); - } - return 1; -} - - -static int str_byte (lua_State *L) { - size_t l; - const char *s = luaL_checklstring(L, 1, &l); - size_t posi = posrelat(luaL_optinteger(L, 2, 1), l); - size_t pose = posrelat(luaL_optinteger(L, 3, posi), l); - int n, i; - if (posi < 1) posi = 1; - if (pose > l) pose = l; - if (posi > pose) return 0; /* empty interval; return no values */ - n = (int)(pose - posi + 1); - if (posi + n <= pose) /* (size_t -> int) overflow? */ - return luaL_error(L, "string slice too long"); - luaL_checkstack(L, n, "string slice too long"); - for (i=0; i= ms->level || ms->capture[l].len == CAP_UNFINISHED) - return luaL_error(ms->L, "invalid capture index %%%d", l + 1); - return l; -} - - -static int capture_to_close (MatchState *ms) { - int level = ms->level; - for (level--; level>=0; level--) - if (ms->capture[level].len == CAP_UNFINISHED) return level; - return luaL_error(ms->L, "invalid pattern capture"); -} - - -static const char *classend (MatchState *ms, const char *p) { - switch (*p++) { - case L_ESC: { - if (p == ms->p_end) - luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")"); - return p+1; - } - case '[': { - if (*p == '^') p++; - do { /* look for a `]' */ - if (p == ms->p_end) - luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")"); - if (*(p++) == L_ESC && p < ms->p_end) - p++; /* skip escapes (e.g. 
`%]') */ - } while (*p != ']'); - return p+1; - } - default: { - return p; - } - } -} - - -static int match_class (int c, int cl) { - int res; - switch (tolower(cl)) { - case 'a' : res = isalpha(c); break; - case 'c' : res = iscntrl(c); break; - case 'd' : res = isdigit(c); break; - case 'g' : res = isgraph(c); break; - case 'l' : res = islower(c); break; - case 'p' : res = ispunct(c); break; - case 's' : res = isspace(c); break; - case 'u' : res = isupper(c); break; - case 'w' : res = isalnum(c); break; - case 'x' : res = isxdigit(c); break; - case 'z' : res = (c == 0); break; /* deprecated option */ - default: return (cl == c); - } - return (islower(cl) ? res : !res); -} - - -static int matchbracketclass (int c, const char *p, const char *ec) { - int sig = 1; - if (*(p+1) == '^') { - sig = 0; - p++; /* skip the `^' */ - } - while (++p < ec) { - if (*p == L_ESC) { - p++; - if (match_class(c, uchar(*p))) - return sig; - } - else if ((*(p+1) == '-') && (p+2 < ec)) { - p+=2; - if (uchar(*(p-2)) <= c && c <= uchar(*p)) - return sig; - } - else if (uchar(*p) == c) return sig; - } - return !sig; -} - - -static int singlematch (MatchState *ms, const char *s, const char *p, - const char *ep) { - if (s >= ms->src_end) - return 0; - else { - int c = uchar(*s); - switch (*p) { - case '.': return 1; /* matches any char */ - case L_ESC: return match_class(c, uchar(*(p+1))); - case '[': return matchbracketclass(c, p, ep-1); - default: return (uchar(*p) == c); - } - } -} - - -static const char *matchbalance (MatchState *ms, const char *s, - const char *p) { - if (p >= ms->p_end - 1) - luaL_error(ms->L, "malformed pattern " - "(missing arguments to " LUA_QL("%%b") ")"); - if (*s != *p) return NULL; - else { - int b = *p; - int e = *(p+1); - int cont = 1; - while (++s < ms->src_end) { - if (*s == e) { - if (--cont == 0) return s+1; - } - else if (*s == b) cont++; - } - } - return NULL; /* string ends out of balance */ -} - - -static const char *max_expand (MatchState *ms, const char *s, - const char *p, const char *ep) { - ptrdiff_t i = 0; /* counts maximum expand for item */ - while (singlematch(ms, s + i, p, ep)) - i++; - /* keeps trying to match with the maximum repetitions */ - while (i>=0) { - const char *res = match(ms, (s+i), ep+1); - if (res) return res; - i--; /* else didn't match; reduce 1 repetition to try again */ - } - return NULL; -} - - -static const char *min_expand (MatchState *ms, const char *s, - const char *p, const char *ep) { - for (;;) { - const char *res = match(ms, s, ep+1); - if (res != NULL) - return res; - else if (singlematch(ms, s, p, ep)) - s++; /* try with one more repetition */ - else return NULL; - } -} - - -static const char *start_capture (MatchState *ms, const char *s, - const char *p, int what) { - const char *res; - int level = ms->level; - if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures"); - ms->capture[level].init = s; - ms->capture[level].len = what; - ms->level = level+1; - if ((res=match(ms, s, p)) == NULL) /* match failed? */ - ms->level--; /* undo capture */ - return res; -} - - -static const char *end_capture (MatchState *ms, const char *s, - const char *p) { - int l = capture_to_close(ms); - const char *res; - ms->capture[l].len = s - ms->capture[l].init; /* close capture */ - if ((res = match(ms, s, p)) == NULL) /* match failed? 
*/ - ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ - return res; -} - - -static const char *match_capture (MatchState *ms, const char *s, int l) { - size_t len; - l = check_capture(ms, l); - len = ms->capture[l].len; - if ((size_t)(ms->src_end-s) >= len && - memcmp(ms->capture[l].init, s, len) == 0) - return s+len; - else return NULL; -} - - -static const char *match (MatchState *ms, const char *s, const char *p) { - if (ms->matchdepth-- == 0) - luaL_error(ms->L, "pattern too complex"); - init: /* using goto's to optimize tail recursion */ - if (p != ms->p_end) { /* end of pattern? */ - switch (*p) { - case '(': { /* start capture */ - if (*(p + 1) == ')') /* position capture? */ - s = start_capture(ms, s, p + 2, CAP_POSITION); - else - s = start_capture(ms, s, p + 1, CAP_UNFINISHED); - break; - } - case ')': { /* end capture */ - s = end_capture(ms, s, p + 1); - break; - } - case '$': { - if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ - goto dflt; /* no; go to default */ - s = (s == ms->src_end) ? s : NULL; /* check end of string */ - break; - } - case L_ESC: { /* escaped sequences not in the format class[*+?-]? */ - switch (*(p + 1)) { - case 'b': { /* balanced string? */ - s = matchbalance(ms, s, p + 2); - if (s != NULL) { - p += 4; goto init; /* return match(ms, s, p + 4); */ - } /* else fail (s == NULL) */ - break; - } - case 'f': { /* frontier? */ - const char *ep; char previous; - p += 2; - if (*p != '[') - luaL_error(ms->L, "missing " LUA_QL("[") " after " - LUA_QL("%%f") " in pattern"); - ep = classend(ms, p); /* points to what is next */ - previous = (s == ms->src_init) ? '\0' : *(s - 1); - if (!matchbracketclass(uchar(previous), p, ep - 1) && - matchbracketclass(uchar(*s), p, ep - 1)) { - p = ep; goto init; /* return match(ms, s, ep); */ - } - s = NULL; /* match failed */ - break; - } - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case '8': case '9': { /* capture results (%0-%9)? */ - s = match_capture(ms, s, uchar(*(p + 1))); - if (s != NULL) { - p += 2; goto init; /* return match(ms, s, p + 2) */ - } - break; - } - default: goto dflt; - } - break; - } - default: dflt: { /* pattern class plus optional suffix */ - const char *ep = classend(ms, p); /* points to optional suffix */ - /* does not match at least once? */ - if (!singlematch(ms, s, p, ep)) { - if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? 
*/ - p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ - } - else /* '+' or no suffix */ - s = NULL; /* fail */ - } - else { /* matched once */ - switch (*ep) { /* handle optional suffix */ - case '?': { /* optional */ - const char *res; - if ((res = match(ms, s + 1, ep + 1)) != NULL) - s = res; - else { - p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */ - } - break; - } - case '+': /* 1 or more repetitions */ - s++; /* 1 match already done */ - /* FALLTHROUGH */ - case '*': /* 0 or more repetitions */ - s = max_expand(ms, s, p, ep); - break; - case '-': /* 0 or more repetitions (minimum) */ - s = min_expand(ms, s, p, ep); - break; - default: /* no suffix */ - s++; p = ep; goto init; /* return match(ms, s + 1, ep); */ - } - } - break; - } - } - } - ms->matchdepth++; - return s; -} - - - -static const char *lmemfind (const char *s1, size_t l1, - const char *s2, size_t l2) { - if (l2 == 0) return s1; /* empty strings are everywhere */ - else if (l2 > l1) return NULL; /* avoids a negative `l1' */ - else { - const char *init; /* to search for a `*s2' inside `s1' */ - l2--; /* 1st char will be checked by `memchr' */ - l1 = l1-l2; /* `s2' cannot be found after that */ - while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) { - init++; /* 1st char is already checked */ - if (memcmp(init, s2+1, l2) == 0) - return init-1; - else { /* correct `l1' and `s1' to try again */ - l1 -= init-s1; - s1 = init; - } - } - return NULL; /* not found */ - } -} - - -static void push_onecapture (MatchState *ms, int i, const char *s, - const char *e) { - if (i >= ms->level) { - if (i == 0) /* ms->level == 0, too */ - lua_pushlstring(ms->L, s, e - s); /* add whole match */ - else - luaL_error(ms->L, "invalid capture index"); - } - else { - ptrdiff_t l = ms->capture[i].len; - if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture"); - if (l == CAP_POSITION) - lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1); - else - lua_pushlstring(ms->L, ms->capture[i].init, l); - } -} - - -static int push_captures (MatchState *ms, const char *s, const char *e) { - int i; - int nlevels = (ms->level == 0 && s) ? 1 : ms->level; - luaL_checkstack(ms->L, nlevels, "too many captures"); - for (i = 0; i < nlevels; i++) - push_onecapture(ms, i, s, e); - return nlevels; /* number of strings pushed */ -} - - -/* check whether pattern has no special characters */ -static int nospecials (const char *p, size_t l) { - size_t upto = 0; - do { - if (strpbrk(p + upto, SPECIALS)) - return 0; /* pattern has a special character */ - upto += strlen(p + upto) + 1; /* may have more after \0 */ - } while (upto <= l); - return 1; /* no special chars found */ -} - - -static int str_find_aux (lua_State *L, int find) { - size_t ls, lp; - const char *s = luaL_checklstring(L, 1, &ls); - const char *p = luaL_checklstring(L, 2, &lp); - size_t init = posrelat(luaL_optinteger(L, 3, 1), ls); - if (init < 1) init = 1; - else if (init > ls + 1) { /* start after string's end? */ - lua_pushnil(L); /* cannot find anything */ - return 1; - } - /* explicit request or no special characters? 
*/ - if (find && (lua_toboolean(L, 4) || nospecials(p, lp))) { - /* do a plain search */ - const char *s2 = lmemfind(s + init - 1, ls - init + 1, p, lp); - if (s2) { - lua_pushinteger(L, s2 - s + 1); - lua_pushinteger(L, s2 - s + lp); - return 2; - } - } - else { - MatchState ms; - const char *s1 = s + init - 1; - int anchor = (*p == '^'); - if (anchor) { - p++; lp--; /* skip anchor character */ - } - ms.L = L; - ms.matchdepth = MAXCCALLS; - ms.src_init = s; - ms.src_end = s + ls; - ms.p_end = p + lp; - do { - const char *res; - ms.level = 0; - lua_assert(ms.matchdepth == MAXCCALLS); - if ((res=match(&ms, s1, p)) != NULL) { - if (find) { - lua_pushinteger(L, s1 - s + 1); /* start */ - lua_pushinteger(L, res - s); /* end */ - return push_captures(&ms, NULL, 0) + 2; - } - else - return push_captures(&ms, s1, res); - } - } while (s1++ < ms.src_end && !anchor); - } - lua_pushnil(L); /* not found */ - return 1; -} - - -static int str_find (lua_State *L) { - return str_find_aux(L, 1); -} - - -static int str_match (lua_State *L) { - return str_find_aux(L, 0); -} - - -static int gmatch_aux (lua_State *L) { - MatchState ms; - size_t ls, lp; - const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls); - const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp); - const char *src; - ms.L = L; - ms.matchdepth = MAXCCALLS; - ms.src_init = s; - ms.src_end = s+ls; - ms.p_end = p + lp; - for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); - src <= ms.src_end; - src++) { - const char *e; - ms.level = 0; - lua_assert(ms.matchdepth == MAXCCALLS); - if ((e = match(&ms, src, p)) != NULL) { - lua_Integer newstart = e-s; - if (e == src) newstart++; /* empty match? go at least one position */ - lua_pushinteger(L, newstart); - lua_replace(L, lua_upvalueindex(3)); - return push_captures(&ms, src, e); - } - } - return 0; /* not found */ -} - - -static int str_gmatch (lua_State *L) { - luaL_checkstring(L, 1); - luaL_checkstring(L, 2); - lua_settop(L, 2); - lua_pushinteger(L, 0); - lua_pushcclosure(L, gmatch_aux, 3); - return 1; -} - - -static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, - const char *e) { - size_t l, i; - const char *news = lua_tolstring(ms->L, 3, &l); - for (i = 0; i < l; i++) { - if (news[i] != L_ESC) - luaL_addchar(b, news[i]); - else { - i++; /* skip ESC */ - if (!isdigit(uchar(news[i]))) { - if (news[i] != L_ESC) - luaL_error(ms->L, "invalid use of " LUA_QL("%c") - " in replacement string", L_ESC); - luaL_addchar(b, news[i]); - } - else if (news[i] == '0') - luaL_addlstring(b, s, e - s); - else { - push_onecapture(ms, news[i] - '1', s, e); - luaL_addvalue(b); /* add capture to accumulated result */ - } - } - } -} - - -static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, - const char *e, int tr) { - lua_State *L = ms->L; - switch (tr) { - case LUA_TFUNCTION: { - int n; - lua_pushvalue(L, 3); - n = push_captures(ms, s, e); - lua_call(L, n, 1); - break; - } - case LUA_TTABLE: { - push_onecapture(ms, 0, s, e); - lua_gettable(L, 3); - break; - } - default: { /* LUA_TNUMBER or LUA_TSTRING */ - add_s(ms, b, s, e); - return; - } - } - if (!lua_toboolean(L, -1)) { /* nil or false? 
*/ - lua_pop(L, 1); - lua_pushlstring(L, s, e - s); /* keep original text */ - } - else if (!lua_isstring(L, -1)) - luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1)); - luaL_addvalue(b); /* add result to accumulator */ -} - - -static int str_gsub (lua_State *L) { - size_t srcl, lp; - const char *src = luaL_checklstring(L, 1, &srcl); - const char *p = luaL_checklstring(L, 2, &lp); - int tr = lua_type(L, 3); - size_t max_s = luaL_optinteger(L, 4, srcl+1); - int anchor = (*p == '^'); - size_t n = 0; - MatchState ms; - luaL_Buffer b; - luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || - tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, - "string/function/table expected"); - luaL_buffinit(L, &b); - if (anchor) { - p++; lp--; /* skip anchor character */ - } - ms.L = L; - ms.matchdepth = MAXCCALLS; - ms.src_init = src; - ms.src_end = src+srcl; - ms.p_end = p + lp; - while (n < max_s) { - const char *e; - ms.level = 0; - lua_assert(ms.matchdepth == MAXCCALLS); - e = match(&ms, src, p); - if (e) { - n++; - add_value(&ms, &b, src, e, tr); - } - if (e && e>src) /* non empty match? */ - src = e; /* skip it */ - else if (src < ms.src_end) - luaL_addchar(&b, *src++); - else break; - if (anchor) break; - } - luaL_addlstring(&b, src, ms.src_end-src); - luaL_pushresult(&b); - lua_pushinteger(L, n); /* number of substitutions */ - return 2; -} - -/* }====================================================== */ - - - -/* -** {====================================================== -** STRING FORMAT -** ======================================================= -*/ - -/* -** LUA_INTFRMLEN is the length modifier for integer conversions in -** 'string.format'; LUA_INTFRM_T is the integer type corresponding to -** the previous length -*/ -#if !defined(LUA_INTFRMLEN) /* { */ -#if defined(LUA_USE_LONGLONG) - -#define LUA_INTFRMLEN "ll" -#define LUA_INTFRM_T long long - -#else - -#define LUA_INTFRMLEN "l" -#define LUA_INTFRM_T long - -#endif -#endif /* } */ - - -/* -** LUA_FLTFRMLEN is the length modifier for float conversions in -** 'string.format'; LUA_FLTFRM_T is the float type corresponding to -** the previous length -*/ -#if !defined(LUA_FLTFRMLEN) - -#define LUA_FLTFRMLEN "" -#define LUA_FLTFRM_T double - -#endif - - -/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */ -#define MAX_ITEM 512 -/* valid flags in a format specification */ -#define FLAGS "-+ #0" -/* -** maximum size of each format specification (such as '%-099.99d') -** (+10 accounts for %99.99x plus margin of error) -*/ -#define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10) - - -static void addquoted (lua_State *L, luaL_Buffer *b, int arg) { - size_t l; - const char *s = luaL_checklstring(L, arg, &l); - luaL_addchar(b, '"'); - while (l--) { - if (*s == '"' || *s == '\\' || *s == '\n') { - luaL_addchar(b, '\\'); - luaL_addchar(b, *s); - } - else if (*s == '\0' || iscntrl(uchar(*s))) { - char buff[10]; - if (!isdigit(uchar(*(s+1)))) - sprintf(buff, "\\%d", (int)uchar(*s)); - else - sprintf(buff, "\\%03d", (int)uchar(*s)); - luaL_addstring(b, buff); - } - else - luaL_addchar(b, *s); - s++; - } - luaL_addchar(b, '"'); -} - -static const char *scanformat (lua_State *L, const char *strfrmt, char *form) { - const char *p = strfrmt; - while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++; /* skip flags */ - if ((size_t)(p - strfrmt) >= sizeof(FLAGS)/sizeof(char)) - luaL_error(L, "invalid format (repeated flags)"); - if (isdigit(uchar(*p))) p++; /* skip width */ - if (isdigit(uchar(*p))) p++; /* (2 digits 
at most) */ - if (*p == '.') { - p++; - if (isdigit(uchar(*p))) p++; /* skip precision */ - if (isdigit(uchar(*p))) p++; /* (2 digits at most) */ - } - if (isdigit(uchar(*p))) - luaL_error(L, "invalid format (width or precision too long)"); - *(form++) = '%'; - memcpy(form, strfrmt, (p - strfrmt + 1) * sizeof(char)); - form += p - strfrmt + 1; - *form = '\0'; - return p; -} - - -/* -** add length modifier into formats -*/ -static void addlenmod (char *form, const char *lenmod) { - size_t l = strlen(form); - size_t lm = strlen(lenmod); - char spec = form[l - 1]; - strcpy(form + l - 1, lenmod); - form[l + lm - 1] = spec; - form[l + lm] = '\0'; -} - - -static int str_format (lua_State *L) { - int top = lua_gettop(L); - int arg = 1; - size_t sfl; - const char *strfrmt = luaL_checklstring(L, arg, &sfl); - const char *strfrmt_end = strfrmt+sfl; - luaL_Buffer b; - luaL_buffinit(L, &b); - while (strfrmt < strfrmt_end) { - if (*strfrmt != L_ESC) - luaL_addchar(&b, *strfrmt++); - else if (*++strfrmt == L_ESC) - luaL_addchar(&b, *strfrmt++); /* %% */ - else { /* format item */ - char form[MAX_FORMAT]; /* to store the format (`%...') */ - char *buff = luaL_prepbuffsize(&b, MAX_ITEM); /* to put formatted item */ - int nb = 0; /* number of bytes in added item */ - if (++arg > top) - luaL_argerror(L, arg, "no value"); - strfrmt = scanformat(L, strfrmt, form); - switch (*strfrmt++) { - case 'c': { - nb = str_sprintf(buff, form, luaL_checkint(L, arg)); - break; - } - case 'd': case 'i': { - lua_Number n = luaL_checknumber(L, arg); - LUA_INTFRM_T ni = (LUA_INTFRM_T)n; - lua_Number diff = n - (lua_Number)ni; - luaL_argcheck(L, -1 < diff && diff < 1, arg, - "not a number in proper range"); - addlenmod(form, LUA_INTFRMLEN); - nb = str_sprintf(buff, form, ni); - break; - } - case 'o': case 'u': case 'x': case 'X': { - lua_Number n = luaL_checknumber(L, arg); - unsigned LUA_INTFRM_T ni = (unsigned LUA_INTFRM_T)n; - lua_Number diff = n - (lua_Number)ni; - luaL_argcheck(L, -1 < diff && diff < 1, arg, - "not a non-negative number in proper range"); - addlenmod(form, LUA_INTFRMLEN); - nb = str_sprintf(buff, form, ni); - break; - } -#if defined(LUA_USE_FLOAT_FORMATS) - case 'e': case 'E': case 'f': -#if defined(LUA_USE_AFORMAT) - case 'a': case 'A': -#endif - case 'g': case 'G': { - addlenmod(form, LUA_FLTFRMLEN); - nb = str_sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg)); - break; - } -#endif - case 'q': { - addquoted(L, &b, arg); - break; - } - case 's': { - size_t l; - const char *s = luaL_tolstring(L, arg, &l); - if (!strchr(form, '.') && l >= 100) { - /* no precision and string is too long to be formatted; - keep original string */ - luaL_addvalue(&b); - break; - } - else { - nb = str_sprintf(buff, form, s); - lua_pop(L, 1); /* remove result from 'luaL_tolstring' */ - break; - } - } - default: { /* also treat cases `pnLlh' */ - return luaL_error(L, "invalid option " LUA_QL("%%%c") " to " - LUA_QL("format"), *(strfrmt - 1)); - } - } - luaL_addsize(&b, nb); - } - } - luaL_pushresult(&b); - return 1; -} - -/* }====================================================== */ - - -static const luaL_Reg strlib[] = { - {"byte", str_byte}, - {"char", str_char}, - {"dump", str_dump}, - {"find", str_find}, - {"format", str_format}, - {"gmatch", str_gmatch}, - {"gsub", str_gsub}, - {"len", str_len}, - {"lower", str_lower}, - {"match", str_match}, - {"rep", str_rep}, - {"reverse", str_reverse}, - {"sub", str_sub}, - {"upper", str_upper}, - {NULL, NULL} -}; - - -static void createmetatable (lua_State *L) { - 
lua_createtable(L, 0, 1); /* table to be metatable for strings */ - lua_pushliteral(L, ""); /* dummy string */ - lua_pushvalue(L, -2); /* copy table */ - lua_setmetatable(L, -2); /* set table as metatable for strings */ - lua_pop(L, 1); /* pop dummy string */ - lua_pushvalue(L, -2); /* get string library */ - lua_setfield(L, -2, "__index"); /* metatable.__index = string */ - lua_pop(L, 1); /* pop metatable */ -} - - -/* -** Open string library -*/ -LUAMOD_API int luaopen_string (lua_State *L) { - luaL_newlib(L, strlib); - createmetatable(L); - return 1; -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.h +++ /dev/null @@ -1,45 +0,0 @@ -/* -** $Id: ltable.h,v 2.16.1.2 2013/08/30 15:49:41 roberto Exp $ -** Lua tables (hash) -** See Copyright Notice in lua.h -*/ - -#ifndef ltable_h -#define ltable_h - -#include "lobject.h" - - -#define gnode(t,i) (&(t)->node[i]) -#define gkey(n) (&(n)->i_key.tvk) -#define gval(n) (&(n)->i_val) -#define gnext(n) ((n)->i_key.nk.next) - -#define invalidateTMcache(t) ((t)->flags = 0) - -/* returns the key, given the value of a table entry */ -#define keyfromval(v) \ - (gkey(cast(Node *, cast(char *, (v)) - offsetof(Node, i_val)))) - - -LUAI_FUNC const TValue *luaH_getint (Table *t, int key); -LUAI_FUNC void luaH_setint (lua_State *L, Table *t, int key, TValue *value); -LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key); -LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key); -LUAI_FUNC TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key); -LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key); -LUAI_FUNC Table *luaH_new (lua_State *L); -LUAI_FUNC void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize); -LUAI_FUNC void luaH_resizearray (lua_State *L, Table *t, int nasize); -LUAI_FUNC void luaH_free (lua_State *L, Table *t); -LUAI_FUNC int luaH_next (lua_State *L, Table *t, StkId key); -LUAI_FUNC int luaH_getn (Table *t); - - -#if defined(LUA_DEBUG) -LUAI_FUNC Node *luaH_mainposition (const Table *t, const TValue *key); -LUAI_FUNC int luaH_isdummy (Node *n); -#endif - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c +++ /dev/null @@ -1,589 +0,0 @@ -/* -** $Id: ltable.c,v 2.72.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua tables (hash) -** See Copyright Notice in lua.h -*/ - - -/* -** Implementation of tables (aka arrays, objects, or hash tables). -** Tables keep its elements in two parts: an array part and a hash part. -** Non-negative integer keys are all candidates to be kept in the array -** part. The actual size of the array is the largest `n' such that at -** least half the slots between 0 and n are in use. -** Hash uses a mix of chained scatter table with Brent's variation. -** A main invariant of these tables is that, if an element is not -** in its main position (i.e. the `original' position that its hash gives -** to it), then the colliding element is in its own main position. -** Hence even when the load factor reaches 100%, performance remains good. 
-*/ - -#include - -#define ltable_c -#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lgc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "lvm.h" - - -/* -** max size of array part is 2^MAXBITS -*/ -#if LUAI_BITSINT >= 32 -#define MAXBITS 30 -#else -#define MAXBITS (LUAI_BITSINT-2) -#endif - -#define MAXASIZE (1 << MAXBITS) - - -#define hashpow2(t,n) (gnode(t, lmod((n), sizenode(t)))) - -#define hashstr(t,str) hashpow2(t, (str)->tsv.hash) -#define hashboolean(t,p) hashpow2(t, p) - - -/* -** for some types, it is better to avoid modulus by power of 2, as -** they tend to have many 2 factors. -*/ -#define hashmod(t,n) (gnode(t, ((n) % ((sizenode(t)-1)|1)))) - - -#define hashpointer(t,p) hashmod(t, IntPoint(p)) - - -#define dummynode (&dummynode_) - -#define isdummy(n) ((n) == dummynode) - -static const Node dummynode_ = { - {NILCONSTANT}, /* value */ - {{NILCONSTANT, NULL}} /* key */ -}; - - -/* -** hash for lua_Numbers -*/ -static Node *hashnum (const Table *t, lua_Number n) { - int i; - luai_hashnum(i, n); - if (i < 0) { - if (cast(unsigned int, i) == 0u - i) /* use unsigned to avoid overflows */ - i = 0; /* handle INT_MIN */ - i = -i; /* must be a positive value */ - } - return hashmod(t, i); -} - - - -/* -** returns the `main' position of an element in a table (that is, the index -** of its hash value) -*/ -static Node *mainposition (const Table *t, const TValue *key) { - switch (ttype(key)) { - case LUA_TNUMBER: - return hashnum(t, nvalue(key)); - case LUA_TLNGSTR: { - TString *s = rawtsvalue(key); - if (s->tsv.extra == 0) { /* no hash? */ - s->tsv.hash = luaS_hash(getstr(s), s->tsv.len, s->tsv.hash); - s->tsv.extra = 1; /* now it has its hash */ - } - return hashstr(t, rawtsvalue(key)); - } - case LUA_TSHRSTR: - return hashstr(t, rawtsvalue(key)); - case LUA_TBOOLEAN: - return hashboolean(t, bvalue(key)); - case LUA_TLIGHTUSERDATA: - return hashpointer(t, pvalue(key)); - case LUA_TLCF: - return hashpointer(t, fvalue(key)); - default: - return hashpointer(t, gcvalue(key)); - } -} - - -/* -** returns the index for `key' if `key' is an appropriate key to live in -** the array part of the table, -1 otherwise. -*/ -static int arrayindex (const TValue *key) { - if (ttisnumber(key)) { - lua_Number n = nvalue(key); - int k; - lua_number2int(k, n); - if (luai_numeq(cast_num(k), n)) - return k; - } - return -1; /* `key' did not match some condition */ -} - - -/* -** returns the index of a `key' for table traversals. First goes all -** elements in the array part, then elements in the hash part. The -** beginning of a traversal is signaled by -1. -*/ -static int findindex (lua_State *L, Table *t, StkId key) { - int i; - if (ttisnil(key)) return -1; /* first iteration */ - i = arrayindex(key); - if (0 < i && i <= t->sizearray) /* is `key' inside array part? 
*/ - return i-1; /* yes; that's the index (corrected to C) */ - else { - Node *n = mainposition(t, key); - for (;;) { /* check whether `key' is somewhere in the chain */ - /* key may be dead already, but it is ok to use it in `next' */ - if (luaV_rawequalobj(gkey(n), key) || - (ttisdeadkey(gkey(n)) && iscollectable(key) && - deadvalue(gkey(n)) == gcvalue(key))) { - i = cast_int(n - gnode(t, 0)); /* key index in hash table */ - /* hash elements are numbered after array ones */ - return i + t->sizearray; - } - else n = gnext(n); - if (n == NULL) - luaG_runerror(L, "invalid key to " LUA_QL("next")); /* key not found */ - } - } -} - - -int luaH_next (lua_State *L, Table *t, StkId key) { - int i = findindex(L, t, key); /* find original element */ - for (i++; i < t->sizearray; i++) { /* try first array part */ - if (!ttisnil(&t->array[i])) { /* a non-nil value? */ - setnvalue(key, cast_num(i+1)); - setobj2s(L, key+1, &t->array[i]); - return 1; - } - } - for (i -= t->sizearray; i < sizenode(t); i++) { /* then hash part */ - if (!ttisnil(gval(gnode(t, i)))) { /* a non-nil value? */ - setobj2s(L, key, gkey(gnode(t, i))); - setobj2s(L, key+1, gval(gnode(t, i))); - return 1; - } - } - return 0; /* no more elements */ -} - - -/* -** {============================================================= -** Rehash -** ============================================================== -*/ - - -static int computesizes (int nums[], int *narray) { - int i; - int twotoi; /* 2^i */ - int a = 0; /* number of elements smaller than 2^i */ - int na = 0; /* number of elements to go to array part */ - int n = 0; /* optimal size for array part */ - for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) { - if (nums[i] > 0) { - a += nums[i]; - if (a > twotoi/2) { /* more than half elements present? */ - n = twotoi; /* optimal size (till now) */ - na = a; /* all elements smaller than n will go to array part */ - } - } - if (a == *narray) break; /* all elements already counted */ - } - *narray = n; - lua_assert(*narray/2 <= na && na <= *narray); - return na; -} - - -static int countint (const TValue *key, int *nums) { - int k = arrayindex(key); - if (0 < k && k <= MAXASIZE) { /* is `key' an appropriate array index? 
*/ - nums[luaO_ceillog2(k)]++; /* count as such */ - return 1; - } - else - return 0; -} - - -static int numusearray (const Table *t, int *nums) { - int lg; - int ttlg; /* 2^lg */ - int ause = 0; /* summation of `nums' */ - int i = 1; /* count to traverse all array keys */ - for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) { /* for each slice */ - int lc = 0; /* counter */ - int lim = ttlg; - if (lim > t->sizearray) { - lim = t->sizearray; /* adjust upper limit */ - if (i > lim) - break; /* no more elements to count */ - } - /* count elements in range (2^(lg-1), 2^lg] */ - for (; i <= lim; i++) { - if (!ttisnil(&t->array[i-1])) - lc++; - } - nums[lg] += lc; - ause += lc; - } - return ause; -} - - -static int numusehash (const Table *t, int *nums, int *pnasize) { - int totaluse = 0; /* total number of elements */ - int ause = 0; /* summation of `nums' */ - int i = sizenode(t); - while (i--) { - Node *n = &t->node[i]; - if (!ttisnil(gval(n))) { - ause += countint(gkey(n), nums); - totaluse++; - } - } - *pnasize += ause; - return totaluse; -} - - -static void setarrayvector (lua_State *L, Table *t, int size) { - int i; - luaM_reallocvector(L, t->array, t->sizearray, size, TValue); - for (i=t->sizearray; iarray[i]); - t->sizearray = size; -} - - -static void setnodevector (lua_State *L, Table *t, int size) { - int lsize; - if (size == 0) { /* no elements to hash part? */ - t->node = cast(Node *, dummynode); /* use common `dummynode' */ - lsize = 0; - } - else { - int i; - lsize = luaO_ceillog2(size); - if (lsize > MAXBITS) - luaG_runerror(L, "table overflow"); - size = twoto(lsize); - t->node = luaM_newvector(L, size, Node); - for (i=0; ilsizenode = cast_byte(lsize); - t->lastfree = gnode(t, size); /* all positions are free */ -} - - -void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize) { - int i; - int oldasize = t->sizearray; - int oldhsize = t->lsizenode; - Node *nold = t->node; /* save old hash ... */ - if (nasize > oldasize) /* array part must grow? */ - setarrayvector(L, t, nasize); - /* create new hash part with appropriate size */ - setnodevector(L, t, nhsize); - if (nasize < oldasize) { /* array part must shrink? */ - t->sizearray = nasize; - /* re-insert elements from vanishing slice */ - for (i=nasize; iarray[i])) - luaH_setint(L, t, i + 1, &t->array[i]); - } - /* shrink array */ - luaM_reallocvector(L, t->array, oldasize, nasize, TValue); - } - /* re-insert elements from hash part */ - for (i = twoto(oldhsize) - 1; i >= 0; i--) { - Node *old = nold+i; - if (!ttisnil(gval(old))) { - /* doesn't need barrier/invalidate cache, as entry was - already present in the table */ - setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old)); - } - } - if (!isdummy(nold)) - luaM_freearray(L, nold, cast(size_t, twoto(oldhsize))); /* free old array */ -} - - -void luaH_resizearray (lua_State *L, Table *t, int nasize) { - int nsize = isdummy(t->node) ? 
0 : sizenode(t); - luaH_resize(L, t, nasize, nsize); -} - - -static void rehash (lua_State *L, Table *t, const TValue *ek) { - int nasize, na; - int nums[MAXBITS+1]; /* nums[i] = number of keys with 2^(i-1) < k <= 2^i */ - int i; - int totaluse; - for (i=0; i<=MAXBITS; i++) nums[i] = 0; /* reset counts */ - nasize = numusearray(t, nums); /* count keys in array part */ - totaluse = nasize; /* all those keys are integer keys */ - totaluse += numusehash(t, nums, &nasize); /* count keys in hash part */ - /* count extra key */ - nasize += countint(ek, nums); - totaluse++; - /* compute new size for array part */ - na = computesizes(nums, &nasize); - /* resize the table to new computed sizes */ - luaH_resize(L, t, nasize, totaluse - na); -} - - - -/* -** }============================================================= -*/ - - -Table *luaH_new (lua_State *L) { - Table *t = &luaC_newobj(L, LUA_TTABLE, sizeof(Table), NULL, 0)->h; - t->metatable = NULL; - t->flags = cast_byte(~0); - t->array = NULL; - t->sizearray = 0; - setnodevector(L, t, 0); - return t; -} - - -void luaH_free (lua_State *L, Table *t) { - if (!isdummy(t->node)) - luaM_freearray(L, t->node, cast(size_t, sizenode(t))); - luaM_freearray(L, t->array, t->sizearray); - luaM_free(L, t); -} - - -static Node *getfreepos (Table *t) { - while (t->lastfree > t->node) { - t->lastfree--; - if (ttisnil(gkey(t->lastfree))) - return t->lastfree; - } - return NULL; /* could not find a free place */ -} - - - -/* -** inserts a new key into a hash table; first, check whether key's main -** position is free. If not, check whether colliding node is in its main -** position or not: if it is not, move colliding node to an empty place and -** put new key in its main position; otherwise (colliding node is in its main -** position), new key goes to an empty position. -*/ -TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) { - Node *mp; - if (ttisnil(key)) luaG_runerror(L, "table index is nil"); - else if (ttisnumber(key) && luai_numisnan(L, nvalue(key))) - luaG_runerror(L, "table index is NaN"); - mp = mainposition(t, key); - if (!ttisnil(gval(mp)) || isdummy(mp)) { /* main position is taken? */ - Node *othern; - Node *n = getfreepos(t); /* get a free place */ - if (n == NULL) { /* cannot find a free place? */ - rehash(L, t, key); /* grow table */ - /* whatever called 'newkey' take care of TM cache and GC barrier */ - return luaH_set(L, t, key); /* insert key into grown table */ - } - lua_assert(!isdummy(n)); - othern = mainposition(t, gkey(mp)); - if (othern != mp) { /* is colliding node out of its main position? */ - /* yes; move colliding node into free position */ - while (gnext(othern) != mp) othern = gnext(othern); /* find previous */ - gnext(othern) = n; /* redo the chain with `n' in place of `mp' */ - *n = *mp; /* copy colliding node into free pos. 
(mp->next also goes) */ - gnext(mp) = NULL; /* now `mp' is free */ - setnilvalue(gval(mp)); - } - else { /* colliding node is in its own main position */ - /* new node will go into free position */ - gnext(n) = gnext(mp); /* chain new position */ - gnext(mp) = n; - mp = n; - } - } - setobj2t(L, gkey(mp), key); - luaC_barrierback(L, obj2gco(t), key); - lua_assert(ttisnil(gval(mp))); - return gval(mp); -} - - -/* -** search function for integers -*/ -const TValue *luaH_getint (Table *t, int key) { - /* (1 <= key && key <= t->sizearray) */ - if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray)) - return &t->array[key-1]; - else { - lua_Number nk = cast_num(key); - Node *n = hashnum(t, nk); - do { /* check whether `key' is somewhere in the chain */ - if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk)) - return gval(n); /* that's it */ - else n = gnext(n); - } while (n); - return luaO_nilobject; - } -} - - -/* -** search function for short strings -*/ -const TValue *luaH_getstr (Table *t, TString *key) { - Node *n = hashstr(t, key); - lua_assert(key->tsv.tt == LUA_TSHRSTR); - do { /* check whether `key' is somewhere in the chain */ - if (ttisshrstring(gkey(n)) && eqshrstr(rawtsvalue(gkey(n)), key)) - return gval(n); /* that's it */ - else n = gnext(n); - } while (n); - return luaO_nilobject; -} - - -/* -** main search function -*/ -const TValue *luaH_get (Table *t, const TValue *key) { - switch (ttype(key)) { - case LUA_TSHRSTR: return luaH_getstr(t, rawtsvalue(key)); - case LUA_TNIL: return luaO_nilobject; - case LUA_TNUMBER: { - int k; - lua_Number n = nvalue(key); - lua_number2int(k, n); - if (luai_numeq(cast_num(k), n)) /* index is int? */ - return luaH_getint(t, k); /* use specialized version */ - /* else go through */ - } - /* FALLTHROUGH */ - default: { - Node *n = mainposition(t, key); - do { /* check whether `key' is somewhere in the chain */ - if (luaV_rawequalobj(gkey(n), key)) - return gval(n); /* that's it */ - else n = gnext(n); - } while (n); - return luaO_nilobject; - } - } -} - - -/* -** beware: when using this function you probably need to check a GC -** barrier and invalidate the TM cache. -*/ -TValue *luaH_set (lua_State *L, Table *t, const TValue *key) { - const TValue *p = luaH_get(t, key); - if (p != luaO_nilobject) - return cast(TValue *, p); - else return luaH_newkey(L, t, key); -} - - -void luaH_setint (lua_State *L, Table *t, int key, TValue *value) { - const TValue *p = luaH_getint(t, key); - TValue *cell; - if (p != luaO_nilobject) - cell = cast(TValue *, p); - else { - TValue k; - setnvalue(&k, cast_num(key)); - cell = luaH_newkey(L, t, &k); - } - setobj2t(L, cell, value); -} - - -static int unbound_search (Table *t, unsigned int j) { - unsigned int i = j; /* i is zero or a present index */ - j++; - /* find `i' and `j' such that i is present and j is not */ - while (!ttisnil(luaH_getint(t, j))) { - i = j; - j *= 2; - if (j > cast(unsigned int, MAX_INT)) { /* overflow? */ - /* table was built with bad purposes: resort to linear search */ - i = 1; - while (!ttisnil(luaH_getint(t, i))) i++; - return i - 1; - } - } - /* now do a binary search between them */ - while (j - i > 1) { - unsigned int m = (i+j)/2; - if (ttisnil(luaH_getint(t, m))) j = m; - else i = m; - } - return i; -} - - -/* -** Try to find a boundary in table `t'. A `boundary' is an integer index -** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil). 
-*/ -int luaH_getn (Table *t) { - unsigned int j = t->sizearray; - if (j > 0 && ttisnil(&t->array[j - 1])) { - /* there is a boundary in the array part: (binary) search for it */ - unsigned int i = 0; - while (j - i > 1) { - unsigned int m = (i+j)/2; - if (ttisnil(&t->array[m - 1])) j = m; - else i = m; - } - return i; - } - /* else must find a boundary in hash part */ - else if (isdummy(t->node)) /* hash part is empty? */ - return j; /* that is easy... */ - else return unbound_search(t, j); -} - - - -#if defined(LUA_DEBUG) - -Node *luaH_mainposition (const Table *t, const TValue *key) { - return mainposition(t, key); -} - -int luaH_isdummy (Node *n) { return isdummy(n); } - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c +++ /dev/null @@ -1,284 +0,0 @@ -/* -** $Id: ltablib.c,v 1.65.1.2 2014/05/07 16:32:55 roberto Exp $ -** Library for Table Manipulation -** See Copyright Notice in lua.h -*/ - - -#include - -#define ltablib_c -#define LUA_LIB - -#include "lua.h" - -#include "lauxlib.h" -#include "lualib.h" - - -#define aux_getn(L,n) (luaL_checktype(L, n, LUA_TTABLE), luaL_len(L, n)) - - - -#if defined(LUA_COMPAT_MAXN) -static int maxn (lua_State *L) { - lua_Number max = 0; - luaL_checktype(L, 1, LUA_TTABLE); - lua_pushnil(L); /* first key */ - while (lua_next(L, 1)) { - lua_pop(L, 1); /* remove value */ - if (lua_type(L, -1) == LUA_TNUMBER) { - lua_Number v = lua_tonumber(L, -1); - if (v > max) max = v; - } - } - lua_pushnumber(L, max); - return 1; -} -#endif - - -static int tinsert (lua_State *L) { - int e = aux_getn(L, 1) + 1; /* first empty element */ - int pos; /* where to insert new element */ - switch (lua_gettop(L)) { - case 2: { /* called with only 2 arguments */ - pos = e; /* insert new element at the end */ - break; - } - case 3: { - int i; - pos = luaL_checkint(L, 2); /* 2nd argument is the position */ - luaL_argcheck(L, 1 <= pos && pos <= e, 2, "position out of bounds"); - for (i = e; i > pos; i--) { /* move up elements */ - lua_rawgeti(L, 1, i-1); - lua_rawseti(L, 1, i); /* t[i] = t[i-1] */ - } - break; - } - default: { - return luaL_error(L, "wrong number of arguments to " LUA_QL("insert")); - } - } - lua_rawseti(L, 1, pos); /* t[pos] = v */ - return 0; -} - - -static int tremove (lua_State *L) { - int size = aux_getn(L, 1); - int pos = luaL_optint(L, 2, size); - if (pos != size) /* validate 'pos' if given */ - luaL_argcheck(L, 1 <= pos && pos <= size + 1, 1, "position out of bounds"); - lua_rawgeti(L, 1, pos); /* result = t[pos] */ - for ( ; pos < size; pos++) { - lua_rawgeti(L, 1, pos+1); - lua_rawseti(L, 1, pos); /* t[pos] = t[pos+1] */ - } - lua_pushnil(L); - lua_rawseti(L, 1, pos); /* t[pos] = nil */ - return 1; -} - - -static void addfield (lua_State *L, luaL_Buffer *b, int i) { - lua_rawgeti(L, 1, i); - if (!lua_isstring(L, -1)) - luaL_error(L, "invalid value (%s) at index %d in table for " - LUA_QL("concat"), luaL_typename(L, -1), i); - luaL_addvalue(b); -} - - -static int tconcat (lua_State *L) { - luaL_Buffer b; - size_t lsep; - int i, last; - const char *sep = luaL_optlstring(L, 2, "", &lsep); - luaL_checktype(L, 1, LUA_TTABLE); - i = luaL_optint(L, 3, 1); - last = luaL_opt(L, luaL_checkint, 4, luaL_len(L, 1)); - luaL_buffinit(L, &b); - for (; i < last; i++) { - addfield(L, &b, i); - luaL_addlstring(&b, sep, lsep); - } - if (i == last) /* add last value (if interval was not empty) */ - addfield(L, &b, i); - 
luaL_pushresult(&b); - return 1; -} - - -/* -** {====================================================== -** Pack/unpack -** ======================================================= -*/ - -static int pack (lua_State *L) { - int n = lua_gettop(L); /* number of elements to pack */ - lua_createtable(L, n, 1); /* create result table */ - lua_pushinteger(L, n); - lua_setfield(L, -2, "n"); /* t.n = number of elements */ - if (n > 0) { /* at least one element? */ - int i; - lua_pushvalue(L, 1); - lua_rawseti(L, -2, 1); /* insert first element */ - lua_replace(L, 1); /* move table into index 1 */ - for (i = n; i >= 2; i--) /* assign other elements */ - lua_rawseti(L, 1, i); - } - return 1; /* return table */ -} - - -static int unpack (lua_State *L) { - int i, e; - unsigned int n; - luaL_checktype(L, 1, LUA_TTABLE); - i = luaL_optint(L, 2, 1); - e = luaL_opt(L, luaL_checkint, 3, luaL_len(L, 1)); - if (i > e) return 0; /* empty range */ - n = (unsigned int)e - (unsigned int)i; /* number of elements minus 1 */ - if (n > (INT_MAX - 10) || !lua_checkstack(L, ++n)) - return luaL_error(L, "too many results to unpack"); - lua_rawgeti(L, 1, i); /* push arg[i] (avoiding overflow problems) */ - while (i++ < e) /* push arg[i + 1...e] */ - lua_rawgeti(L, 1, i); - return n; -} - -/* }====================================================== */ - - - -/* -** {====================================================== -** Quicksort -** (based on `Algorithms in MODULA-3', Robert Sedgewick; -** Addison-Wesley, 1993.) -** ======================================================= -*/ - - -static void set2 (lua_State *L, int i, int j) { - lua_rawseti(L, 1, i); - lua_rawseti(L, 1, j); -} - -static int sort_comp (lua_State *L, int a, int b) { - if (!lua_isnil(L, 2)) { /* function? */ - int res; - lua_pushvalue(L, 2); - lua_pushvalue(L, a-1); /* -1 to compensate function */ - lua_pushvalue(L, b-2); /* -2 to compensate function and `a' */ - lua_call(L, 2, 1); - res = lua_toboolean(L, -1); - lua_pop(L, 1); - return res; - } - else /* a < b? */ - return lua_compare(L, a, b, LUA_OPLT); -} - -static void auxsort (lua_State *L, int l, int u) { - while (l < u) { /* for tail recursion */ - int i, j; - /* sort elements a[l], a[(l+u)/2] and a[u] */ - lua_rawgeti(L, 1, l); - lua_rawgeti(L, 1, u); - if (sort_comp(L, -1, -2)) /* a[u] < a[l]? */ - set2(L, l, u); /* swap a[l] - a[u] */ - else - lua_pop(L, 2); - if (u-l == 1) break; /* only 2 elements */ - i = (l+u)/2; - lua_rawgeti(L, 1, i); - lua_rawgeti(L, 1, l); - if (sort_comp(L, -2, -1)) /* a[i]= P */ - while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) { - if (i>=u) luaL_error(L, "invalid order function for sorting"); - lua_pop(L, 1); /* remove a[i] */ - } - /* repeat --j until a[j] <= P */ - while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) { - if (j<=l) luaL_error(L, "invalid order function for sorting"); - lua_pop(L, 1); /* remove a[j] */ - } - if (jflags & (1u<<(e))) ? 
NULL : luaT_gettm(et, e, (g)->tmname[e])) - -#define fasttm(l,et,e) gfasttm(G(l), et, e) - -#define ttypename(x) luaT_typenames_[(x) + 1] -#define objtypename(x) ttypename(ttypenv(x)) - -LUAI_DDEC const char *const luaT_typenames_[LUA_TOTALTAGS]; - - -LUAI_FUNC const TValue *luaT_gettm (Table *events, TMS event, TString *ename); -LUAI_FUNC const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, - TMS event); -LUAI_FUNC void luaT_init (lua_State *L); - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c +++ /dev/null @@ -1,77 +0,0 @@ -/* -** $Id: ltm.c,v 2.14.1.1 2013/04/12 18:48:47 roberto Exp $ -** Tag methods -** See Copyright Notice in lua.h -*/ - - -#include - -#define ltm_c -#define LUA_CORE - -#include "lua.h" - -#include "lobject.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" - - -static const char udatatypename[] = "userdata"; - -LUAI_DDEF const char *const luaT_typenames_[LUA_TOTALTAGS] = { - "no value", - "nil", "boolean", udatatypename, "number", - "string", "table", "function", udatatypename, "thread", - "proto", "upval" /* these last two cases are used for tests only */ -}; - - -void luaT_init (lua_State *L) { - static const char *const luaT_eventname[] = { /* ORDER TM */ - "__index", "__newindex", - "__gc", "__mode", "__len", "__eq", - "__add", "__sub", "__mul", "__div", "__mod", - "__pow", "__unm", "__lt", "__le", - "__concat", "__call" - }; - int i; - for (i=0; itmname[i] = luaS_new(L, luaT_eventname[i]); - luaS_fix(G(L)->tmname[i]); /* never collect these names */ - } -} - - -/* -** function to be used with macro "fasttm": optimized for absence of -** tag methods -*/ -const TValue *luaT_gettm (Table *events, TMS event, TString *ename) { - const TValue *tm = luaH_getstr(events, ename); - lua_assert(event <= TM_EQ); - if (ttisnil(tm)) { /* no tag method? */ - events->flags |= cast_byte(1u<metatable; - break; - case LUA_TUSERDATA: - mt = uvalue(o)->metatable; - break; - default: - mt = G(L)->mt[ttypenv(o)]; - } - return (mt ? luaH_getstr(mt, G(L)->tmname[event]) : luaO_nilobject); -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lua.h +++ /dev/null @@ -1,443 +0,0 @@ -/* -** $Id: lua.h,v 1.285.1.4 2015/02/21 14:04:50 roberto Exp $ -** Lua - A Scripting Language -** Lua.org, PUC-Rio, Brazil (http://www.lua.org) -** See Copyright Notice at the end of this file -*/ - - -#ifndef lua_h -#define lua_h - -#include - -#include "luaconf.h" - - -#define LUA_VERSION_MAJOR "5" -#define LUA_VERSION_MINOR "2" -#define LUA_VERSION_NUM 502 -#define LUA_VERSION_RELEASE "4" - -#define LUA_VERSION "Lua " LUA_VERSION_MAJOR "." LUA_VERSION_MINOR -#define LUA_RELEASE LUA_VERSION "." LUA_VERSION_RELEASE -#define LUA_COPYRIGHT LUA_RELEASE " Copyright (C) 1994-2015 Lua.org, PUC-Rio" -#define LUA_AUTHORS "R. Ierusalimschy, L. H. de Figueiredo, W. 
Celes" - - -/* mark for precompiled code ('Lua') */ -#define LUA_SIGNATURE "\033Lua" - -/* option for multiple returns in 'lua_pcall' and 'lua_call' */ -#define LUA_MULTRET (-1) - - -/* -** pseudo-indices -*/ -#define LUA_REGISTRYINDEX LUAI_FIRSTPSEUDOIDX -#define lua_upvalueindex(i) (LUA_REGISTRYINDEX - (i)) - - -/* thread status */ -#define LUA_OK 0 -#define LUA_YIELD 1 -#define LUA_ERRRUN 2 -#define LUA_ERRSYNTAX 3 -#define LUA_ERRMEM 4 -#define LUA_ERRGCMM 5 -#define LUA_ERRERR 6 - - -typedef struct lua_State lua_State; - -typedef int (*lua_CFunction) (lua_State *L); - - -/* -** functions that read/write blocks when loading/dumping Lua chunks -*/ -typedef const char * (*lua_Reader) (lua_State *L, void *ud, size_t *sz); - -typedef int (*lua_Writer) (lua_State *L, const void* p, size_t sz, void* ud); - - -/* -** prototype for memory-allocation functions -*/ -typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize); - - -/* -** basic types -*/ -#define LUA_TNONE (-1) - -#define LUA_TNIL 0 -#define LUA_TBOOLEAN 1 -#define LUA_TLIGHTUSERDATA 2 -#define LUA_TNUMBER 3 -#define LUA_TSTRING 4 -#define LUA_TTABLE 5 -#define LUA_TFUNCTION 6 -#define LUA_TUSERDATA 7 -#define LUA_TTHREAD 8 - -#define LUA_NUMTAGS 9 - - - -/* minimum Lua stack available to a C function */ -#define LUA_MINSTACK 20 - - -/* predefined values in the registry */ -#define LUA_RIDX_MAINTHREAD 1 -#define LUA_RIDX_GLOBALS 2 -#define LUA_RIDX_LAST LUA_RIDX_GLOBALS - - -/* type of numbers in Lua */ -typedef LUA_NUMBER lua_Number; - - -/* type for integer functions */ -typedef LUA_INTEGER lua_Integer; - -/* unsigned integer type */ -typedef LUA_UNSIGNED lua_Unsigned; - - - - -/* -** generic extra include file -*/ -#if defined(LUA_USER_H) -#include LUA_USER_H -#endif - - -/* -** RCS ident string -*/ -extern const char lua_ident[]; - - -/* -** state manipulation -*/ -LUA_API lua_State *(lua_newstate) (lua_Alloc f, void *ud); -LUA_API void (lua_close) (lua_State *L); -LUA_API lua_State *(lua_newthread) (lua_State *L); - -LUA_API lua_CFunction (lua_atpanic) (lua_State *L, lua_CFunction panicf); - - -LUA_API const lua_Number *(lua_version) (lua_State *L); - - -/* -** basic stack manipulation -*/ -LUA_API int (lua_absindex) (lua_State *L, int idx); -LUA_API int (lua_gettop) (lua_State *L); -LUA_API void (lua_settop) (lua_State *L, int idx); -LUA_API void (lua_pushvalue) (lua_State *L, int idx); -LUA_API void (lua_remove) (lua_State *L, int idx); -LUA_API void (lua_insert) (lua_State *L, int idx); -LUA_API void (lua_replace) (lua_State *L, int idx); -LUA_API void (lua_copy) (lua_State *L, int fromidx, int toidx); -LUA_API int (lua_checkstack) (lua_State *L, int sz); - -LUA_API void (lua_xmove) (lua_State *from, lua_State *to, int n); - - -/* -** access functions (stack -> C) -*/ - -LUA_API int (lua_isnumber) (lua_State *L, int idx); -LUA_API int (lua_isstring) (lua_State *L, int idx); -LUA_API int (lua_iscfunction) (lua_State *L, int idx); -LUA_API int (lua_isuserdata) (lua_State *L, int idx); -LUA_API int (lua_type) (lua_State *L, int idx); -LUA_API const char *(lua_typename) (lua_State *L, int tp); - -LUA_API lua_Number (lua_tonumberx) (lua_State *L, int idx, int *isnum); -LUA_API lua_Integer (lua_tointegerx) (lua_State *L, int idx, int *isnum); -LUA_API lua_Unsigned (lua_tounsignedx) (lua_State *L, int idx, int *isnum); -LUA_API int (lua_toboolean) (lua_State *L, int idx); -LUA_API const char *(lua_tolstring) (lua_State *L, int idx, size_t *len); -LUA_API size_t (lua_rawlen) (lua_State *L, int idx); -LUA_API 
lua_CFunction (lua_tocfunction) (lua_State *L, int idx); -LUA_API void *(lua_touserdata) (lua_State *L, int idx); -LUA_API lua_State *(lua_tothread) (lua_State *L, int idx); -LUA_API const void *(lua_topointer) (lua_State *L, int idx); - - -/* -** Comparison and arithmetic functions -*/ - -#define LUA_OPADD 0 /* ORDER TM */ -#define LUA_OPSUB 1 -#define LUA_OPMUL 2 -#define LUA_OPDIV 3 -#define LUA_OPMOD 4 -#define LUA_OPPOW 5 -#define LUA_OPUNM 6 - -LUA_API void (lua_arith) (lua_State *L, int op); - -#define LUA_OPEQ 0 -#define LUA_OPLT 1 -#define LUA_OPLE 2 - -LUA_API int (lua_rawequal) (lua_State *L, int idx1, int idx2); -LUA_API int (lua_compare) (lua_State *L, int idx1, int idx2, int op); - - -/* -** push functions (C -> stack) -*/ -LUA_API void (lua_pushnil) (lua_State *L); -LUA_API void (lua_pushnumber) (lua_State *L, lua_Number n); -LUA_API void (lua_pushinteger) (lua_State *L, lua_Integer n); -LUA_API void (lua_pushunsigned) (lua_State *L, lua_Unsigned n); -LUA_API const char *(lua_pushlstring) (lua_State *L, const char *s, size_t l); -LUA_API const char *(lua_pushstring) (lua_State *L, const char *s); -LUA_API const char *(lua_pushvfstring) (lua_State *L, const char *fmt, - va_list argp); -LUA_API const char *(lua_pushfstring) (lua_State *L, const char *fmt, ...); -LUA_API void (lua_pushcclosure) (lua_State *L, lua_CFunction fn, int n); -LUA_API void (lua_pushboolean) (lua_State *L, int b); -LUA_API void (lua_pushlightuserdata) (lua_State *L, void *p); -LUA_API int (lua_pushthread) (lua_State *L); - - -/* -** get functions (Lua -> stack) -*/ -LUA_API void (lua_getglobal) (lua_State *L, const char *var); -LUA_API void (lua_gettable) (lua_State *L, int idx); -LUA_API void (lua_getfield) (lua_State *L, int idx, const char *k); -LUA_API void (lua_rawget) (lua_State *L, int idx); -LUA_API void (lua_rawgeti) (lua_State *L, int idx, int n); -LUA_API void (lua_rawgetp) (lua_State *L, int idx, const void *p); -LUA_API void (lua_createtable) (lua_State *L, int narr, int nrec); -LUA_API void *(lua_newuserdata) (lua_State *L, size_t sz); -LUA_API int (lua_getmetatable) (lua_State *L, int objindex); -LUA_API void (lua_getuservalue) (lua_State *L, int idx); - - -/* -** set functions (stack -> Lua) -*/ -LUA_API void (lua_setglobal) (lua_State *L, const char *var); -LUA_API void (lua_settable) (lua_State *L, int idx); -LUA_API void (lua_setfield) (lua_State *L, int idx, const char *k); -LUA_API void (lua_rawset) (lua_State *L, int idx); -LUA_API void (lua_rawseti) (lua_State *L, int idx, int n); -LUA_API void (lua_rawsetp) (lua_State *L, int idx, const void *p); -LUA_API int (lua_setmetatable) (lua_State *L, int objindex); -LUA_API void (lua_setuservalue) (lua_State *L, int idx); - - -/* -** 'load' and 'call' functions (load and run Lua code) -*/ -LUA_API void (lua_callk) (lua_State *L, int nargs, int nresults, int ctx, - lua_CFunction k); -#define lua_call(L,n,r) lua_callk(L, (n), (r), 0, NULL) - -LUA_API int (lua_getctx) (lua_State *L, int *ctx); - -LUA_API int (lua_pcallk) (lua_State *L, int nargs, int nresults, int errfunc, - int ctx, lua_CFunction k); -#define lua_pcall(L,n,r,f) lua_pcallk(L, (n), (r), (f), 0, NULL) - -LUA_API int (lua_load) (lua_State *L, lua_Reader reader, void *dt, - const char *chunkname, - const char *mode); - -LUA_API int (lua_dump) (lua_State *L, lua_Writer writer, void *data); - - -/* -** coroutine functions -*/ -LUA_API int (lua_yieldk) (lua_State *L, int nresults, int ctx, - lua_CFunction k); -#define lua_yield(L,n) lua_yieldk(L, (n), 0, NULL) -LUA_API int 
(lua_resume) (lua_State *L, lua_State *from, int narg); -LUA_API int (lua_status) (lua_State *L); - -/* -** garbage-collection function and options -*/ - -#define LUA_GCSTOP 0 -#define LUA_GCRESTART 1 -#define LUA_GCCOLLECT 2 -#define LUA_GCCOUNT 3 -#define LUA_GCCOUNTB 4 -#define LUA_GCSTEP 5 -#define LUA_GCSETPAUSE 6 -#define LUA_GCSETSTEPMUL 7 -#define LUA_GCSETMAJORINC 8 -#define LUA_GCISRUNNING 9 -#define LUA_GCGEN 10 -#define LUA_GCINC 11 - -LUA_API int (lua_gc) (lua_State *L, int what, int data); - - -/* -** miscellaneous functions -*/ - -LUA_API int (lua_error) (lua_State *L); - -LUA_API int (lua_next) (lua_State *L, int idx); - -LUA_API void (lua_concat) (lua_State *L, int n); -LUA_API void (lua_len) (lua_State *L, int idx); - -LUA_API lua_Alloc (lua_getallocf) (lua_State *L, void **ud); -LUA_API void (lua_setallocf) (lua_State *L, lua_Alloc f, void *ud); - - - -/* -** =============================================================== -** some useful macros -** =============================================================== -*/ - -#define lua_tonumber(L,i) lua_tonumberx(L,i,NULL) -#define lua_tointeger(L,i) lua_tointegerx(L,i,NULL) -#define lua_tounsigned(L,i) lua_tounsignedx(L,i,NULL) - -#define lua_pop(L,n) lua_settop(L, -(n)-1) - -#define lua_newtable(L) lua_createtable(L, 0, 0) - -#define lua_register(L,n,f) (lua_pushcfunction(L, (f)), lua_setglobal(L, (n))) - -#define lua_pushcfunction(L,f) lua_pushcclosure(L, (f), 0) - -#define lua_isfunction(L,n) (lua_type(L, (n)) == LUA_TFUNCTION) -#define lua_istable(L,n) (lua_type(L, (n)) == LUA_TTABLE) -#define lua_islightuserdata(L,n) (lua_type(L, (n)) == LUA_TLIGHTUSERDATA) -#define lua_isnil(L,n) (lua_type(L, (n)) == LUA_TNIL) -#define lua_isboolean(L,n) (lua_type(L, (n)) == LUA_TBOOLEAN) -#define lua_isthread(L,n) (lua_type(L, (n)) == LUA_TTHREAD) -#define lua_isnone(L,n) (lua_type(L, (n)) == LUA_TNONE) -#define lua_isnoneornil(L, n) (lua_type(L, (n)) <= 0) - -#define lua_pushliteral(L, s) \ - lua_pushlstring(L, "" s, (sizeof(s)/sizeof(char))-1) - -#define lua_pushglobaltable(L) \ - lua_rawgeti(L, LUA_REGISTRYINDEX, LUA_RIDX_GLOBALS) - -#define lua_tostring(L,i) lua_tolstring(L, (i), NULL) - - - -/* -** {====================================================================== -** Debug API -** ======================================================================= -*/ - - -/* -** Event codes -*/ -#define LUA_HOOKCALL 0 -#define LUA_HOOKRET 1 -#define LUA_HOOKLINE 2 -#define LUA_HOOKCOUNT 3 -#define LUA_HOOKTAILCALL 4 - - -/* -** Event masks -*/ -#define LUA_MASKCALL (1 << LUA_HOOKCALL) -#define LUA_MASKRET (1 << LUA_HOOKRET) -#define LUA_MASKLINE (1 << LUA_HOOKLINE) -#define LUA_MASKCOUNT (1 << LUA_HOOKCOUNT) - -typedef struct lua_Debug lua_Debug; /* activation record */ - - -/* Functions to be called by the debugger in specific events */ -typedef void (*lua_Hook) (lua_State *L, lua_Debug *ar); - - -LUA_API int (lua_getstack) (lua_State *L, int level, lua_Debug *ar); -LUA_API int (lua_getinfo) (lua_State *L, const char *what, lua_Debug *ar); -LUA_API const char *(lua_getlocal) (lua_State *L, const lua_Debug *ar, int n); -LUA_API const char *(lua_setlocal) (lua_State *L, const lua_Debug *ar, int n); -LUA_API const char *(lua_getupvalue) (lua_State *L, int funcindex, int n); -LUA_API const char *(lua_setupvalue) (lua_State *L, int funcindex, int n); - -LUA_API void *(lua_upvalueid) (lua_State *L, int fidx, int n); -LUA_API void (lua_upvaluejoin) (lua_State *L, int fidx1, int n1, - int fidx2, int n2); - -LUA_API int (lua_sethook) (lua_State 
*L, lua_Hook func, int mask, int count); -LUA_API lua_Hook (lua_gethook) (lua_State *L); -LUA_API int (lua_gethookmask) (lua_State *L); -LUA_API int (lua_gethookcount) (lua_State *L); - - -struct lua_Debug { - int event; - const char *name; /* (n) */ - const char *namewhat; /* (n) 'global', 'local', 'field', 'method' */ - const char *what; /* (S) 'Lua', 'C', 'main', 'tail' */ - const char *source; /* (S) */ - int currentline; /* (l) */ - int linedefined; /* (S) */ - int lastlinedefined; /* (S) */ - unsigned char nups; /* (u) number of upvalues */ - unsigned char nparams;/* (u) number of parameters */ - char isvararg; /* (u) */ - char istailcall; /* (t) */ - char short_src[LUA_IDSIZE]; /* (S) */ - /* private part */ - struct CallInfo *i_ci; /* active function */ -}; - -/* }====================================================================== */ - - -/****************************************************************************** -* Copyright (C) 1994-2015 Lua.org, PUC-Rio. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to deal in the Software without restriction, including -* without limitation the rights to use, copy, modify, merge, publish, -* distribute, sublicense, and/or sell copies of the Software, and to -* permit persons to whom the Software is furnished to do so, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -******************************************************************************/ - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/luaconf.h +++ /dev/null @@ -1,555 +0,0 @@ -/* -** $Id: luaconf.h,v 1.176.1.2 2013/11/21 17:26:16 roberto Exp $ -** Configuration file for Lua -** See Copyright Notice in lua.h -*/ - - -#ifndef lconfig_h -#define lconfig_h - -#include -#ifdef illumos -#include -#else -#include -#endif - -extern ssize_t lcompat_sprintf(char *, const char *, ...); -extern int64_t lcompat_strtoll(const char *, char **); -extern int64_t lcompat_pow(int64_t, int64_t); - -/* -** ================================================================== -** Search for "@@" to find all configurable definitions. -** =================================================================== -*/ - - -/* -@@ LUA_ANSI controls the use of non-ansi features. -** CHANGE it (define it) if you want Lua to avoid the use of any -** non-ansi feature or library. 
-*/ -#if !defined(LUA_ANSI) && defined(__STRICT_ANSI__) -#define LUA_ANSI -#endif - - -#if !defined(LUA_ANSI) && defined(_WIN32) && !defined(_WIN32_WCE) -#define LUA_WIN /* enable goodies for regular Windows platforms */ -#endif - -#if defined(LUA_WIN) -#define LUA_DL_DLL -#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ -#endif - - - -#if defined(LUA_USE_LINUX) -#define LUA_USE_POSIX -#define LUA_USE_DLOPEN /* needs an extra library: -ldl */ -#define LUA_USE_READLINE /* needs some extra libraries */ -#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */ -#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ -#define LUA_USE_LONGLONG /* assume support for long long */ -#endif - -#if defined(LUA_USE_MACOSX) -#define LUA_USE_POSIX -#define LUA_USE_DLOPEN /* does not need -ldl */ -#define LUA_USE_READLINE /* needs an extra library: -lreadline */ -#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */ -#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ -#define LUA_USE_LONGLONG /* assume support for long long */ -#endif - - - -/* -@@ LUA_USE_POSIX includes all functionality listed as X/Open System -@* Interfaces Extension (XSI). -** CHANGE it (define it) if your system is XSI compatible. -*/ -#if defined(LUA_USE_POSIX) -#define LUA_USE_MKSTEMP -#define LUA_USE_ISATTY -#define LUA_USE_POPEN -#define LUA_USE_ULONGJMP -#define LUA_USE_GMTIME_R -#endif - - - -/* -@@ LUA_PATH_DEFAULT is the default path that Lua uses to look for -@* Lua libraries. -@@ LUA_CPATH_DEFAULT is the default path that Lua uses to look for -@* C libraries. -** CHANGE them if your machine has a non-conventional directory -** hierarchy or if you want to install your libraries in -** non-conventional directories. -*/ -#if defined(_WIN32) /* { */ -/* -** In Windows, any exclamation mark ('!') in the path is replaced by the -** path of the directory of the executable file of the current process. -*/ -#define LUA_LDIR "!\\lua\\" -#define LUA_CDIR "!\\" -#define LUA_PATH_DEFAULT \ - LUA_LDIR"?.lua;" LUA_LDIR"?\\init.lua;" \ - LUA_CDIR"?.lua;" LUA_CDIR"?\\init.lua;" ".\\?.lua" -#define LUA_CPATH_DEFAULT \ - LUA_CDIR"?.dll;" LUA_CDIR"loadall.dll;" ".\\?.dll" - -#else /* }{ */ - -#define LUA_VDIR LUA_VERSION_MAJOR "." LUA_VERSION_MINOR "/" -#define LUA_ROOT "/usr/local/" -#define LUA_LDIR LUA_ROOT "share/lua/" LUA_VDIR -#define LUA_CDIR LUA_ROOT "lib/lua/" LUA_VDIR -#define LUA_PATH_DEFAULT \ - LUA_LDIR"?.lua;" LUA_LDIR"?/init.lua;" \ - LUA_CDIR"?.lua;" LUA_CDIR"?/init.lua;" "./?.lua" -#define LUA_CPATH_DEFAULT \ - LUA_CDIR"?.so;" LUA_CDIR"loadall.so;" "./?.so" -#endif /* } */ - - -/* -@@ LUA_DIRSEP is the directory separator (for submodules). -** CHANGE it if your machine does not use "/" as the directory separator -** and is not Windows. (On Windows Lua automatically uses "\".) -*/ -#if defined(_WIN32) -#define LUA_DIRSEP "\\" -#else -#define LUA_DIRSEP "/" -#endif - - -/* -@@ LUA_ENV is the name of the variable that holds the current -@@ environment, used to access global names. -** CHANGE it if you do not like this name. -*/ -#define LUA_ENV "_ENV" - - -/* -@@ LUA_API is a mark for all core API functions. -@@ LUALIB_API is a mark for all auxiliary library functions. -@@ LUAMOD_API is a mark for all standard library opening functions. -** CHANGE them if you need to define those functions in some special way. 
-** For instance, if you want to create one Windows DLL with the core and -** the libraries, you may want to use the following definition (define -** LUA_BUILD_AS_DLL to get it). -*/ -#if defined(LUA_BUILD_AS_DLL) /* { */ - -#if defined(LUA_CORE) || defined(LUA_LIB) /* { */ -#define LUA_API __declspec(dllexport) -#else /* }{ */ -#define LUA_API __declspec(dllimport) -#endif /* } */ - -#else /* }{ */ - -#define LUA_API extern - -#endif /* } */ - - -/* more often than not the libs go together with the core */ -#define LUALIB_API LUA_API -#define LUAMOD_API LUALIB_API - - -/* -@@ LUAI_FUNC is a mark for all extern functions that are not to be -@* exported to outside modules. -@@ LUAI_DDEF and LUAI_DDEC are marks for all extern (const) variables -@* that are not to be exported to outside modules (LUAI_DDEF for -@* definitions and LUAI_DDEC for declarations). -** CHANGE them if you need to mark them in some special way. Elf/gcc -** (versions 3.2 and later) mark them as "hidden" to optimize access -** when Lua is compiled as a shared library. Not all elf targets support -** this attribute. Unfortunately, gcc does not offer a way to check -** whether the target offers that support, and those without support -** give a warning about it. To avoid these warnings, change to the -** default definition. -*/ -#if defined(__GNUC__) && ((__GNUC__*100 + __GNUC_MINOR__) >= 302) && \ - defined(__ELF__) /* { */ -#define LUAI_FUNC __attribute__((visibility("hidden"))) extern -#define LUAI_DDEC LUAI_FUNC -#define LUAI_DDEF /* empty */ - -#else /* }{ */ -#define LUAI_FUNC extern -#define LUAI_DDEC extern -#define LUAI_DDEF /* empty */ -#endif /* } */ - - - -/* -@@ LUA_QL describes how error messages quote program elements. -** CHANGE it if you want a different appearance. -*/ -#define LUA_QL(x) "'" x "'" -#define LUA_QS LUA_QL("%s") - - -/* -@@ LUA_IDSIZE gives the maximum size for the description of the source -@* of a function in debug information. -** CHANGE it if you want a different size. -*/ -#define LUA_IDSIZE 60 - - -/* -@@ luai_writestringerror defines how to print error messages. -** (A format string with one argument is enough for Lua...) -*/ -#ifdef _KERNEL -#define luai_writestringerror(s,p) \ - (zfs_dbgmsg((s), (p))) -#else -#define luai_writestringerror(s,p) \ - (fprintf(stderr, (s), (p)), fflush(stderr)) -#endif - - -/* -@@ LUAI_MAXSHORTLEN is the maximum length for short strings, that is, -** strings that are internalized. (Cannot be smaller than reserved words -** or tags for metamethods, as these strings must be internalized; -** #("function") = 8, #("__newindex") = 10.) -*/ -#define LUAI_MAXSHORTLEN 40 - - - -/* -** {================================================================== -** Compatibility with previous versions -** =================================================================== -*/ - -/* -@@ LUA_COMPAT_ALL controls all compatibility options. -** You can define it to get all options, or change specific options -** to fit your specific needs. -*/ -#if defined(LUA_COMPAT_ALL) /* { */ - -/* -@@ LUA_COMPAT_UNPACK controls the presence of global 'unpack'. -** You can replace it with 'table.unpack'. -*/ -#define LUA_COMPAT_UNPACK - -/* -@@ LUA_COMPAT_LOADERS controls the presence of table 'package.loaders'. -** You can replace it with 'package.searchers'. -*/ -#define LUA_COMPAT_LOADERS - -/* -@@ macro 'lua_cpcall' emulates deprecated function lua_cpcall. -** You can call your C function directly (with light C functions). 
-*/ -#define lua_cpcall(L,f,u) \ - (lua_pushcfunction(L, (f)), \ - lua_pushlightuserdata(L,(u)), \ - lua_pcall(L,1,0,0)) - - -/* -@@ LUA_COMPAT_LOG10 defines the function 'log10' in the math library. -** You can rewrite 'log10(x)' as 'log(x, 10)'. -*/ -#define LUA_COMPAT_LOG10 - -/* -@@ LUA_COMPAT_LOADSTRING defines the function 'loadstring' in the base -** library. You can rewrite 'loadstring(s)' as 'load(s)'. -*/ -#define LUA_COMPAT_LOADSTRING - -/* -@@ LUA_COMPAT_MAXN defines the function 'maxn' in the table library. -*/ -#define LUA_COMPAT_MAXN - -/* -@@ The following macros supply trivial compatibility for some -** changes in the API. The macros themselves document how to -** change your code to avoid using them. -*/ -#define lua_strlen(L,i) lua_rawlen(L, (i)) - -#define lua_objlen(L,i) lua_rawlen(L, (i)) - -#define lua_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ) -#define lua_lessthan(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPLT) - -/* -@@ LUA_COMPAT_MODULE controls compatibility with previous -** module functions 'module' (Lua) and 'luaL_register' (C). -*/ -#define LUA_COMPAT_MODULE - -#endif /* } */ - -/* }================================================================== */ - - - -/* -@@ LUAI_BITSINT defines the number of bits in an int. -** CHANGE here if Lua cannot automatically detect the number of bits of -** your machine. Probably you do not need to change this. -*/ -/* avoid overflows in comparison */ -#if INT_MAX-20 < 32760 /* { */ -#define LUAI_BITSINT 16 -#elif INT_MAX > 2147483640L /* }{ */ -/* int has at least 32 bits */ -#define LUAI_BITSINT 32 -#else /* }{ */ -#error "you must define LUA_BITSINT with number of bits in an integer" -#endif /* } */ - - -/* -@@ LUA_INT32 is a signed integer with exactly 32 bits. -@@ LUAI_UMEM is an unsigned integer big enough to count the total -@* memory used by Lua. -@@ LUAI_MEM is a signed integer big enough to count the total memory -@* used by Lua. -** CHANGE here if for some weird reason the default definitions are not -** good enough for your machine. Probably you do not need to change -** this. -*/ -#if LUAI_BITSINT >= 32 /* { */ -#define LUA_INT32 int -#define LUAI_UMEM size_t -#define LUAI_MEM ptrdiff_t -#else /* }{ */ -/* 16-bit ints */ -#define LUA_INT32 long -#define LUAI_UMEM unsigned long -#define LUAI_MEM long -#endif /* } */ - - -/* -@@ LUAI_MAXSTACK limits the size of the Lua stack. -** CHANGE it if you need a different limit. This limit is arbitrary; -** its only purpose is to stop Lua from consuming unlimited stack -** space (and to reserve some numbers for pseudo-indices). -*/ -#if LUAI_BITSINT >= 32 -#define LUAI_MAXSTACK 1000000 -#else -#define LUAI_MAXSTACK 15000 -#endif - -/* reserve some space for error handling */ -#define LUAI_FIRSTPSEUDOIDX (-LUAI_MAXSTACK - 1000) - - - - -/* -@@ LUAL_BUFFERSIZE is the buffer size used by the lauxlib buffer system. -** CHANGE it if it uses too much C-stack space. -*/ -#define LUAL_BUFFERSIZE 1024 - - - - -/* -** {================================================================== -@@ LUA_NUMBER is the type of numbers in Lua. -** CHANGE the following definitions only if you want to build Lua -** with a number type different from double. You may also need to -** change lua_number2int & lua_number2integer. -** =================================================================== -*/ - -#define LUA_NUMBER int64_t - -/* -@@ LUAI_UACNUMBER is the result of an 'usual argument conversion' -@* over a number. 
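The luaconf.h being removed here is the in-kernel ZFS copy of the Lua 5.2 configuration: LUA_NUMBER is defined as int64_t just above, so Lua "numbers" are 64-bit integers and number-to-string conversion goes through a PRId64 format (defined a few lines below) rather than a floating-point one. A minimal standalone sketch of that convention, with hypothetical demo_* names and a sample value chosen only for illustration:

#include <inttypes.h>
#include <stdio.h>

typedef int64_t demo_lua_Number;        /* stands in for LUA_NUMBER above */

/* stands in for the lua_number2str()/LUA_NUMBER_FMT pairing below */
static int
demo_number2str(char *buf, size_t len, demo_lua_Number n)
{
        return (snprintf(buf, len, "%" PRId64, n));
}

int
main(void)
{
        char buf[32];                   /* stands in for LUAI_MAXNUMBER2STR */

        /* 2^53 + 1: exact as a 64-bit integer, not exact as a double */
        demo_number2str(buf, sizeof (buf), (demo_lua_Number)9007199254740993LL);
        puts(buf);
        return (0);
}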
-*/ -#define LUAI_UACNUMBER int64_t - - -/* -@@ LUA_NUMBER_SCAN is the format for reading numbers. -@@ LUA_NUMBER_FMT is the format for writing numbers. -@@ lua_number2str converts a number to a string. -@@ LUAI_MAXNUMBER2STR is maximum size of previous conversion. -*/ -#define LUA_NUMBER_FMT "%" PRId64 -#define lua_number2str(s,n) lcompat_sprintf((s), LUA_NUMBER_FMT, (n)) -#define LUAI_MAXNUMBER2STR 32 /* 16 digits, sign, point, and \0 */ - - -/* -@@ l_mathop allows the addition of an 'l' or 'f' to all math operations -*/ -#define l_mathop(x) (x ## l) - - -/* -@@ lua_str2number converts a decimal numeric string to a number. -@@ lua_strx2number converts an hexadecimal numeric string to a number. -** In C99, 'strtod' does both conversions. C89, however, has no function -** to convert floating hexadecimal strings to numbers. For these -** systems, you can leave 'lua_strx2number' undefined and Lua will -** provide its own implementation. -*/ -#define lua_str2number(s,p) lcompat_strtoll((s), (p)) - -#if defined(LUA_USE_STRTODHEX) -#define lua_strx2number(s,p) lcompat_strtoll((s), (p)) -#endif - - -/* -@@ The luai_num* macros define the primitive operations over numbers. -*/ - -/* the following operations need the math library */ -#if defined(lobject_c) || defined(lvm_c) -#define luai_nummod(L,a,b) ((a) % (b)) -#define luai_numpow(L,a,b) (lcompat_pow((a),(b))) -#endif - -/* these are quite standard operations */ -#if defined(LUA_CORE) -#define luai_numadd(L,a,b) ((a)+(b)) -#define luai_numsub(L,a,b) ((a)-(b)) -#define luai_nummul(L,a,b) ((a)*(b)) -#define luai_numdiv(L,a,b) ((a)/(b)) -#define luai_numunm(L,a) (-(a)) -#define luai_numeq(a,b) ((a)==(b)) -#define luai_numlt(L,a,b) ((a)<(b)) -#define luai_numle(L,a,b) ((a)<=(b)) -#define luai_numisnan(L,a) (!luai_numeq((a), (a))) -#endif - - - -/* -@@ LUA_INTEGER is the integral type used by lua_pushinteger/lua_tointeger. -** CHANGE that if ptrdiff_t is not adequate on your machine. (On most -** machines, ptrdiff_t gives a good choice between int or long.) -*/ -#define LUA_INTEGER ptrdiff_t - -/* -@@ LUA_UNSIGNED is the integral type used by lua_pushunsigned/lua_tounsigned. -** It must have at least 32 bits. -*/ -#define LUA_UNSIGNED uint64_t - - - -/* -** Some tricks with doubles -*/ - -#if defined(LUA_NUMBER_DOUBLE) && !defined(LUA_ANSI) /* { */ -/* -** The next definitions activate some tricks to speed up the -** conversion from doubles to integer types, mainly to LUA_UNSIGNED. -** -@@ LUA_MSASMTRICK uses Microsoft assembler to avoid clashes with a -** DirectX idiosyncrasy. -** -@@ LUA_IEEE754TRICK uses a trick that should work on any machine -** using IEEE754 with a 32-bit integer type. -** -@@ LUA_IEEELL extends the trick to LUA_INTEGER; should only be -** defined when LUA_INTEGER is a 32-bit integer. -** -@@ LUA_IEEEENDIAN is the endianness of doubles in your machine -** (0 for little endian, 1 for big endian); if not defined, Lua will -** check it dynamically for LUA_IEEE754TRICK (but not for LUA_NANTRICK). -** -@@ LUA_NANTRICK controls the use of a trick to pack all types into -** a single double value, using NaN values to represent non-number -** values. The trick only works on 32-bit machines (ints and pointers -** are 32-bit values) with numbers represented as IEEE 754-2008 doubles -** with conventional endianess (12345678 or 87654321), in CPUs that do -** not produce signaling NaN values (all NaNs are quiet). -*/ - -/* Microsoft compiler on a Pentium (32 bit) ? 
*/ -#if defined(LUA_WIN) && defined(_MSC_VER) && defined(_M_IX86) /* { */ - -#define LUA_MSASMTRICK -#define LUA_IEEEENDIAN 0 -#define LUA_NANTRICK - - -/* pentium 32 bits? */ -#elif defined(__i386__) || defined(__i386) || defined(__X86__) /* }{ */ - -#define LUA_IEEE754TRICK -#define LUA_IEEELL -#define LUA_IEEEENDIAN 0 -#define LUA_NANTRICK - -/* pentium 64 bits? */ -#elif defined(__x86_64) /* }{ */ - -#define LUA_IEEE754TRICK -#define LUA_IEEEENDIAN 0 - -#elif defined(__POWERPC__) || defined(__ppc__) /* }{ */ - -#define LUA_IEEE754TRICK -#define LUA_IEEEENDIAN 1 - -#else /* }{ */ - -/* assume IEEE754 and a 32-bit integer type */ -#define LUA_IEEE754TRICK - -#endif /* } */ - -#endif /* } */ - -/* }================================================================== */ - - - - -/* =================================================================== */ - -/* -** Local configuration. You can use this space to add your redefinitions -** without modifying the main part of the file. -*/ - -#define getlocaledecpoint() ('.') - -#define abs(x) (((x) < 0) ? -(x) : (x)) - -#if !defined(UCHAR_MAX) -#define UCHAR_MAX (0xff) -#endif - -#endif - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lualib.h +++ /dev/null @@ -1,55 +0,0 @@ -/* -** $Id: lualib.h,v 1.43.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua standard libraries -** See Copyright Notice in lua.h -*/ - - -#ifndef lualib_h -#define lualib_h - -#include "lua.h" - - - -LUAMOD_API int (luaopen_base) (lua_State *L); - -#define LUA_COLIBNAME "coroutine" -LUAMOD_API int (luaopen_coroutine) (lua_State *L); - -#define LUA_TABLIBNAME "table" -LUAMOD_API int (luaopen_table) (lua_State *L); - -#define LUA_IOLIBNAME "io" -LUAMOD_API int (luaopen_io) (lua_State *L); - -#define LUA_OSLIBNAME "os" -LUAMOD_API int (luaopen_os) (lua_State *L); - -#define LUA_STRLIBNAME "string" -LUAMOD_API int (luaopen_string) (lua_State *L); - -#define LUA_BITLIBNAME "bit32" -LUAMOD_API int (luaopen_bit32) (lua_State *L); - -#define LUA_MATHLIBNAME "math" -LUAMOD_API int (luaopen_math) (lua_State *L); - -#define LUA_DBLIBNAME "debug" -LUAMOD_API int (luaopen_debug) (lua_State *L); - -#define LUA_LOADLIBNAME "package" -LUAMOD_API int (luaopen_package) (lua_State *L); - - -/* open all previous libraries */ -LUALIB_API void (luaL_openlibs) (lua_State *L); - - - -#if !defined(lua_assert) -#define lua_assert(x) ((void)0) -#endif - - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.h +++ /dev/null @@ -1,28 +0,0 @@ -/* -** $Id: lundump.h,v 1.39.1.1 2013/04/12 18:48:47 roberto Exp $ -** load precompiled Lua chunks -** See Copyright Notice in lua.h -*/ - -#ifndef lundump_h -#define lundump_h - -#include "lobject.h" -#include "lzio.h" - -/* load one chunk; from lundump.c */ -LUAI_FUNC Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name); - -/* make header; from lundump.c */ -LUAI_FUNC void luaU_header (lu_byte* h); - -/* dump one chunk; from ldump.c */ -LUAI_FUNC int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip); - -/* data to catch conversion errors */ -#define LUAC_TAIL "\x19\x93\r\n\x1a\n" - -/* size in bytes of header of binary files */ -#define LUAC_HEADERSIZE (sizeof(LUA_SIGNATURE)-sizeof(char)+2+6+sizeof(LUAC_TAIL)-sizeof(char)) - 
-#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c +++ /dev/null @@ -1,258 +0,0 @@ -/* -** $Id: lundump.c,v 2.22.1.1 2013/04/12 18:48:47 roberto Exp $ -** load precompiled Lua chunks -** See Copyright Notice in lua.h -*/ - -#include - -#define lundump_c -#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lmem.h" -#include "lobject.h" -#include "lstring.h" -#include "lundump.h" -#include "lzio.h" - -typedef struct { - lua_State* L; - ZIO* Z; - Mbuffer* b; - const char* name; -} LoadState; - -static l_noret error(LoadState* S, const char* why) -{ - luaO_pushfstring(S->L,"%s: %s precompiled chunk",S->name,why); - luaD_throw(S->L,LUA_ERRSYNTAX); -} - -#define LoadMem(S,b,n,size) LoadBlock(S,b,(n)*(size)) -#define LoadByte(S) (lu_byte)LoadChar(S) -#define LoadVar(S,x) LoadMem(S,&x,1,sizeof(x)) -#define LoadVector(S,b,n,size) LoadMem(S,b,n,size) - -#if !defined(luai_verifycode) -#define luai_verifycode(L,b,f) /* empty */ -#endif - -static void LoadBlock(LoadState* S, void* b, size_t size) -{ - if (luaZ_read(S->Z,b,size)!=0) error(S,"truncated"); -} - -static int LoadChar(LoadState* S) -{ - char x; - LoadVar(S,x); - return x; -} - -static int LoadInt(LoadState* S) -{ - int x; - LoadVar(S,x); - if (x<0) error(S,"corrupted"); - return x; -} - -static lua_Number LoadNumber(LoadState* S) -{ - lua_Number x; - LoadVar(S,x); - return x; -} - -static TString* LoadString(LoadState* S) -{ - size_t size; - LoadVar(S,size); - if (size==0) - return NULL; - else - { - char* s=luaZ_openspace(S->L,S->b,size); - LoadBlock(S,s,size*sizeof(char)); - return luaS_newlstr(S->L,s,size-1); /* remove trailing '\0' */ - } -} - -static void LoadCode(LoadState* S, Proto* f) -{ - int n=LoadInt(S); - f->code=luaM_newvector(S->L,n,Instruction); - f->sizecode=n; - LoadVector(S,f->code,n,sizeof(Instruction)); -} - -static void LoadFunction(LoadState* S, Proto* f); - -static void LoadConstants(LoadState* S, Proto* f) -{ - int i,n; - n=LoadInt(S); - f->k=luaM_newvector(S->L,n,TValue); - f->sizek=n; - for (i=0; ik[i]); - for (i=0; ik[i]; - int t=LoadChar(S); - switch (t) - { - case LUA_TNIL: - setnilvalue(o); - break; - case LUA_TBOOLEAN: - setbvalue(o,LoadChar(S)); - break; - case LUA_TNUMBER: - setnvalue(o,LoadNumber(S)); - break; - case LUA_TSTRING: - setsvalue2n(S->L,o,LoadString(S)); - break; - default: lua_assert(0); - } - } - n=LoadInt(S); - f->p=luaM_newvector(S->L,n,Proto*); - f->sizep=n; - for (i=0; ip[i]=NULL; - for (i=0; ip[i]=luaF_newproto(S->L); - LoadFunction(S,f->p[i]); - } -} - -static void LoadUpvalues(LoadState* S, Proto* f) -{ - int i,n; - n=LoadInt(S); - f->upvalues=luaM_newvector(S->L,n,Upvaldesc); - f->sizeupvalues=n; - for (i=0; iupvalues[i].name=NULL; - for (i=0; iupvalues[i].instack=LoadByte(S); - f->upvalues[i].idx=LoadByte(S); - } -} - -static void LoadDebug(LoadState* S, Proto* f) -{ - int i,n; - f->source=LoadString(S); - n=LoadInt(S); - f->lineinfo=luaM_newvector(S->L,n,int); - f->sizelineinfo=n; - LoadVector(S,f->lineinfo,n,sizeof(int)); - n=LoadInt(S); - f->locvars=luaM_newvector(S->L,n,LocVar); - f->sizelocvars=n; - for (i=0; ilocvars[i].varname=NULL; - for (i=0; ilocvars[i].varname=LoadString(S); - f->locvars[i].startpc=LoadInt(S); - f->locvars[i].endpc=LoadInt(S); - } - n=LoadInt(S); - for (i=0; iupvalues[i].name=LoadString(S); -} - -static void LoadFunction(LoadState* S, Proto* f) -{ 
- f->linedefined=LoadInt(S); - f->lastlinedefined=LoadInt(S); - f->numparams=LoadByte(S); - f->is_vararg=LoadByte(S); - f->maxstacksize=LoadByte(S); - LoadCode(S,f); - LoadConstants(S,f); - LoadUpvalues(S,f); - LoadDebug(S,f); -} - -/* the code below must be consistent with the code in luaU_header */ -#define N0 LUAC_HEADERSIZE -#define N1 (sizeof(LUA_SIGNATURE)-sizeof(char)) -#define N2 N1+2 -#define N3 N2+6 - -static void LoadHeader(LoadState* S) -{ - lu_byte h[LUAC_HEADERSIZE]; - lu_byte s[LUAC_HEADERSIZE]; - luaU_header(h); - memcpy(s,h,sizeof(char)); /* first char already read */ - LoadBlock(S,s+sizeof(char),LUAC_HEADERSIZE-sizeof(char)); - if (memcmp(h,s,N0)==0) return; - if (memcmp(h,s,N1)!=0) error(S,"not a"); - if (memcmp(h,s,N2)!=0) error(S,"version mismatch in"); - if (memcmp(h,s,N3)!=0) error(S,"incompatible"); else error(S,"corrupted"); -} - -/* -** load precompiled chunk -*/ -Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name) -{ - LoadState S; - Closure* cl; - if (*name=='@' || *name=='=') - S.name=name+1; - else if (*name==LUA_SIGNATURE[0]) - S.name="binary string"; - else - S.name=name; - S.L=L; - S.Z=Z; - S.b=buff; - LoadHeader(&S); - cl=luaF_newLclosure(L,1); - setclLvalue(L,L->top,cl); incr_top(L); - cl->l.p=luaF_newproto(L); - LoadFunction(&S,cl->l.p); - if (cl->l.p->sizeupvalues != 1) - { - Proto* p=cl->l.p; - cl=luaF_newLclosure(L,cl->l.p->sizeupvalues); - cl->l.p=p; - setclLvalue(L,L->top-1,cl); - } - luai_verifycode(L,buff,cl->l.p); - return cl; -} - -#define MYINT(s) (s[0]-'0') -#define VERSION MYINT(LUA_VERSION_MAJOR)*16+MYINT(LUA_VERSION_MINOR) -#define FORMAT 0 /* this is the official format */ - -/* -* make header for precompiled chunks -* if you change the code below be sure to update LoadHeader and FORMAT above -* and LUAC_HEADERSIZE in lundump.h -*/ -void luaU_header (lu_byte* h) -{ - int x=1; - memcpy(h,LUA_SIGNATURE,sizeof(LUA_SIGNATURE)-sizeof(char)); - h+=sizeof(LUA_SIGNATURE)-sizeof(char); - *h++=cast_byte(VERSION); - *h++=cast_byte(FORMAT); - *h++=cast_byte(*(char*)&x); /* endianness */ - *h++=cast_byte(sizeof(int)); - *h++=cast_byte(sizeof(size_t)); - *h++=cast_byte(sizeof(Instruction)); - *h++=cast_byte(sizeof(lua_Number)); - *h++=cast_byte(((lua_Number)0.5)==0); /* is lua_Number integral? 
*/ - memcpy(h,LUAC_TAIL,sizeof(LUAC_TAIL)-sizeof(char)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.h +++ /dev/null @@ -1,44 +0,0 @@ -/* -** $Id: lvm.h,v 2.18.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua virtual machine -** See Copyright Notice in lua.h -*/ - -#ifndef lvm_h -#define lvm_h - - -#include "ldo.h" -#include "lobject.h" -#include "ltm.h" - - -#define tostring(L,o) (ttisstring(o) || (luaV_tostring(L, o))) - -#define tonumber(o,n) (ttisnumber(o) || (((o) = luaV_tonumber(o,n)) != NULL)) - -#define equalobj(L,o1,o2) (ttisequal(o1, o2) && luaV_equalobj_(L, o1, o2)) - -#define luaV_rawequalobj(o1,o2) equalobj(NULL,o1,o2) - - -/* not to called directly */ -LUAI_FUNC int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2); - - -LUAI_FUNC int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r); -LUAI_FUNC int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r); -LUAI_FUNC const TValue *luaV_tonumber (const TValue *obj, TValue *n); -LUAI_FUNC int luaV_tostring (lua_State *L, StkId obj); -LUAI_FUNC void luaV_gettable (lua_State *L, const TValue *t, TValue *key, - StkId val); -LUAI_FUNC void luaV_settable (lua_State *L, const TValue *t, TValue *key, - StkId val); -LUAI_FUNC void luaV_finishOp (lua_State *L); -LUAI_FUNC void luaV_execute (lua_State *L); -LUAI_FUNC void luaV_concat (lua_State *L, int total); -LUAI_FUNC void luaV_arith (lua_State *L, StkId ra, const TValue *rb, - const TValue *rc, TMS op); -LUAI_FUNC void luaV_objlen (lua_State *L, StkId ra, const TValue *rb); - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c +++ /dev/null @@ -1,930 +0,0 @@ -/* -** $Id: lvm.c,v 2.155.1.1 2013/04/12 18:48:47 roberto Exp $ -** Lua virtual machine -** See Copyright Notice in lua.h -*/ - - -#include - -#define strcoll(l,r) (strcmp((l),(r))) - -#define lvm_c -#define LUA_CORE - -#include "lua.h" - -#include "ldebug.h" -#include "ldo.h" -#include "lfunc.h" -#include "lgc.h" -#include "lobject.h" -#include "lopcodes.h" -#include "lstate.h" -#include "lstring.h" -#include "ltable.h" -#include "ltm.h" -#include "lvm.h" - - - -/* limit for table tag-method chains (to avoid loops) */ -#define MAXTAGLOOP 100 - - -const TValue *luaV_tonumber (const TValue *obj, TValue *n) { - lua_Number num; - if (ttisnumber(obj)) return obj; - if (ttisstring(obj) && luaO_str2d(svalue(obj), tsvalue(obj)->len, &num)) { - setnvalue(n, num); - return n; - } - else - return NULL; -} - - -int luaV_tostring (lua_State *L, StkId obj) { - if (!ttisnumber(obj)) - return 0; - else { - char s[LUAI_MAXNUMBER2STR]; - lua_Number n = nvalue(obj); - int l = lua_number2str(s, n); - setsvalue2s(L, obj, luaS_newlstr(L, s, l)); - return 1; - } -} - - -static void traceexec (lua_State *L) { - CallInfo *ci = L->ci; - lu_byte mask = L->hookmask; - int counthook = ((mask & LUA_MASKCOUNT) && L->hookcount == 0); - if (counthook) - resethookcount(L); /* reset count */ - if (ci->callstatus & CIST_HOOKYIELD) { /* called hook last time? 
*/ - ci->callstatus &= ~CIST_HOOKYIELD; /* erase mark */ - return; /* do not call hook again (VM yielded, so it did not move) */ - } - if (counthook) - luaD_hook(L, LUA_HOOKCOUNT, -1); /* call count hook */ - if (mask & LUA_MASKLINE) { - Proto *p = ci_func(ci)->p; - int npc = pcRel(ci->u.l.savedpc, p); - int newline = getfuncline(p, npc); - if (npc == 0 || /* call linehook when enter a new function, */ - ci->u.l.savedpc <= L->oldpc || /* when jump back (loop), or when */ - newline != getfuncline(p, pcRel(L->oldpc, p))) /* enter a new line */ - luaD_hook(L, LUA_HOOKLINE, newline); /* call line hook */ - } - L->oldpc = ci->u.l.savedpc; - if (L->status == LUA_YIELD) { /* did hook yield? */ - if (counthook) - L->hookcount = 1; /* undo decrement to zero */ - ci->u.l.savedpc--; /* undo increment (resume will increment it again) */ - ci->callstatus |= CIST_HOOKYIELD; /* mark that it yielded */ - ci->func = L->top - 1; /* protect stack below results */ - luaD_throw(L, LUA_YIELD); - } -} - - -static void callTM (lua_State *L, const TValue *f, const TValue *p1, - const TValue *p2, TValue *p3, int hasres) { - ptrdiff_t result = savestack(L, p3); - setobj2s(L, L->top++, f); /* push function */ - setobj2s(L, L->top++, p1); /* 1st argument */ - setobj2s(L, L->top++, p2); /* 2nd argument */ - if (!hasres) /* no result? 'p3' is third argument */ - setobj2s(L, L->top++, p3); /* 3rd argument */ - /* metamethod may yield only when called from Lua code */ - luaD_call(L, L->top - (4 - hasres), hasres, isLua(L->ci)); - if (hasres) { /* if has result, move it to its place */ - p3 = restorestack(L, result); - setobjs2s(L, p3, --L->top); - } -} - - -void luaV_gettable (lua_State *L, const TValue *t, TValue *key, StkId val) { - int loop; - for (loop = 0; loop < MAXTAGLOOP; loop++) { - const TValue *tm; - if (ttistable(t)) { /* `t' is a table? */ - Table *h = hvalue(t); - const TValue *res = luaH_get(h, key); /* do a primitive get */ - if (!ttisnil(res) || /* result is not nil? */ - (tm = fasttm(L, h->metatable, TM_INDEX)) == NULL) { /* or no TM? */ - setobj2s(L, val, res); - return; - } - /* else will try the tag method */ - } - else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_INDEX))) - luaG_typeerror(L, t, "index"); - if (ttisfunction(tm)) { - callTM(L, tm, t, key, val, 1); - return; - } - t = tm; /* else repeat with 'tm' */ - } - luaG_runerror(L, "loop in gettable"); -} - - -void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) { - int loop; - for (loop = 0; loop < MAXTAGLOOP; loop++) { - const TValue *tm; - if (ttistable(t)) { /* `t' is a table? */ - Table *h = hvalue(t); - TValue *oldval = cast(TValue *, luaH_get(h, key)); - /* if previous value is not nil, there must be a previous entry - in the table; moreover, a metamethod has no relevance */ - if (!ttisnil(oldval) || - /* previous value is nil; must check the metamethod */ - ((tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL && - /* no metamethod; is there a previous entry in the table? */ - (oldval != luaO_nilobject || - /* no previous entry; must create one. (The next test is - always true; we only need the assignment.) 
*/ - (oldval = luaH_newkey(L, h, key), 1)))) { - /* no metamethod and (now) there is an entry with given key */ - setobj2t(L, oldval, val); /* assign new value to that entry */ - invalidateTMcache(h); - luaC_barrierback(L, obj2gco(h), val); - return; - } - /* else will try the metamethod */ - } - else /* not a table; check metamethod */ - if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX))) - luaG_typeerror(L, t, "index"); - /* there is a metamethod */ - if (ttisfunction(tm)) { - callTM(L, tm, t, key, val, 0); - return; - } - t = tm; /* else repeat with 'tm' */ - } - luaG_runerror(L, "loop in settable"); -} - - -static int call_binTM (lua_State *L, const TValue *p1, const TValue *p2, - StkId res, TMS event) { - const TValue *tm = luaT_gettmbyobj(L, p1, event); /* try first operand */ - if (ttisnil(tm)) - tm = luaT_gettmbyobj(L, p2, event); /* try second operand */ - if (ttisnil(tm)) return 0; - callTM(L, tm, p1, p2, res, 1); - return 1; -} - - -static const TValue *get_equalTM (lua_State *L, Table *mt1, Table *mt2, - TMS event) { - const TValue *tm1 = fasttm(L, mt1, event); - const TValue *tm2; - if (tm1 == NULL) return NULL; /* no metamethod */ - if (mt1 == mt2) return tm1; /* same metatables => same metamethods */ - tm2 = fasttm(L, mt2, event); - if (tm2 == NULL) return NULL; /* no metamethod */ - if (luaV_rawequalobj(tm1, tm2)) /* same metamethods? */ - return tm1; - return NULL; -} - - -static int call_orderTM (lua_State *L, const TValue *p1, const TValue *p2, - TMS event) { - if (!call_binTM(L, p1, p2, L->top, event)) - return -1; /* no metamethod */ - else - return !l_isfalse(L->top); -} - - -static int l_strcmp (const TString *ls, const TString *rs) { - const char *l = getstr(ls); - size_t ll = ls->tsv.len; - const char *r = getstr(rs); - size_t lr = rs->tsv.len; - for (;;) { - int temp = strcoll(l, r); - if (temp != 0) return temp; - else { /* strings are equal up to a `\0' */ - size_t len = strlen(l); /* index of first `\0' in both strings */ - if (len == lr) /* r is finished? */ - return (len == ll) ? 0 : 1; - else if (len == ll) /* l is finished? */ - return -1; /* l is smaller than r (because r is not finished) */ - /* both strings longer than `len'; go on comparing (after the `\0') */ - len++; - l += len; ll -= len; r += len; lr -= len; - } - } -} - - -int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r) { - int res; - if (ttisnumber(l) && ttisnumber(r)) - return luai_numlt(L, nvalue(l), nvalue(r)); - else if (ttisstring(l) && ttisstring(r)) - return l_strcmp(rawtsvalue(l), rawtsvalue(r)) < 0; - else if ((res = call_orderTM(L, l, r, TM_LT)) < 0) - luaG_ordererror(L, l, r); - return res; -} - - -int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r) { - int res; - if (ttisnumber(l) && ttisnumber(r)) - return luai_numle(L, nvalue(l), nvalue(r)); - else if (ttisstring(l) && ttisstring(r)) - return l_strcmp(rawtsvalue(l), rawtsvalue(r)) <= 0; - else if ((res = call_orderTM(L, l, r, TM_LE)) >= 0) /* first try `le' */ - return res; - else if ((res = call_orderTM(L, r, l, TM_LT)) < 0) /* else try `lt' */ - luaG_ordererror(L, l, r); - return !res; -} - - -/* -** equality of Lua values. L == NULL means raw equality (no metamethods) -*/ -int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2) { - const TValue *tm; - lua_assert(ttisequal(t1, t2)); - switch (ttype(t1)) { - case LUA_TNIL: return 1; - case LUA_TNUMBER: return luai_numeq(nvalue(t1), nvalue(t2)); - case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); /* true must be 1 !! 
*/ - case LUA_TLIGHTUSERDATA: return pvalue(t1) == pvalue(t2); - case LUA_TLCF: return fvalue(t1) == fvalue(t2); - case LUA_TSHRSTR: return eqshrstr(rawtsvalue(t1), rawtsvalue(t2)); - case LUA_TLNGSTR: return luaS_eqlngstr(rawtsvalue(t1), rawtsvalue(t2)); - case LUA_TUSERDATA: { - if (uvalue(t1) == uvalue(t2)) return 1; - else if (L == NULL) return 0; - tm = get_equalTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ); - break; /* will try TM */ - } - case LUA_TTABLE: { - if (hvalue(t1) == hvalue(t2)) return 1; - else if (L == NULL) return 0; - tm = get_equalTM(L, hvalue(t1)->metatable, hvalue(t2)->metatable, TM_EQ); - break; /* will try TM */ - } - default: - lua_assert(iscollectable(t1)); - return gcvalue(t1) == gcvalue(t2); - } - if (tm == NULL) return 0; /* no TM? */ - callTM(L, tm, t1, t2, L->top, 1); /* call TM */ - return !l_isfalse(L->top); -} - - -void luaV_concat (lua_State *L, int total) { - lua_assert(total >= 2); - do { - StkId top = L->top; - int n = 2; /* number of elements handled in this pass (at least 2) */ - if (!(ttisstring(top-2) || ttisnumber(top-2)) || !tostring(L, top-1)) { - if (!call_binTM(L, top-2, top-1, top-2, TM_CONCAT)) - luaG_concaterror(L, top-2, top-1); - } - else if (tsvalue(top-1)->len == 0) /* second operand is empty? */ - (void)tostring(L, top - 2); /* result is first operand */ - else if (ttisstring(top-2) && tsvalue(top-2)->len == 0) { - setobjs2s(L, top - 2, top - 1); /* result is second op. */ - } - else { - /* at least two non-empty string values; get as many as possible */ - size_t tl = tsvalue(top-1)->len; - char *buffer; - int i; - /* collect total length */ - for (i = 1; i < total && tostring(L, top-i-1); i++) { - size_t l = tsvalue(top-i-1)->len; - if (l >= (MAX_SIZET/sizeof(char)) - tl) - luaG_runerror(L, "string length overflow"); - tl += l; - } - buffer = luaZ_openspace(L, &G(L)->buff, tl); - tl = 0; - n = i; - do { /* concat all strings */ - size_t l = tsvalue(top-i)->len; - memcpy(buffer+tl, svalue(top-i), l * sizeof(char)); - tl += l; - } while (--i > 0); - setsvalue2s(L, top-n, luaS_newlstr(L, buffer, tl)); - } - total -= n-1; /* got 'n' strings to create 1 new */ - L->top -= n-1; /* popped 'n' strings and pushed one */ - } while (total > 1); /* repeat until only 1 result left */ -} - - -void luaV_objlen (lua_State *L, StkId ra, const TValue *rb) { - const TValue *tm; - switch (ttypenv(rb)) { - case LUA_TTABLE: { - Table *h = hvalue(rb); - tm = fasttm(L, h->metatable, TM_LEN); - if (tm) break; /* metamethod? break switch to call it */ - setnvalue(ra, cast_num(luaH_getn(h))); /* else primitive len */ - return; - } - case LUA_TSTRING: { - setnvalue(ra, cast_num(tsvalue(rb)->len)); - return; - } - default: { /* try metamethod */ - tm = luaT_gettmbyobj(L, rb, TM_LEN); - if (ttisnil(tm)) /* no metamethod? */ - luaG_typeerror(L, rb, "get length of"); - break; - } - } - callTM(L, tm, rb, rb, ra, 1); -} - -/* - * luaV_div and luaV_mod patched in from Lua 5.3.2 in order to properly handle - * div/mod by zero (instead of crashing, which is the default behavior in - * Lua 5.2) - */ - -/* -** Integer division; return 'm // n', that is, floor(m/n). -** C division truncates its result (rounds towards zero). -** 'floor(q) == trunc(q)' when 'q >= 0' or when 'q' is integer, -** otherwise 'floor(q) == trunc(q) - 1'. 
-*/ -static lua_Number luaV_div (lua_State *L, lua_Number m, lua_Number n) { - if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */ - if (n == 0) - luaG_runerror(L, "attempt to divide by zero"); - return (0 - m); /* n==-1; avoid overflow with 0x80000...//-1 */ - } - else { - lua_Number q = m / n; /* perform C division */ - if ((m ^ n) < 0 && m % n != 0) /* 'm/n' would be negative non-integer? */ - q -= 1; /* correct result for different rounding */ - return q; - } -} - - -/* -** Integer modulus; return 'm % n'. (Assume that C '%' with -** negative operands follows C99 behavior. See previous comment -** about luaV_div.) -*/ -static lua_Number luaV_mod (lua_State *L, lua_Number m, lua_Number n) { - if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */ - if (n == 0) - luaG_runerror(L, "attempt to perform 'n%%0'"); - return 0; /* m % -1 == 0; avoid overflow with 0x80000...%-1 */ - } - else { - lua_Number r = m % n; - if (r != 0 && (m ^ n) < 0) /* 'm/n' would be non-integer negative? */ - r += n; /* correct result for different rounding */ - return r; - } -} - -/* - * End patch from 5.3.2 - */ - -void luaV_arith (lua_State *L, StkId ra, const TValue *rb, - const TValue *rc, TMS op) { - TValue tempb, tempc; - const TValue *b, *c; - if ((b = luaV_tonumber(rb, &tempb)) != NULL && - (c = luaV_tonumber(rc, &tempc)) != NULL) { - /* - * Patched: if dividing or modding, use patched functions from 5.3 - */ - lua_Number res; - int lop = op - TM_ADD + LUA_OPADD; - if (lop == LUA_OPDIV) { - res = luaV_div(L, nvalue(b), nvalue(c)); - } else if (lop == LUA_OPMOD) { - res = luaV_mod(L, nvalue(b), nvalue(c)); - } else { - res = luaO_arith(op - TM_ADD + LUA_OPADD, nvalue(b), nvalue(c)); - } - setnvalue(ra, res); - } - else if (!call_binTM(L, rb, rc, ra, op)) - luaG_aritherror(L, rb, rc); -} - - -/* -** check whether cached closure in prototype 'p' may be reused, that is, -** whether there is a cached closure with the same upvalues needed by -** new closure to be created. -*/ -static Closure *getcached (Proto *p, UpVal **encup, StkId base) { - Closure *c = p->cache; - if (c != NULL) { /* is there a cached closure? */ - int nup = p->sizeupvalues; - Upvaldesc *uv = p->upvalues; - int i; - for (i = 0; i < nup; i++) { /* check whether it has right upvalues */ - TValue *v = uv[i].instack ? base + uv[i].idx : encup[uv[i].idx]->v; - if (c->l.upvals[i]->v != v) - return NULL; /* wrong upvalue; cannot reuse closure */ - } - } - return c; /* return cached closure (or NULL if no cached closure) */ -} - - -/* -** create a new Lua closure, push it in the stack, and initialize -** its upvalues. Note that the call to 'luaC_barrierproto' must come -** before the assignment to 'p->cache', as the function needs the -** original value of that field. -*/ -static void pushclosure (lua_State *L, Proto *p, UpVal **encup, StkId base, - StkId ra) { - int nup = p->sizeupvalues; - Upvaldesc *uv = p->upvalues; - int i; - Closure *ncl = luaF_newLclosure(L, nup); - ncl->l.p = p; - setclLvalue(L, ra, ncl); /* anchor new closure in stack */ - for (i = 0; i < nup; i++) { /* fill in its upvalues */ - if (uv[i].instack) /* upvalue refers to local variable? 
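The luaV_div() and luaV_mod() routines shown above were back-ported from Lua 5.3.2 so that division and modulus on the integer lua_Number are floored rather than truncated, and so that n == 0 and n == -1 are handled explicitly instead of faulting. A standalone sketch of just the rounding correction, with the zero and minus-one guards omitted and hypothetical demo_* names:

#include <stdint.h>
#include <stdio.h>

static int64_t
demo_floordiv(int64_t m, int64_t n)
{
        int64_t q = m / n;                      /* C division truncates toward zero */

        if ((m ^ n) < 0 && m % n != 0)          /* signs differ and result is inexact */
                q -= 1;                         /* round toward negative infinity instead */
        return (q);
}

static int64_t
demo_floormod(int64_t m, int64_t n)
{
        int64_t r = m % n;

        if (r != 0 && (m ^ n) < 0)              /* same correction for the remainder */
                r += n;                         /* result takes the sign of n */
        return (r);
}

int
main(void)
{
        /* plain C gives -7/2 == -3 and -7%2 == -1; floored results are -4 and 1 */
        printf("%lld %lld\n", (long long)demo_floordiv(-7, 2),
            (long long)demo_floormod(-7, 2));
        return (0);
}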
*/ - ncl->l.upvals[i] = luaF_findupval(L, base + uv[i].idx); - else /* get upvalue from enclosing function */ - ncl->l.upvals[i] = encup[uv[i].idx]; - } - luaC_barrierproto(L, p, ncl); - p->cache = ncl; /* save it on cache for reuse */ -} - - -/* -** finish execution of an opcode interrupted by an yield -*/ -void luaV_finishOp (lua_State *L) { - CallInfo *ci = L->ci; - StkId base = ci->u.l.base; - Instruction inst = *(ci->u.l.savedpc - 1); /* interrupted instruction */ - OpCode op = GET_OPCODE(inst); - switch (op) { /* finish its execution */ - case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV: - case OP_MOD: case OP_POW: case OP_UNM: case OP_LEN: - case OP_GETTABUP: case OP_GETTABLE: case OP_SELF: { - setobjs2s(L, base + GETARG_A(inst), --L->top); - break; - } - case OP_LE: case OP_LT: case OP_EQ: { - int res = !l_isfalse(L->top - 1); - L->top--; - /* metamethod should not be called when operand is K */ - lua_assert(!ISK(GETARG_B(inst))); - if (op == OP_LE && /* "<=" using "<" instead? */ - ttisnil(luaT_gettmbyobj(L, base + GETARG_B(inst), TM_LE))) - res = !res; /* invert result */ - lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_JMP); - if (res != GETARG_A(inst)) /* condition failed? */ - ci->u.l.savedpc++; /* skip jump instruction */ - break; - } - case OP_CONCAT: { - StkId top = L->top - 1; /* top when 'call_binTM' was called */ - int b = GETARG_B(inst); /* first element to concatenate */ - int total = cast_int(top - 1 - (base + b)); /* yet to concatenate */ - setobj2s(L, top - 2, top); /* put TM result in proper position */ - if (total > 1) { /* are there elements to concat? */ - L->top = top - 1; /* top is one after last element (at top-2) */ - luaV_concat(L, total); /* concat them (may yield again) */ - } - /* move final result to final position */ - setobj2s(L, ci->u.l.base + GETARG_A(inst), L->top - 1); - L->top = ci->top; /* restore top */ - break; - } - case OP_TFORCALL: { - lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_TFORLOOP); - L->top = ci->top; /* correct top */ - break; - } - case OP_CALL: { - if (GETARG_C(inst) - 1 >= 0) /* nresults >= 0? */ - L->top = ci->top; /* adjust results */ - break; - } - case OP_TAILCALL: case OP_SETTABUP: case OP_SETTABLE: - break; - default: lua_assert(0); - } -} - - - -/* -** some macros for common tasks in `luaV_execute' -*/ - -#if !defined luai_runtimecheck -#define luai_runtimecheck(L, c) /* void */ -#endif - - -#define RA(i) (base+GETARG_A(i)) -/* to be used after possible stack reallocation */ -#define RB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgR, base+GETARG_B(i)) -#define RC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgR, base+GETARG_C(i)) -#define RKB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgK, \ - ISK(GETARG_B(i)) ? k+INDEXK(GETARG_B(i)) : base+GETARG_B(i)) -#define RKC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgK, \ - ISK(GETARG_C(i)) ? k+INDEXK(GETARG_C(i)) : base+GETARG_C(i)) -#define KBx(i) \ - (k + (GETARG_Bx(i) != 0 ? 
GETARG_Bx(i) - 1 : GETARG_Ax(*ci->u.l.savedpc++))) - - -/* execute a jump instruction */ -#define dojump(ci,i,e) \ - { int a = GETARG_A(i); \ - if (a > 0) luaF_close(L, ci->u.l.base + a - 1); \ - ci->u.l.savedpc += GETARG_sBx(i) + e; } - -/* for test instructions, execute the jump instruction that follows it */ -#define donextjump(ci) { i = *ci->u.l.savedpc; dojump(ci, i, 1); } - - -#define Protect(x) { {x;}; base = ci->u.l.base; } - -#define checkGC(L,c) \ - Protect( luaC_condGC(L,{L->top = (c); /* limit of live values */ \ - luaC_step(L); \ - L->top = ci->top;}) /* restore top */ \ - luai_threadyield(L); ) - - -#define arith_op(op,tm) { \ - TValue *rb = RKB(i); \ - TValue *rc = RKC(i); \ - if (ttisnumber(rb) && ttisnumber(rc)) { \ - lua_Number nb = nvalue(rb), nc = nvalue(rc); \ - setnvalue(ra, op(L, nb, nc)); \ - } \ - else { Protect(luaV_arith(L, ra, rb, rc, tm)); } } - - -#define vmdispatch(o) switch(o) -#define vmcase(l,b) case l: {b} break; -#define vmcasenb(l,b) case l: {b} /* nb = no break */ - -void luaV_execute (lua_State *L) { - CallInfo *ci = L->ci; - LClosure *cl; - TValue *k; - StkId base; - newframe: /* reentry point when frame changes (call/return) */ - lua_assert(ci == L->ci); - cl = clLvalue(ci->func); - k = cl->p->k; - base = ci->u.l.base; - /* main loop of interpreter */ - for (;;) { - Instruction i = *(ci->u.l.savedpc++); - StkId ra; - if ((L->hookmask & (LUA_MASKLINE | LUA_MASKCOUNT)) && - (--L->hookcount == 0 || L->hookmask & LUA_MASKLINE)) { - Protect(traceexec(L)); - } - /* WARNING: several calls may realloc the stack and invalidate `ra' */ - ra = RA(i); - lua_assert(base == ci->u.l.base); - lua_assert(base <= L->top && L->top < L->stack + L->stacksize); - vmdispatch (GET_OPCODE(i)) { - vmcase(OP_MOVE, - setobjs2s(L, ra, RB(i)); - ) - vmcase(OP_LOADK, - TValue *rb = k + GETARG_Bx(i); - setobj2s(L, ra, rb); - ) - vmcase(OP_LOADKX, - TValue *rb; - lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG); - rb = k + GETARG_Ax(*ci->u.l.savedpc++); - setobj2s(L, ra, rb); - ) - vmcase(OP_LOADBOOL, - setbvalue(ra, GETARG_B(i)); - if (GETARG_C(i)) ci->u.l.savedpc++; /* skip next instruction (if C) */ - ) - vmcase(OP_LOADNIL, - int b = GETARG_B(i); - do { - setnilvalue(ra++); - } while (b--); - ) - vmcase(OP_GETUPVAL, - int b = GETARG_B(i); - setobj2s(L, ra, cl->upvals[b]->v); - ) - vmcase(OP_GETTABUP, - int b = GETARG_B(i); - Protect(luaV_gettable(L, cl->upvals[b]->v, RKC(i), ra)); - ) - vmcase(OP_GETTABLE, - Protect(luaV_gettable(L, RB(i), RKC(i), ra)); - ) - vmcase(OP_SETTABUP, - int a = GETARG_A(i); - Protect(luaV_settable(L, cl->upvals[a]->v, RKB(i), RKC(i))); - ) - vmcase(OP_SETUPVAL, - UpVal *uv = cl->upvals[GETARG_B(i)]; - setobj(L, uv->v, ra); - luaC_barrier(L, uv, ra); - ) - vmcase(OP_SETTABLE, - Protect(luaV_settable(L, ra, RKB(i), RKC(i))); - ) - vmcase(OP_NEWTABLE, - int b = GETARG_B(i); - int c = GETARG_C(i); - Table *t = luaH_new(L); - sethvalue(L, ra, t); - if (b != 0 || c != 0) - luaH_resize(L, t, luaO_fb2int(b), luaO_fb2int(c)); - checkGC(L, ra + 1); - ) - vmcase(OP_SELF, - StkId rb = RB(i); - setobjs2s(L, ra+1, rb); - Protect(luaV_gettable(L, rb, RKC(i), ra)); - ) - vmcase(OP_ADD, - arith_op(luai_numadd, TM_ADD); - ) - vmcase(OP_SUB, - arith_op(luai_numsub, TM_SUB); - ) - vmcase(OP_MUL, - arith_op(luai_nummul, TM_MUL); - ) - /* - * Patched: use luaV_* instead of luai_* to handle div/mod by 0 - */ - vmcase(OP_DIV, - arith_op(luaV_div, TM_DIV); - ) - vmcase(OP_MOD, - arith_op(luaV_mod, TM_MOD); - ) - vmcase(OP_POW, - arith_op(luai_numpow, TM_POW); - ) - 
vmcase(OP_UNM, - TValue *rb = RB(i); - if (ttisnumber(rb)) { - lua_Number nb = nvalue(rb); - setnvalue(ra, luai_numunm(L, nb)); - } - else { - Protect(luaV_arith(L, ra, rb, rb, TM_UNM)); - } - ) - vmcase(OP_NOT, - TValue *rb = RB(i); - int res = l_isfalse(rb); /* next assignment may change this value */ - setbvalue(ra, res); - ) - vmcase(OP_LEN, - Protect(luaV_objlen(L, ra, RB(i))); - ) - vmcase(OP_CONCAT, - int b = GETARG_B(i); - int c = GETARG_C(i); - StkId rb; - L->top = base + c + 1; /* mark the end of concat operands */ - Protect(luaV_concat(L, c - b + 1)); - ra = RA(i); /* 'luav_concat' may invoke TMs and move the stack */ - rb = b + base; - setobjs2s(L, ra, rb); - checkGC(L, (ra >= rb ? ra + 1 : rb)); - L->top = ci->top; /* restore top */ - ) - vmcase(OP_JMP, - dojump(ci, i, 0); - ) - vmcase(OP_EQ, - TValue *rb = RKB(i); - TValue *rc = RKC(i); - Protect( - if (cast_int(equalobj(L, rb, rc)) != GETARG_A(i)) - ci->u.l.savedpc++; - else - donextjump(ci); - ) - ) - vmcase(OP_LT, - Protect( - if (luaV_lessthan(L, RKB(i), RKC(i)) != GETARG_A(i)) - ci->u.l.savedpc++; - else - donextjump(ci); - ) - ) - vmcase(OP_LE, - Protect( - if (luaV_lessequal(L, RKB(i), RKC(i)) != GETARG_A(i)) - ci->u.l.savedpc++; - else - donextjump(ci); - ) - ) - vmcase(OP_TEST, - if (GETARG_C(i) ? l_isfalse(ra) : !l_isfalse(ra)) - ci->u.l.savedpc++; - else - donextjump(ci); - ) - vmcase(OP_TESTSET, - TValue *rb = RB(i); - if (GETARG_C(i) ? l_isfalse(rb) : !l_isfalse(rb)) - ci->u.l.savedpc++; - else { - setobjs2s(L, ra, rb); - donextjump(ci); - } - ) - vmcase(OP_CALL, - int b = GETARG_B(i); - int nresults = GETARG_C(i) - 1; - if (b != 0) L->top = ra+b; /* else previous instruction set top */ - if (luaD_precall(L, ra, nresults)) { /* C function? */ - if (nresults >= 0) L->top = ci->top; /* adjust results */ - base = ci->u.l.base; - } - else { /* Lua function */ - ci = L->ci; - ci->callstatus |= CIST_REENTRY; - goto newframe; /* restart luaV_execute over new Lua function */ - } - ) - vmcase(OP_TAILCALL, - int b = GETARG_B(i); - if (b != 0) L->top = ra+b; /* else previous instruction set top */ - lua_assert(GETARG_C(i) - 1 == LUA_MULTRET); - if (luaD_precall(L, ra, LUA_MULTRET)) /* C function? 
*/ - base = ci->u.l.base; - else { - /* tail call: put called frame (n) in place of caller one (o) */ - CallInfo *nci = L->ci; /* called frame */ - CallInfo *oci = nci->previous; /* caller frame */ - StkId nfunc = nci->func; /* called function */ - StkId ofunc = oci->func; /* caller function */ - /* last stack slot filled by 'precall' */ - StkId lim = nci->u.l.base + getproto(nfunc)->numparams; - int aux; - /* close all upvalues from previous call */ - if (cl->p->sizep > 0) luaF_close(L, oci->u.l.base); - /* move new frame into old one */ - for (aux = 0; nfunc + aux < lim; aux++) - setobjs2s(L, ofunc + aux, nfunc + aux); - oci->u.l.base = ofunc + (nci->u.l.base - nfunc); /* correct base */ - oci->top = L->top = ofunc + (L->top - nfunc); /* correct top */ - oci->u.l.savedpc = nci->u.l.savedpc; - oci->callstatus |= CIST_TAIL; /* function was tail called */ - ci = L->ci = oci; /* remove new frame */ - lua_assert(L->top == oci->u.l.base + getproto(ofunc)->maxstacksize); - goto newframe; /* restart luaV_execute over new Lua function */ - } - ) - vmcasenb(OP_RETURN, - int b = GETARG_B(i); - if (b != 0) L->top = ra+b-1; - if (cl->p->sizep > 0) luaF_close(L, base); - b = luaD_poscall(L, ra); - if (!(ci->callstatus & CIST_REENTRY)) /* 'ci' still the called one */ - return; /* external invocation: return */ - else { /* invocation via reentry: continue execution */ - ci = L->ci; - if (b) L->top = ci->top; - lua_assert(isLua(ci)); - lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL); - goto newframe; /* restart luaV_execute over new Lua function */ - } - ) - vmcase(OP_FORLOOP, - lua_Number step = nvalue(ra+2); - lua_Number idx = luai_numadd(L, nvalue(ra), step); /* increment index */ - lua_Number limit = nvalue(ra+1); - if (luai_numlt(L, 0, step) ? luai_numle(L, idx, limit) - : luai_numle(L, limit, idx)) { - ci->u.l.savedpc += GETARG_sBx(i); /* jump back */ - setnvalue(ra, idx); /* update internal index... */ - setnvalue(ra+3, idx); /* ...and external index */ - } - ) - vmcase(OP_FORPREP, - const TValue *init = ra; - const TValue *plimit = ra+1; - const TValue *pstep = ra+2; - if (!tonumber(init, ra)) - luaG_runerror(L, LUA_QL("for") " initial value must be a number"); - else if (!tonumber(plimit, ra+1)) - luaG_runerror(L, LUA_QL("for") " limit must be a number"); - else if (!tonumber(pstep, ra+2)) - luaG_runerror(L, LUA_QL("for") " step must be a number"); - setnvalue(ra, luai_numsub(L, nvalue(ra), nvalue(pstep))); - ci->u.l.savedpc += GETARG_sBx(i); - ) - vmcasenb(OP_TFORCALL, - StkId cb = ra + 3; /* call base */ - setobjs2s(L, cb+2, ra+2); - setobjs2s(L, cb+1, ra+1); - setobjs2s(L, cb, ra); - L->top = cb + 3; /* func. + 2 args (state and index) */ - Protect(luaD_call(L, cb, GETARG_C(i), 1)); - L->top = ci->top; - i = *(ci->u.l.savedpc++); /* go to next instruction */ - ra = RA(i); - lua_assert(GET_OPCODE(i) == OP_TFORLOOP); - goto l_tforloop; - ) - vmcase(OP_TFORLOOP, - l_tforloop: - if (!ttisnil(ra + 1)) { /* continue loop? */ - setobjs2s(L, ra, ra + 1); /* save control variable */ - ci->u.l.savedpc += GETARG_sBx(i); /* jump back */ - } - ) - vmcase(OP_SETLIST, - int n = GETARG_B(i); - int c = GETARG_C(i); - int last; - Table *h; - if (n == 0) n = cast_int(L->top - ra) - 1; - if (c == 0) { - lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG); - c = GETARG_Ax(*ci->u.l.savedpc++); - } - luai_runtimecheck(L, ttistable(ra)); - h = hvalue(ra); - last = ((c-1)*LFIELDS_PER_FLUSH) + n; - if (last > h->sizearray) /* needs more space? 
*/ - luaH_resizearray(L, h, last); /* pre-allocate it at once */ - for (; n > 0; n--) { - TValue *val = ra+n; - luaH_setint(L, h, last--, val); - luaC_barrierback(L, obj2gco(h), val); - } - L->top = ci->top; /* correct top (in case of previous open call) */ - ) - vmcase(OP_CLOSURE, - Proto *p = cl->p->p[GETARG_Bx(i)]; - Closure *ncl = getcached(p, cl->upvals, base); /* cached closure */ - if (ncl == NULL) /* no match? */ - pushclosure(L, p, cl->upvals, base, ra); /* create a new one */ - else - setclLvalue(L, ra, ncl); /* push cashed closure */ - checkGC(L, ra + 1); - ) - vmcase(OP_VARARG, - int b = GETARG_B(i) - 1; - int j; - int n = cast_int(base - ci->func) - cl->p->numparams - 1; - if (b < 0) { /* B == 0? */ - b = n; /* get all var. arguments */ - Protect(luaD_checkstack(L, n)); - ra = RA(i); /* previous call may change the stack */ - L->top = ra + n; - } - for (j = 0; j < b; j++) { - if (j < n) { - setobjs2s(L, ra + j, base - n + j); - } - else { - setnilvalue(ra + j); - } - } - ) - vmcase(OP_EXTRAARG, - lua_assert(0); - ) - } - } -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.h +++ /dev/null @@ -1,65 +0,0 @@ -/* -** $Id: lzio.h,v 1.26.1.1 2013/04/12 18:48:47 roberto Exp $ -** Buffered streams -** See Copyright Notice in lua.h -*/ - - -#ifndef lzio_h -#define lzio_h - -#include "lua.h" - -#include "lmem.h" - - -#define EOZ (-1) /* end of stream */ - -typedef struct Zio ZIO; - -#define zgetc(z) (((z)->n--)>0 ? cast_uchar(*(z)->p++) : luaZ_fill(z)) - - -typedef struct Mbuffer { - char *buffer; - size_t n; - size_t buffsize; -} Mbuffer; - -#define luaZ_initbuffer(L, buff) ((buff)->buffer = NULL, (buff)->buffsize = 0) - -#define luaZ_buffer(buff) ((buff)->buffer) -#define luaZ_sizebuffer(buff) ((buff)->buffsize) -#define luaZ_bufflen(buff) ((buff)->n) - -#define luaZ_resetbuffer(buff) ((buff)->n = 0) - - -#define luaZ_resizebuffer(L, buff, size) \ - (luaM_reallocvector(L, (buff)->buffer, (buff)->buffsize, size, char), \ - (buff)->buffsize = size) - -#define luaZ_freebuffer(L, buff) luaZ_resizebuffer(L, buff, 0) - - -LUAI_FUNC char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n); -LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, - void *data); -LUAI_FUNC size_t luaZ_read (ZIO* z, void* b, size_t n); /* read next n bytes */ - - - -/* --------- Private Part ------------------ */ - -struct Zio { - size_t n; /* bytes still unread */ - const char *p; /* current position in buffer */ - lua_Reader reader; /* reader function */ - void* data; /* additional data */ - lua_State *L; /* Lua state (for reader) */ -}; - - -LUAI_FUNC int luaZ_fill (ZIO *z); - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c +++ /dev/null @@ -1,76 +0,0 @@ -/* -** $Id: lzio.c,v 1.35.1.1 2013/04/12 18:48:47 roberto Exp $ -** Buffered streams -** See Copyright Notice in lua.h -*/ - - -#include - -#define lzio_c -#define LUA_CORE - -#include "lua.h" - -#include "llimits.h" -#include "lmem.h" -#include "lstate.h" -#include "lzio.h" - - -int luaZ_fill (ZIO *z) { - size_t size; - lua_State *L = z->L; - const char *buff; - lua_unlock(L); - buff = z->reader(L, z->data, &size); - lua_lock(L); - if (buff == NULL || size == 0) - return EOZ; - z->n = size - 1; /* discount char being returned */ - z->p = buff; - 
return cast_uchar(*(z->p++)); -} - - -void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) { - z->L = L; - z->reader = reader; - z->data = data; - z->n = 0; - z->p = NULL; -} - - -/* --------------------------------------------------------------- read --- */ -size_t luaZ_read (ZIO *z, void *b, size_t n) { - while (n) { - size_t m; - if (z->n == 0) { /* no bytes in buffer? */ - if (luaZ_fill(z) == EOZ) /* try to read more */ - return n; /* no more input; return number of missing bytes */ - else { - z->n++; /* luaZ_fill consumed first byte; put it back */ - z->p--; - } - } - m = (n <= z->n) ? n : z->n; /* min. between n and z->n */ - memcpy(b, z->p, m); - z->n -= m; - z->p += m; - b = (char *)b + m; - n -= m; - } - return 0; -} - -/* ------------------------------------------------------------------------ */ -char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n) { - if (n > buff->buffsize) { - if (n < LUA_MINBUFFER) n = LUA_MINBUFFER; - luaZ_resizebuffer(L, buff, n); - } - return buff->buffer; -} - - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c +++ /dev/null @@ -1,129 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -/* - * We keep our own copy of this algorithm for 3 main reasons: - * 1. If we didn't, anyone modifying common/os/compress.c would - * directly break our on disk format - * 2. Our version of lzjb does not have a number of checks that the - * common/os version needs and uses - * 3. We initialize the lempel to ensure deterministic results, - * so that identical blocks can always be deduplicated. - * In particular, we are adding the "feature" that compress() can - * take a destination buffer size and returns the compressed length, or the - * source length if compression would overflow the destination buffer. 
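As the comment above explains, this copy of lzjb_compress() reports "output would not fit" by returning the source length instead of a compressed length, so a caller compares the return value against s_len to decide whether to store the block compressed or raw. A sketch of that calling pattern follows; the demo_* identifiers are placeholders rather than the actual ZFS call sites, and the sketch assumes it is linked against the lzjb_compress() shown above:

#include <stddef.h>

/* prototype of the routine being removed above */
size_t lzjb_compress(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n);

/*
 * Compress src into dst if that actually saves space.  Sets *compressed
 * and returns the number of bytes to store: lzjb_compress() hands back
 * s_len when the result would overflow d_len, in which case the caller
 * falls back to storing the block uncompressed.
 */
size_t
demo_pack_block(void *src, void *dst, size_t s_len, size_t d_len,
    int *compressed)
{
        size_t c_len = lzjb_compress(src, dst, s_len, d_len, 0);

        *compressed = (c_len < s_len);
        return (*compressed ? c_len : s_len);
}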
- */ - -#include -#include -#include - -#define MATCH_BITS 6 -#define MATCH_MIN 3 -#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) -#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) -#define LEMPEL_SIZE 1024 - -/*ARGSUSED*/ -size_t -lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *cpy; - uchar_t *copymap = NULL; - int copymask = 1 << (NBBY - 1); - int mlen, offset, hash; - uint16_t *hp; - uint16_t lempel[LEMPEL_SIZE] = { 0 }; - - while (src < (uchar_t *)s_start + s_len) { - if ((copymask <<= 1) == (1 << NBBY)) { - if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) - return (s_len); - copymask = 1; - copymap = dst; - *dst++ = 0; - } - if (src > (uchar_t *)s_start + s_len - MATCH_MAX) { - *dst++ = *src++; - continue; - } - hash = (src[0] << 16) + (src[1] << 8) + src[2]; - hash += hash >> 9; - hash += hash >> 5; - hp = &lempel[hash & (LEMPEL_SIZE - 1)]; - offset = (intptr_t)(src - *hp) & OFFSET_MASK; - *hp = (uint16_t)(uintptr_t)src; - cpy = src - offset; - if (cpy >= (uchar_t *)s_start && cpy != src && - src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) { - *copymap |= copymask; - for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++) - if (src[mlen] != cpy[mlen]) - break; - *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | - (offset >> NBBY); - *dst++ = (uchar_t)offset; - src += mlen; - } else { - *dst++ = *src++; - } - } - return (dst - (uchar_t *)d_start); -} - -/*ARGSUSED*/ -int -lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *d_end = (uchar_t *)d_start + d_len; - uchar_t *cpy; - uchar_t copymap = 0; - int copymask = 1 << (NBBY - 1); - - while (dst < d_end) { - if ((copymask <<= 1) == (1 << NBBY)) { - copymask = 1; - copymap = *src++; - } - if (copymap & copymask) { - int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN; - int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK; - src += 2; - if ((cpy = dst - offset) < (uchar_t *)d_start) - return (-1); - if (mlen > (d_end - dst)) - mlen = d_end - dst; - while (--mlen >= 0) - *dst++ = *cpy++; - } else { - *dst++ = *src++; - } - } - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ /dev/null @@ -1,4624 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
- * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017, Intel Corporation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS metaslab"); - -#define GANG_ALLOCATION(flags) \ - ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) - -uint64_t metaslab_aliquot = 512ULL << 10; -uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ -SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, force_ganging, CTLFLAG_RWTUN, - &metaslab_force_ganging, 0, - "Force gang block allocation for blocks larger than or equal to this value"); - -/* - * Since we can touch multiple metaslabs (and their respective space maps) - * with each transaction group, we benefit from having a smaller space map - * block size since it allows us to issue more I/O operations scattered - * around the disk. - */ -int zfs_metaslab_sm_blksz = (1 << 12); -SYSCTL_INT(_vfs_zfs, OID_AUTO, metaslab_sm_blksz, CTLFLAG_RDTUN, - &zfs_metaslab_sm_blksz, 0, - "Block size for metaslab DTL space map. Power of 2 and greater than 4096."); - -/* - * The in-core space map representation is more compact than its on-disk form. - * The zfs_condense_pct determines how much more compact the in-core - * space map representation must be before we compact it on-disk. - * Values should be greater than or equal to 100. - */ -int zfs_condense_pct = 200; -SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, - &zfs_condense_pct, 0, - "Condense on-disk spacemap when it is more than this many percents" - " of in-memory counterpart"); - -/* - * Condensing a metaslab is not guaranteed to actually reduce the amount of - * space used on disk. In particular, a space map uses data in increments of - * MAX(1 << ashift, space_map_blksize), so a metaslab might use the - * same number of blocks after condensing. Since the goal of condensing is to - * reduce the number of IOPs required to read the space map, we only want to - * condense when we can be sure we will reduce the number of blocks used by the - * space map. Unfortunately, we cannot precisely compute whether or not this is - * the case in metaslab_should_condense since we are holding ms_lock. Instead, - * we apply the following heuristic: do not condense a spacemap unless the - * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold - * blocks. - */ -int zfs_metaslab_condense_block_threshold = 4; - -/* - * The zfs_mg_noalloc_threshold defines which metaslab groups should - * be eligible for allocation. The value is defined as a percentage of - * free space. Metaslab groups that have more free space than - * zfs_mg_noalloc_threshold are always eligible for allocations. Once - * a metaslab group's free space is less than or equal to the - * zfs_mg_noalloc_threshold the allocator will avoid allocating to that - * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. - * Once all groups in the pool reach zfs_mg_noalloc_threshold then all - * groups are allowed to accept allocations. Gang blocks are always - * eligible to allocate on any metaslab group. The default value of 0 means - * no metaslab group will be excluded based on this criterion. 
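zfs_condense_pct above controls when a metaslab's on-disk space map is rewritten (condensed) from its in-core representation; the comment notes that the precise check lives in metaslab_should_condense() and is further guarded by zfs_metaslab_condense_block_threshold. A hedged paraphrase of the percentage rule alone, using hypothetical demo_* names:

#include <stdbool.h>
#include <stdint.h>

static int demo_condense_pct = 200;     /* mirrors zfs_condense_pct above */

/*
 * Condense only when the optimal (in-core) representation is sufficiently
 * smaller than what is on disk: with the default of 200, the on-disk
 * space map must be more than twice the optimal size.
 */
static bool
demo_should_condense(uint64_t optimal_size, uint64_t ondisk_size)
{
        return (optimal_size * demo_condense_pct / 100 < ondisk_size);
}

int
main(void)
{
        /* 48K optimal vs 128K on disk condenses; vs 64K on disk it does not */
        return (!(demo_should_condense(48 << 10, 128 << 10) &&
            !demo_should_condense(48 << 10, 64 << 10)));
}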
- */ -int zfs_mg_noalloc_threshold = 0; -SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, - &zfs_mg_noalloc_threshold, 0, - "Percentage of metaslab group size that should be free" - " to make it eligible for allocation"); - -/* - * Metaslab groups are considered eligible for allocations if their - * fragmenation metric (measured as a percentage) is less than or equal to - * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold - * then it will be skipped unless all metaslab groups within the metaslab - * class have also crossed this threshold. - */ -int zfs_mg_fragmentation_threshold = 85; -SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN, - &zfs_mg_fragmentation_threshold, 0, - "Percentage of metaslab group size that should be considered " - "eligible for allocations unless all metaslab groups within the metaslab class " - "have also crossed this threshold"); - -/* - * Allow metaslabs to keep their active state as long as their fragmentation - * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An - * active metaslab that exceeds this threshold will no longer keep its active - * status allowing better metaslabs to be selected. - */ -int zfs_metaslab_fragmentation_threshold = 70; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN, - &zfs_metaslab_fragmentation_threshold, 0, - "Maximum percentage of metaslab fragmentation level to keep their active state"); - -/* - * When set will load all metaslabs when pool is first opened. - */ -int metaslab_debug_load = 0; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, - &metaslab_debug_load, 0, - "Load all metaslabs when pool is first opened"); - -/* - * When set will prevent metaslabs from being unloaded. - */ -int metaslab_debug_unload = 0; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, - &metaslab_debug_unload, 0, - "Prevent metaslabs from being unloaded"); - -/* - * Minimum size which forces the dynamic allocator to change - * it's allocation strategy. Once the space map cannot satisfy - * an allocation of this size then it switches to using more - * aggressive strategy (i.e search by size rather than offset). - */ -uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; -SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, - &metaslab_df_alloc_threshold, 0, - "Minimum size which forces the dynamic allocator to change it's allocation strategy"); - -/* - * The minimum free space, in percent, which must be available - * in a space map to continue allocations in a first-fit fashion. - * Once the space map's free space drops below this level we dynamically - * switch to using best-fit allocations. - */ -int metaslab_df_free_pct = 4; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, - &metaslab_df_free_pct, 0, - "The minimum free space, in percent, which must be available in a " - "space map to continue allocations in a first-fit fashion"); - -/* - * A metaslab is considered "free" if it contains a contiguous - * segment which is greater than metaslab_min_alloc_size. - */ -uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; -SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, - &metaslab_min_alloc_size, 0, - "A metaslab is considered \"free\" if it contains a contiguous " - "segment which is greater than vfs.zfs.metaslab.min_alloc_size"); - -/* - * Percentage of all cpus that can be used by the metaslab taskq. 
- */ -int metaslab_load_pct = 50; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, - &metaslab_load_pct, 0, - "Percentage of cpus that can be used by the metaslab taskq"); - -/* - * Determines how many txgs a metaslab may remain loaded without having any - * allocations from it. As long as a metaslab continues to be used we will - * keep it loaded. - */ -int metaslab_unload_delay = TXG_SIZE * 2; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, - &metaslab_unload_delay, 0, - "Number of TXGs that an unused metaslab can be kept in memory"); - -/* - * Max number of metaslabs per group to preload. - */ -int metaslab_preload_limit = SPA_DVAS_PER_BP; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, - &metaslab_preload_limit, 0, - "Max number of metaslabs per group to preload"); - -/* - * Enable/disable preloading of metaslab. - */ -boolean_t metaslab_preload_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, - &metaslab_preload_enabled, 0, - "Max number of metaslabs per group to preload"); - -/* - * Enable/disable fragmentation weighting on metaslabs. - */ -boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, - &metaslab_fragmentation_factor_enabled, 0, - "Enable fragmentation weighting on metaslabs"); - -/* - * Enable/disable lba weighting (i.e. outer tracks are given preference). - */ -boolean_t metaslab_lba_weighting_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, - &metaslab_lba_weighting_enabled, 0, - "Enable LBA weighting (i.e. outer tracks are given preference)"); - -/* - * Enable/disable metaslab group biasing. - */ -boolean_t metaslab_bias_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, - &metaslab_bias_enabled, 0, - "Enable metaslab group biasing"); - -/* - * Enable/disable remapping of indirect DVAs to their concrete vdevs. - */ -boolean_t zfs_remap_blkptr_enable = B_TRUE; - -/* - * Enable/disable segment-based metaslab selection. - */ -boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; - -/* - * When using segment-based metaslab selection, we will continue - * allocating from the active metaslab until we have exhausted - * zfs_metaslab_switch_threshold of its buckets. - */ -int zfs_metaslab_switch_threshold = 2; - -/* - * Internal switch to enable/disable the metaslab allocation tracing - * facility. - */ -#ifdef _METASLAB_TRACING -boolean_t metaslab_trace_enabled = B_TRUE; -#endif - -/* - * Maximum entries that the metaslab allocation tracing facility will keep - * in a given list when running in non-debug mode. We limit the number - * of entries in non-debug mode to prevent us from using up too much memory. - * The limit should be sufficiently large that we don't expect any allocation - * to every exceed this value. In debug mode, the system will panic if this - * limit is ever reached allowing for further investigation. 
- */ -#ifdef _METASLAB_TRACING -uint64_t metaslab_trace_max_entries = 5000; -#endif - -static uint64_t metaslab_weight(metaslab_t *); -static void metaslab_set_fragmentation(metaslab_t *); -static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); -static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); -static void metaslab_passivate(metaslab_t *msp, uint64_t weight); -static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); -#ifdef _METASLAB_TRACING -kmem_cache_t *metaslab_alloc_trace_cache; -#endif - -/* - * ========================================================================== - * Metaslab classes - * ========================================================================== - */ -metaslab_class_t * -metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) -{ - metaslab_class_t *mc; - - mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); - - mc->mc_spa = spa; - mc->mc_rotor = NULL; - mc->mc_ops = ops; - mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); - mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (zfs_refcount_t), KM_SLEEP); - mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (uint64_t), KM_SLEEP); - for (int i = 0; i < spa->spa_alloc_count; i++) - zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); - - return (mc); -} - -void -metaslab_class_destroy(metaslab_class_t *mc) -{ - ASSERT(mc->mc_rotor == NULL); - ASSERT(mc->mc_alloc == 0); - ASSERT(mc->mc_deferred == 0); - ASSERT(mc->mc_space == 0); - ASSERT(mc->mc_dspace == 0); - - for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) - zfs_refcount_destroy(&mc->mc_alloc_slots[i]); - kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * - sizeof (zfs_refcount_t)); - kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * - sizeof (uint64_t)); - mutex_destroy(&mc->mc_lock); - kmem_free(mc, sizeof (metaslab_class_t)); -} - -int -metaslab_class_validate(metaslab_class_t *mc) -{ - metaslab_group_t *mg; - vdev_t *vd; - - /* - * Must hold one of the spa_config locks. 
- */ - ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || - spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - - if ((mg = mc->mc_rotor) == NULL) - return (0); - - do { - vd = mg->mg_vd; - ASSERT(vd->vdev_mg != NULL); - ASSERT3P(vd->vdev_top, ==, vd); - ASSERT3P(mg->mg_class, ==, mc); - ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); - } while ((mg = mg->mg_next) != mc->mc_rotor); - - return (0); -} - -static void -metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, - int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) -{ - atomic_add_64(&mc->mc_alloc, alloc_delta); - atomic_add_64(&mc->mc_deferred, defer_delta); - atomic_add_64(&mc->mc_space, space_delta); - atomic_add_64(&mc->mc_dspace, dspace_delta); -} - -void -metaslab_class_minblocksize_update(metaslab_class_t *mc) -{ - metaslab_group_t *mg; - vdev_t *vd; - uint64_t minashift = UINT64_MAX; - - if ((mg = mc->mc_rotor) == NULL) { - mc->mc_minblocksize = SPA_MINBLOCKSIZE; - return; - } - - do { - vd = mg->mg_vd; - if (vd->vdev_ashift < minashift) - minashift = vd->vdev_ashift; - } while ((mg = mg->mg_next) != mc->mc_rotor); - - mc->mc_minblocksize = 1ULL << minashift; -} - -uint64_t -metaslab_class_get_alloc(metaslab_class_t *mc) -{ - return (mc->mc_alloc); -} - -uint64_t -metaslab_class_get_deferred(metaslab_class_t *mc) -{ - return (mc->mc_deferred); -} - -uint64_t -metaslab_class_get_space(metaslab_class_t *mc) -{ - return (mc->mc_space); -} - -uint64_t -metaslab_class_get_dspace(metaslab_class_t *mc) -{ - return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); -} - -uint64_t -metaslab_class_get_minblocksize(metaslab_class_t *mc) -{ - return (mc->mc_minblocksize); -} - -void -metaslab_class_histogram_verify(metaslab_class_t *mc) -{ - spa_t *spa = mc->mc_spa; - vdev_t *rvd = spa->spa_root_vdev; - uint64_t *mc_hist; - int i; - - if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) - return; - - mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, - KM_SLEEP); - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - /* - * Skip any holes, uninitialized top-levels, or - * vdevs that are not in this metalab class. - */ - if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || - mg->mg_class != mc) { - continue; - } - - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) - mc_hist[i] += mg->mg_histogram[i]; - } - - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) - VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); - - kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); -} - -/* - * Calculate the metaslab class's fragmentation metric. The metric - * is weighted based on the space contribution of each metaslab group. - * The return value will be a number between 0 and 100 (inclusive), or - * ZFS_FRAG_INVALID if the metric has not been set. See comment above the - * zfs_frag_table for more information about the metric. - */ -uint64_t -metaslab_class_fragmentation(metaslab_class_t *mc) -{ - vdev_t *rvd = mc->mc_spa->spa_root_vdev; - uint64_t fragmentation = 0; - - spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - /* - * Skip any holes, uninitialized top-levels, - * or vdevs that are not in this metalab class. 
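/*
 * A worked example of the space weighting described above: a class made up
 * of one group contributing 1 TB at 20% fragmentation and another
 * contributing 3 TB at 60% reports (20 * 1 + 60 * 3) / (1 + 3) = 50, so the
 * larger group dominates the class-wide metric.
 */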
- */ - if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || - mg->mg_class != mc) { - continue; - } - - /* - * If a metaslab group does not contain a fragmentation - * metric then just bail out. - */ - if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { - spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); - return (ZFS_FRAG_INVALID); - } - - /* - * Determine how much this metaslab_group is contributing - * to the overall pool fragmentation metric. - */ - fragmentation += mg->mg_fragmentation * - metaslab_group_get_space(mg); - } - fragmentation /= metaslab_class_get_space(mc); - - ASSERT3U(fragmentation, <=, 100); - spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); - return (fragmentation); -} - -/* - * Calculate the amount of expandable space that is available in - * this metaslab class. If a device is expanded then its expandable - * space will be the amount of allocatable space that is currently not - * part of this metaslab class. - */ -uint64_t -metaslab_class_expandable_space(metaslab_class_t *mc) -{ - vdev_t *rvd = mc->mc_spa->spa_root_vdev; - uint64_t space = 0; - - spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); - for (int c = 0; c < rvd->vdev_children; c++) { - uint64_t tspace; - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || - mg->mg_class != mc) { - continue; - } - - /* - * Calculate if we have enough space to add additional - * metaslabs. We report the expandable space in terms - * of the metaslab size since that's the unit of expansion. - * Adjust by efi system partition size. - */ - tspace = tvd->vdev_max_asize - tvd->vdev_asize; - if (tspace > mc->mc_spa->spa_bootsize) { - tspace -= mc->mc_spa->spa_bootsize; - } - space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); - } - spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); - return (space); -} - -static int -metaslab_compare(const void *x1, const void *x2) -{ - const metaslab_t *m1 = (const metaslab_t *)x1; - const metaslab_t *m2 = (const metaslab_t *)x2; - - int sort1 = 0; - int sort2 = 0; - if (m1->ms_allocator != -1 && m1->ms_primary) - sort1 = 1; - else if (m1->ms_allocator != -1 && !m1->ms_primary) - sort1 = 2; - if (m2->ms_allocator != -1 && m2->ms_primary) - sort2 = 1; - else if (m2->ms_allocator != -1 && !m2->ms_primary) - sort2 = 2; - - /* - * Sort inactive metaslabs first, then primaries, then secondaries. When - * selecting a metaslab to allocate from, an allocator first tries its - * primary, then secondary active metaslab. If it doesn't have active - * metaslabs, or can't allocate from them, it searches for an inactive - * metaslab to activate. If it can't find a suitable one, it will steal - * a primary or secondary metaslab from another allocator. - */ - if (sort1 < sort2) - return (-1); - if (sort1 > sort2) - return (1); - - int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); - if (likely(cmp)) - return (cmp); - - IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); - - return (AVL_CMP(m1->ms_start, m2->ms_start)); -} - -uint64_t -metaslab_allocated_space(metaslab_t *msp) -{ - return (msp->ms_allocated_space); -} - -/* - * Verify that the space accounting on disk matches the in-core range_trees. 
- */ -static void -metaslab_verify_space(metaslab_t *msp, uint64_t txg) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocating = 0; - uint64_t sm_free_space, msp_free_space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!msp->ms_condensing); - - if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) - return; - - /* - * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an - * allocated space map. Calling this in non-syncing context - * does not provide a consistent view of the metaslab since - * we're performing allocations in the future. - */ - if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || - !msp->ms_loaded) - return; - - /* - * Even though the smp_alloc field can get negative (e.g. - * see vdev_checkpoint_sm), that should never be the case - * when it come's to a metaslab's space map. - */ - ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); - - sm_free_space = msp->ms_size - metaslab_allocated_space(msp); - - /* - * Account for future allocations since we would have - * already deducted that space from the ms_allocatable. - */ - for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocating += - range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); - } - - ASSERT3U(msp->ms_deferspace, ==, - range_tree_space(msp->ms_defer[0]) + - range_tree_space(msp->ms_defer[1])); - - msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + - msp->ms_deferspace + range_tree_space(msp->ms_freed); - - VERIFY3U(sm_free_space, ==, msp_free_space); -} - -/* - * ========================================================================== - * Metaslab groups - * ========================================================================== - */ -/* - * Update the allocatable flag and the metaslab group's capacity. - * The allocatable flag is set to true if the capacity is below - * the zfs_mg_noalloc_threshold or has a fragmentation value that is - * greater than zfs_mg_fragmentation_threshold. If a metaslab group - * transitions from allocatable to non-allocatable or vice versa then the - * metaslab group's class is updated to reflect the transition. - */ -static void -metaslab_group_alloc_update(metaslab_group_t *mg) -{ - vdev_t *vd = mg->mg_vd; - metaslab_class_t *mc = mg->mg_class; - vdev_stat_t *vs = &vd->vdev_stat; - boolean_t was_allocatable; - boolean_t was_initialized; - - ASSERT(vd == vd->vdev_top); - ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, - SCL_ALLOC); - - mutex_enter(&mg->mg_lock); - was_allocatable = mg->mg_allocatable; - was_initialized = mg->mg_initialized; - - mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / - (vs->vs_space + 1); - - mutex_enter(&mc->mc_lock); - - /* - * If the metaslab group was just added then it won't - * have any space until we finish syncing out this txg. - * At that point we will consider it initialized and available - * for allocations. We also don't consider non-activated - * metaslab groups (e.g. vdevs that are in the middle of being removed) - * to be initialized, because they can't be used for allocation. - */ - mg->mg_initialized = metaslab_group_initialized(mg); - if (!was_initialized && mg->mg_initialized) { - mc->mc_groups++; - } else if (was_initialized && !mg->mg_initialized) { - ASSERT3U(mc->mc_groups, >, 0); - mc->mc_groups--; - } - if (mg->mg_initialized) - mg->mg_no_free_space = B_FALSE; - - /* - * A metaslab group is considered allocatable if it has plenty - * of free space or is not heavily fragmented. 
We only take - * fragmentation into account if the metaslab group has a valid - * fragmentation metric (i.e. a value between 0 and 100). - */ - mg->mg_allocatable = (mg->mg_activation_count > 0 && - mg->mg_free_capacity > zfs_mg_noalloc_threshold && - (mg->mg_fragmentation == ZFS_FRAG_INVALID || - mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); - - /* - * The mc_alloc_groups maintains a count of the number of - * groups in this metaslab class that are still above the - * zfs_mg_noalloc_threshold. This is used by the allocating - * threads to determine if they should avoid allocations to - * a given group. The allocator will avoid allocations to a group - * if that group has reached or is below the zfs_mg_noalloc_threshold - * and there are still other groups that are above the threshold. - * When a group transitions from allocatable to non-allocatable or - * vice versa we update the metaslab class to reflect that change. - * When the mc_alloc_groups value drops to 0 that means that all - * groups have reached the zfs_mg_noalloc_threshold making all groups - * eligible for allocations. This effectively means that all devices - * are balanced again. - */ - if (was_allocatable && !mg->mg_allocatable) - mc->mc_alloc_groups--; - else if (!was_allocatable && mg->mg_allocatable) - mc->mc_alloc_groups++; - mutex_exit(&mc->mc_lock); - - mutex_exit(&mg->mg_lock); -} - -metaslab_group_t * -metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) -{ - metaslab_group_t *mg; - - mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); - mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); - mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), - KM_SLEEP); - mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), - KM_SLEEP); - avl_create(&mg->mg_metaslab_tree, metaslab_compare, - sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); - mg->mg_vd = vd; - mg->mg_class = mc; - mg->mg_activation_count = 0; - mg->mg_initialized = B_FALSE; - mg->mg_no_free_space = B_TRUE; - mg->mg_allocators = allocators; - - mg->mg_alloc_queue_depth = kmem_zalloc(allocators * - sizeof (zfs_refcount_t), KM_SLEEP); - mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * - sizeof (uint64_t), KM_SLEEP); - for (int i = 0; i < allocators; i++) { - zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); - mg->mg_cur_max_alloc_queue_depth[i] = 0; - } - - mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, - minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); - - return (mg); -} - -void -metaslab_group_destroy(metaslab_group_t *mg) -{ - ASSERT(mg->mg_prev == NULL); - ASSERT(mg->mg_next == NULL); - /* - * We may have gone below zero with the activation count - * either because we never activated in the first place or - * because we're done, and possibly removing the vdev. 
- */ - ASSERT(mg->mg_activation_count <= 0); - - taskq_destroy(mg->mg_taskq); - avl_destroy(&mg->mg_metaslab_tree); - kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); - kmem_free(mg->mg_secondaries, mg->mg_allocators * - sizeof (metaslab_t *)); - mutex_destroy(&mg->mg_lock); - mutex_destroy(&mg->mg_ms_initialize_lock); - cv_destroy(&mg->mg_ms_initialize_cv); - - for (int i = 0; i < mg->mg_allocators; i++) { - zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]); - mg->mg_cur_max_alloc_queue_depth[i] = 0; - } - kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * - sizeof (zfs_refcount_t)); - kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * - sizeof (uint64_t)); - - kmem_free(mg, sizeof (metaslab_group_t)); -} - -void -metaslab_group_activate(metaslab_group_t *mg) -{ - metaslab_class_t *mc = mg->mg_class; - metaslab_group_t *mgprev, *mgnext; - - ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); - - ASSERT(mc->mc_rotor != mg); - ASSERT(mg->mg_prev == NULL); - ASSERT(mg->mg_next == NULL); - ASSERT(mg->mg_activation_count <= 0); - - if (++mg->mg_activation_count <= 0) - return; - - mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); - metaslab_group_alloc_update(mg); - - if ((mgprev = mc->mc_rotor) == NULL) { - mg->mg_prev = mg; - mg->mg_next = mg; - } else { - mgnext = mgprev->mg_next; - mg->mg_prev = mgprev; - mg->mg_next = mgnext; - mgprev->mg_next = mg; - mgnext->mg_prev = mg; - } - mc->mc_rotor = mg; - metaslab_class_minblocksize_update(mc); -} - -/* - * Passivate a metaslab group and remove it from the allocation rotor. - * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating - * a metaslab group. This function will momentarily drop spa_config_locks - * that are lower than the SCL_ALLOC lock (see comment below). - */ -void -metaslab_group_passivate(metaslab_group_t *mg) -{ - metaslab_class_t *mc = mg->mg_class; - spa_t *spa = mc->mc_spa; - metaslab_group_t *mgprev, *mgnext; - int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); - - ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, - (SCL_ALLOC | SCL_ZIO)); - - if (--mg->mg_activation_count != 0) { - ASSERT(mc->mc_rotor != mg); - ASSERT(mg->mg_prev == NULL); - ASSERT(mg->mg_next == NULL); - ASSERT(mg->mg_activation_count < 0); - return; - } - - /* - * The spa_config_lock is an array of rwlocks, ordered as - * follows (from highest to lowest): - * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > - * SCL_ZIO > SCL_FREE > SCL_VDEV - * (For more information about the spa_config_lock see spa_misc.c) - * The higher the lock, the broader its coverage. When we passivate - * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO - * config locks. However, the metaslab group's taskq might be trying - * to preload metaslabs so we must drop the SCL_ZIO lock and any - * lower locks to allow the I/O to complete. At a minimum, - * we continue to hold the SCL_ALLOC lock, which prevents any future - * allocations from taking place and any changes to the vdev tree. 
- */ - spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); - taskq_wait(mg->mg_taskq); - spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); - metaslab_group_alloc_update(mg); - for (int i = 0; i < mg->mg_allocators; i++) { - metaslab_t *msp = mg->mg_primaries[i]; - if (msp != NULL) { - mutex_enter(&msp->ms_lock); - metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); - mutex_exit(&msp->ms_lock); - } - msp = mg->mg_secondaries[i]; - if (msp != NULL) { - mutex_enter(&msp->ms_lock); - metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); - mutex_exit(&msp->ms_lock); - } - } - - mgprev = mg->mg_prev; - mgnext = mg->mg_next; - - if (mg == mgnext) { - mc->mc_rotor = NULL; - } else { - mc->mc_rotor = mgnext; - mgprev->mg_next = mgnext; - mgnext->mg_prev = mgprev; - } - - mg->mg_prev = NULL; - mg->mg_next = NULL; - metaslab_class_minblocksize_update(mc); -} - -boolean_t -metaslab_group_initialized(metaslab_group_t *mg) -{ - vdev_t *vd = mg->mg_vd; - vdev_stat_t *vs = &vd->vdev_stat; - - return (vs->vs_space != 0 && mg->mg_activation_count > 0); -} - -uint64_t -metaslab_group_get_space(metaslab_group_t *mg) -{ - return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); -} - -void -metaslab_group_histogram_verify(metaslab_group_t *mg) -{ - uint64_t *mg_hist; - vdev_t *vd = mg->mg_vd; - uint64_t ashift = vd->vdev_ashift; - int i; - - if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) - return; - - mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, - KM_SLEEP); - - ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, - SPACE_MAP_HISTOGRAM_SIZE + ashift); - - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - ASSERT(msp != NULL); - - /* skip if not active or not a member */ - if (msp->ms_sm == NULL || msp->ms_group != mg) - continue; - - for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) - mg_hist[i + ashift] += - msp->ms_sm->sm_phys->smp_histogram[i]; - } - - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) - VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); - - kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); -} - -static void -metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) -{ - metaslab_class_t *mc = mg->mg_class; - uint64_t ashift = mg->mg_vd->vdev_ashift; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (msp->ms_sm == NULL) - return; - - mutex_enter(&mg->mg_lock); - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { - mg->mg_histogram[i + ashift] += - msp->ms_sm->sm_phys->smp_histogram[i]; - mc->mc_histogram[i + ashift] += - msp->ms_sm->sm_phys->smp_histogram[i]; - } - mutex_exit(&mg->mg_lock); -} - -void -metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) -{ - metaslab_class_t *mc = mg->mg_class; - uint64_t ashift = mg->mg_vd->vdev_ashift; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (msp->ms_sm == NULL) - return; - - mutex_enter(&mg->mg_lock); - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { - ASSERT3U(mg->mg_histogram[i + ashift], >=, - msp->ms_sm->sm_phys->smp_histogram[i]); - ASSERT3U(mc->mc_histogram[i + ashift], >=, - msp->ms_sm->sm_phys->smp_histogram[i]); - - mg->mg_histogram[i + ashift] -= - msp->ms_sm->sm_phys->smp_histogram[i]; - mc->mc_histogram[i + ashift] -= - msp->ms_sm->sm_phys->smp_histogram[i]; - } - mutex_exit(&mg->mg_lock); -} - -static void -metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) -{ - ASSERT(msp->ms_group == NULL); - mutex_enter(&mg->mg_lock); - msp->ms_group = mg; - msp->ms_weight = 0; - avl_add(&mg->mg_metaslab_tree, 
msp); - mutex_exit(&mg->mg_lock); - - mutex_enter(&msp->ms_lock); - metaslab_group_histogram_add(mg, msp); - mutex_exit(&msp->ms_lock); -} - -static void -metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) -{ - mutex_enter(&msp->ms_lock); - metaslab_group_histogram_remove(mg, msp); - mutex_exit(&msp->ms_lock); - - mutex_enter(&mg->mg_lock); - ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, msp); - msp->ms_group = NULL; - mutex_exit(&mg->mg_lock); -} - -static void -metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) -{ - ASSERT(MUTEX_HELD(&mg->mg_lock)); - ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, msp); - msp->ms_weight = weight; - avl_add(&mg->mg_metaslab_tree, msp); - -} - -static void -metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) -{ - /* - * Although in principle the weight can be any value, in - * practice we do not use values in the range [1, 511]. - */ - ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - mutex_enter(&mg->mg_lock); - metaslab_group_sort_impl(mg, msp, weight); - mutex_exit(&mg->mg_lock); -} - -/* - * Calculate the fragmentation for a given metaslab group. We can use - * a simple average here since all metaslabs within the group must have - * the same size. The return value will be a value between 0 and 100 - * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this - * group have a fragmentation metric. - */ -uint64_t -metaslab_group_fragmentation(metaslab_group_t *mg) -{ - vdev_t *vd = mg->mg_vd; - uint64_t fragmentation = 0; - uint64_t valid_ms = 0; - - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - - if (msp->ms_fragmentation == ZFS_FRAG_INVALID) - continue; - if (msp->ms_group != mg) - continue; - - valid_ms++; - fragmentation += msp->ms_fragmentation; - } - - if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) - return (ZFS_FRAG_INVALID); - - fragmentation /= valid_ms; - ASSERT3U(fragmentation, <=, 100); - return (fragmentation); -} - -/* - * Determine if a given metaslab group should skip allocations. A metaslab - * group should avoid allocations if its free capacity is less than the - * zfs_mg_noalloc_threshold or its fragmentation metric is greater than - * zfs_mg_fragmentation_threshold and there is at least one metaslab group - * that can still handle allocations. If the allocation throttle is enabled - * then we skip allocations to devices that have reached their maximum - * allocation queue depth unless the selected metaslab group is the only - * eligible group remaining. - */ -static boolean_t -metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, - uint64_t psize, int allocator, int d) -{ - spa_t *spa = mg->mg_vd->vdev_spa; - metaslab_class_t *mc = mg->mg_class; - - /* - * We can only consider skipping this metaslab group if it's - * in the normal metaslab class and there are other metaslab - * groups to select from. Otherwise, we always consider it eligible - * for allocations. - */ - if ((mc != spa_normal_class(spa) && - mc != spa_special_class(spa) && - mc != spa_dedup_class(spa)) || - mc->mc_groups <= 1) - return (B_TRUE); - - /* - * If the metaslab group's mg_allocatable flag is set (see comments - * in metaslab_group_alloc_update() for more information) and - * the allocation throttle is disabled then allow allocations to this - * device. 
However, if the allocation throttle is enabled then - * check if we have reached our allocation limit (mg_alloc_queue_depth) - * to determine if we should allow allocations to this metaslab group. - * If all metaslab groups are no longer considered allocatable - * (mc_alloc_groups == 0) or we're trying to allocate the smallest - * gang block size then we allow allocations on this metaslab group - * regardless of the mg_allocatable or throttle settings. - */ - if (mg->mg_allocatable) { - metaslab_group_t *mgp; - int64_t qdepth; - uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; - - if (!mc->mc_alloc_throttle_enabled) - return (B_TRUE); - - /* - * If this metaslab group does not have any free space, then - * there is no point in looking further. - */ - if (mg->mg_no_free_space) - return (B_FALSE); - - /* - * Relax allocation throttling for ditto blocks. Due to - * random imbalances in allocation it tends to push copies - * to one vdev, that looks a bit better at the moment. - */ - qmax = qmax * (4 + d) / 4; - - qdepth = zfs_refcount_count( - &mg->mg_alloc_queue_depth[allocator]); - - /* - * If this metaslab group is below its qmax or it's - * the only allocatable metasable group, then attempt - * to allocate from it. - */ - if (qdepth < qmax || mc->mc_alloc_groups == 1) - return (B_TRUE); - ASSERT3U(mc->mc_alloc_groups, >, 1); - - /* - * Since this metaslab group is at or over its qmax, we - * need to determine if there are metaslab groups after this - * one that might be able to handle this allocation. This is - * racy since we can't hold the locks for all metaslab - * groups at the same time when we make this check. - */ - for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { - qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; - qmax = qmax * (4 + d) / 4; - qdepth = zfs_refcount_count( - &mgp->mg_alloc_queue_depth[allocator]); - - /* - * If there is another metaslab group that - * might be able to handle the allocation, then - * we return false so that we skip this group. - */ - if (qdepth < qmax && !mgp->mg_no_free_space) - return (B_FALSE); - } - - /* - * We didn't find another group to handle the allocation - * so we can't skip this metaslab group even though - * we are at or over our qmax. - */ - return (B_TRUE); - - } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * ========================================================================== - * Range tree callbacks - * ========================================================================== - */ - -/* - * Comparison function for the private size-ordered tree. Tree is sorted - * by size, larger sizes at the end of the tree. - */ -static int -metaslab_rangesize_compare(const void *x1, const void *x2) -{ - const range_seg_t *r1 = x1; - const range_seg_t *r2 = x2; - uint64_t rs_size1 = r1->rs_end - r1->rs_start; - uint64_t rs_size2 = r2->rs_end - r2->rs_start; - - int cmp = AVL_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - - return (AVL_CMP(r1->rs_start, r2->rs_start)); -} - -/* - * ========================================================================== - * Common allocator routines - * ========================================================================== - */ - -/* - * Return the maximum contiguous segment within the metaslab. 
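/*
 * A worked example of the per-DVA relaxation above: with
 * qmax = qmax * (4 + d) / 4, the d-th copy of a block is compared against
 * qmax, qmax * 5 / 4 and qmax * 6 / 4 for d = 0, 1 and 2, so later copies
 * tolerate a somewhat deeper allocation queue before the group is skipped.
 */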
- */ -uint64_t -metaslab_block_maxsize(metaslab_t *msp) -{ - avl_tree_t *t = &msp->ms_allocatable_by_size; - range_seg_t *rs; - - if (t == NULL || (rs = avl_last(t)) == NULL) - return (0ULL); - - return (rs->rs_end - rs->rs_start); -} - -static range_seg_t * -metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) -{ - range_seg_t *rs, rsearch; - avl_index_t where; - - rsearch.rs_start = start; - rsearch.rs_end = start + size; - - rs = avl_find(t, &rsearch, &where); - if (rs == NULL) { - rs = avl_nearest(t, where, AVL_AFTER); - } - - return (rs); -} - -/* - * This is a helper function that can be used by the allocator to find - * a suitable block to allocate. This will search the specified AVL - * tree looking for a block that matches the specified criteria. - */ -static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) -{ - range_seg_t *rs = metaslab_block_find(t, *cursor, size); - - while (rs != NULL) { - uint64_t offset = P2ROUNDUP(rs->rs_start, align); - - if (offset + size <= rs->rs_end) { - *cursor = offset + size; - return (offset); - } - rs = AVL_NEXT(t, rs); - } - - /* - * If we know we've searched the whole map (*cursor == 0), give up. - * Otherwise, reset the cursor to the beginning and try again. - */ - if (*cursor == 0) - return (-1ULL); - - *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); -} - -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ -static uint64_t -metaslab_ff_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. - */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - avl_tree_t *t = &msp->ms_allocatable->rt_root; - - return (metaslab_block_picker(t, cursor, size, align)); -} - -static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc -}; - -/* - * ========================================================================== - * Dynamic block allocator - - * Uses the first fit allocation scheme until space get low and then - * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold - * and metaslab_df_free_pct to determine when to switch the allocation scheme. - * ========================================================================== - */ -static uint64_t -metaslab_df_alloc(metaslab_t *msp, uint64_t size) -{ - /* - * Find the largest power of 2 block size that evenly divides the - * requested size. This is used to try to allocate blocks with similar - * alignment from the same area of the metaslab (i.e. same cursor - * bucket) but it does not guarantee that other allocations sizes - * may exist in the same region. 
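/*
 * A worked example of the cursor scheme described above: a 24K (0x6000)
 * request computes align = 0x6000 & -0x6000 = 0x2000, so it shares the
 * ms_lbas[highbit64(0x2000) - 1] == ms_lbas[13] cursor with other requests
 * whose largest power-of-two factor is 8K, and each candidate segment's
 * start is rounded up with P2ROUNDUP(rs_start, 0x2000) before checking that
 * 24K still fits. With the defaults above, metaslab_df_alloc() below keeps
 * this first-fit behavior while the largest free segment is at least
 * metaslab_df_alloc_threshold (SPA_OLD_MAXBLOCKSIZE, 128K) and at least
 * metaslab_df_free_pct (4%) of the metaslab is free; otherwise it resets
 * the cursor and searches the size-sorted tree instead.
 */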
- */ - uint64_t align = size & -size; - uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &rt->rt_root; - uint64_t max_size = metaslab_block_maxsize(msp); - int free_pct = range_tree_space(rt) * 100 / msp->ms_size; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); - - if (max_size < size) - return (-1ULL); - - /* - * If we're running low on space switch to using the size - * sorted AVL tree (best-fit). - */ - if (max_size < metaslab_df_alloc_threshold || - free_pct < metaslab_df_free_pct) { - t = &msp->ms_allocatable_by_size; - *cursor = 0; - } - - return (metaslab_block_picker(t, cursor, size, 1ULL)); -} - -static metaslab_ops_t metaslab_df_ops = { - metaslab_df_alloc -}; - -/* - * ========================================================================== - * Cursor fit block allocator - - * Select the largest region in the metaslab, set the cursor to the beginning - * of the range and the cursor_end to the end of the range. As allocations - * are made advance the cursor. Continue allocating from the cursor until - * the range is exhausted and then find a new range. - * ========================================================================== - */ -static uint64_t -metaslab_cf_alloc(metaslab_t *msp, uint64_t size) -{ - range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; - uint64_t *cursor = &msp->ms_lbas[0]; - uint64_t *cursor_end = &msp->ms_lbas[1]; - uint64_t offset = 0; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); - - ASSERT3U(*cursor_end, >=, *cursor); - - if ((*cursor + size) > *cursor_end) { - range_seg_t *rs; - - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) - return (-1ULL); - - *cursor = rs->rs_start; - *cursor_end = rs->rs_end; - } - - offset = *cursor; - *cursor += size; - - return (offset); -} - -static metaslab_ops_t metaslab_cf_ops = { - metaslab_cf_alloc -}; - -/* - * ========================================================================== - * New dynamic fit allocator - - * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift - * contiguous blocks. If no region is found then just use the largest segment - * that remains. - * ========================================================================== - */ - -/* - * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) - * to request from the allocator. 
- */ -uint64_t metaslab_ndf_clump_shift = 4; - -static uint64_t -metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) -{ - avl_tree_t *t = &msp->ms_allocatable->rt_root; - avl_index_t where; - range_seg_t *rs, rsearch; - uint64_t hbit = highbit64(size); - uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_block_maxsize(msp); - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); - - if (max_size < size) - return (-1ULL); - - rsearch.rs_start = *cursor; - rsearch.rs_end = *cursor + size; - - rs = avl_find(t, &rsearch, &where); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { - t = &msp->ms_allocatable_by_size; - - rsearch.rs_start = 0; - rsearch.rs_end = MIN(max_size, - 1ULL << (hbit + metaslab_ndf_clump_shift)); - rs = avl_find(t, &rsearch, &where); - if (rs == NULL) - rs = avl_nearest(t, where, AVL_AFTER); - ASSERT(rs != NULL); - } - - if ((rs->rs_end - rs->rs_start) >= size) { - *cursor = rs->rs_start + size; - return (rs->rs_start); - } - return (-1ULL); -} - -static metaslab_ops_t metaslab_ndf_ops = { - metaslab_ndf_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; - -/* - * ========================================================================== - * Metaslabs - * ========================================================================== - */ - -static void -metaslab_aux_histograms_clear(metaslab_t *msp) -{ - /* - * Auxiliary histograms are only cleared when resetting them, - * which can only happen while the metaslab is loaded. - */ - ASSERT(msp->ms_loaded); - - bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); - for (int t = 0; t < TXG_DEFER_SIZE; t++) - bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); -} - -static void -metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, - range_tree_t *rt) -{ - /* - * This is modeled after space_map_histogram_add(), so refer to that - * function for implementation details. We want this to work like - * the space map histogram, and not the range tree histogram, as we - * are essentially constructing a delta that will be later subtracted - * from the space map histogram. - */ - int idx = 0; - for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - ASSERT3U(i, >=, idx + shift); - histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); - - if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { - ASSERT3U(idx + shift, ==, i); - idx++; - ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); - } - } -} - -/* - * Called at every sync pass that the metaslab gets synced. - * - * The reason is that we want our auxiliary histograms to be updated - * wherever the metaslab's space map histogram is updated. This way - * we stay consistent on which parts of the metaslab space map's - * histogram are currently not available for allocations (e.g because - * they are in the defer, freed, and freeing trees). - */ -static void -metaslab_aux_histograms_update(metaslab_t *msp) -{ - space_map_t *sm = msp->ms_sm; - ASSERT(sm != NULL); - - /* - * This is similar to the metaslab's space map histogram updates - * that take place in metaslab_sync(). The only difference is that - * we only care about segments that haven't made it into the - * ms_allocatable tree yet. 
- */ - if (msp->ms_loaded) { - metaslab_aux_histograms_clear(msp); - - metaslab_aux_histogram_add(msp->ms_synchist, - sm->sm_shift, msp->ms_freed); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - metaslab_aux_histogram_add(msp->ms_deferhist[t], - sm->sm_shift, msp->ms_defer[t]); - } - } - - metaslab_aux_histogram_add(msp->ms_synchist, - sm->sm_shift, msp->ms_freeing); -} - -/* - * Called every time we are done syncing (writing to) the metaslab, - * i.e. at the end of each sync pass. - * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] - */ -static void -metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - space_map_t *sm = msp->ms_sm; - - if (sm == NULL) { - /* - * We came here from metaslab_init() when creating/opening a - * pool, looking at a metaslab that hasn't had any allocations - * yet. - */ - return; - } - - /* - * This is similar to the actions that we take for the ms_freed - * and ms_defer trees in metaslab_sync_done(). - */ - uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; - if (defer_allowed) { - bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], - sizeof (msp->ms_synchist)); - } else { - bzero(msp->ms_deferhist[hist_index], - sizeof (msp->ms_deferhist[hist_index])); - } - bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); -} - -/* - * Ensure that the metaslab's weight and fragmentation are consistent - * with the contents of the histogram (either the range tree's histogram - * or the space map's depending whether the metaslab is loaded). - */ -static void -metaslab_verify_weight_and_frag(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) - return; - - /* see comment in metaslab_verify_unflushed_changes() */ - if (msp->ms_group == NULL) - return; - - /* - * Devices being removed always return a weight of 0 and leave - * fragmentation and ms_max_size as is - there is nothing for - * us to verify here. - */ - vdev_t *vd = msp->ms_group->mg_vd; - if (vd->vdev_removing) - return; - - /* - * If the metaslab is dirty it probably means that we've done - * some allocations or frees that have changed our histograms - * and thus the weight. - */ - for (int t = 0; t < TXG_SIZE; t++) { - if (txg_list_member(&vd->vdev_ms_list, msp, t)) - return; - } - - /* - * This verification checks that our in-memory state is consistent - * with what's on disk. If the pool is read-only then there aren't - * any changes and we just have the initially-loaded state. - */ - if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) - return; - - /* some extra verification for in-core tree if you can */ - if (msp->ms_loaded) { - range_tree_stat_verify(msp->ms_allocatable); - VERIFY(space_map_histogram_verify(msp->ms_sm, - msp->ms_allocatable)); - } - - uint64_t weight = msp->ms_weight; - uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); - uint64_t frag = msp->ms_fragmentation; - uint64_t max_segsize = msp->ms_max_size; - - msp->ms_weight = 0; - msp->ms_fragmentation = 0; - msp->ms_max_size = 0; - - /* - * This function is used for verification purposes. Regardless of - * whether metaslab_weight() thinks this metaslab should be active or - * not, we want to ensure that the actual weight (and therefore the - * value of ms_weight) would be the same if it was to be recalculated - * at this point. 
- */ - msp->ms_weight = metaslab_weight(msp) | was_active; - - VERIFY3U(max_segsize, ==, msp->ms_max_size); - - /* - * If the weight type changed then there is no point in doing - * verification. Revert fields to their original values. - */ - if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || - (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { - msp->ms_fragmentation = frag; - msp->ms_weight = weight; - return; - } - - VERIFY3U(msp->ms_fragmentation, ==, frag); - VERIFY3U(msp->ms_weight, ==, weight); -} - -/* - * Wait for any in-progress metaslab loads to complete. - */ -static void -metaslab_load_wait(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - while (msp->ms_loading) { - ASSERT(!msp->ms_loaded); - cv_wait(&msp->ms_load_cv, &msp->ms_lock); - } -} - -static int -metaslab_load_impl(metaslab_t *msp) -{ - int error = 0; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(msp->ms_loading); - ASSERT(!msp->ms_condensing); - - /* - * We temporarily drop the lock to unblock other operations while we - * are reading the space map. Therefore, metaslab_sync() and - * metaslab_sync_done() can run at the same time as we do. - * - * metaslab_sync() can append to the space map while we are loading. - * Therefore we load only entries that existed when we started the - * load. Additionally, metaslab_sync_done() has to wait for the load - * to complete because there are potential races like metaslab_load() - * loading parts of the space map that are currently being appended - * by metaslab_sync(). If we didn't, the ms_allocatable would have - * entries that metaslab_sync_done() would try to re-add later. - * - * That's why before dropping the lock we remember the synced length - * of the metaslab and read up to that point of the space map, - * ignoring entries appended by metaslab_sync() that happen after we - * drop the lock. - */ - uint64_t length = msp->ms_synced_length; - mutex_exit(&msp->ms_lock); - - if (msp->ms_sm != NULL) { - error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, - SM_FREE, length); - } else { - /* - * The space map has not been allocated yet, so treat - * all the space in the metaslab as free and add it to the - * ms_allocatable tree. - */ - range_tree_add(msp->ms_allocatable, - msp->ms_start, msp->ms_size); - } - - /* - * We need to grab the ms_sync_lock to prevent metaslab_sync() from - * changing the ms_sm and the metaslab's range trees while we are - * about to use them and populate the ms_allocatable. The ms_lock - * is insufficient for this because metaslab_sync() doesn't hold - * the ms_lock while writing the ms_checkpointing tree to disk. - */ - mutex_enter(&msp->ms_sync_lock); - mutex_enter(&msp->ms_lock); - ASSERT(!msp->ms_condensing); - - if (error != 0) { - mutex_exit(&msp->ms_sync_lock); - return (error); - } - - ASSERT3P(msp->ms_group, !=, NULL); - msp->ms_loaded = B_TRUE; - - /* - * The ms_allocatable contains the segments that exist in the - * ms_defer trees [see ms_synced_length]. Thus we need to remove - * them from ms_allocatable as they will be added again in - * metaslab_sync_done(). - */ - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defer[t], - range_tree_remove, msp->ms_allocatable); - } - - /* - * Call metaslab_recalculate_weight_and_sort() now that the - * metaslab is loaded so we get the metaslab's real weight. 
- * - * Unless this metaslab was created with older software and - * has not yet been converted to use segment-based weight, we - * expect the new weight to be better or equal to the weight - * that the metaslab had while it was not loaded. This is - * because the old weight does not take into account the - * consolidation of adjacent segments between TXGs. [see - * comment for ms_synchist and ms_deferhist[] for more info] - */ - uint64_t weight = msp->ms_weight; - metaslab_recalculate_weight_and_sort(msp); - if (!WEIGHT_IS_SPACEBASED(weight)) - ASSERT3U(weight, <=, msp->ms_weight); - msp->ms_max_size = metaslab_block_maxsize(msp); - - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - metaslab_verify_space(msp, spa_syncing_txg(spa)); - mutex_exit(&msp->ms_sync_lock); - - return (0); -} - -int -metaslab_load(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - /* - * There may be another thread loading the same metaslab, if that's - * the case just wait until the other thread is done and return. - */ - metaslab_load_wait(msp); - if (msp->ms_loaded) - return (0); - VERIFY(!msp->ms_loading); - ASSERT(!msp->ms_condensing); - - msp->ms_loading = B_TRUE; - int error = metaslab_load_impl(msp); - msp->ms_loading = B_FALSE; - cv_broadcast(&msp->ms_load_cv); - - return (error); -} - -void -metaslab_unload(metaslab_t *msp) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - metaslab_verify_weight_and_frag(msp); - - range_tree_vacate(msp->ms_allocatable, NULL, NULL); - msp->ms_loaded = B_FALSE; - - msp->ms_weight &= ~METASLAB_ACTIVE_MASK; - msp->ms_max_size = 0; - - /* - * We explicitly recalculate the metaslab's weight based on its space - * map (as it is now not loaded). We want unload metaslabs to always - * have their weights calculated from the space map histograms, while - * loaded ones have it calculated from their in-core range tree - * [see metaslab_load()]. This way, the weight reflects the information - * available in-core, whether it is loaded or not - * - * If ms_group == NULL means that we came here from metaslab_fini(), - * at which point it doesn't make sense for us to do the recalculation - * and the sorting. - */ - if (msp->ms_group != NULL) - metaslab_recalculate_weight_and_sort(msp); -} - -static void -metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, - int64_t defer_delta, int64_t space_delta) -{ - vdev_space_update(vd, alloc_delta, defer_delta, space_delta); - - ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, - vdev_deflated_space(vd, space_delta)); -} - -int -metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, - metaslab_t **msp) -{ - vdev_t *vd = mg->mg_vd; - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - metaslab_t *ms; - int error; - - ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); - mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); - - ms->ms_id = id; - ms->ms_start = id << vd->vdev_ms_shift; - ms->ms_size = 1ULL << vd->vdev_ms_shift; - ms->ms_allocator = -1; - ms->ms_new = B_TRUE; - - /* - * We only open space map objects that already exist. All others - * will be opened when we finally allocate an object for it. 
- * - * Note: - * When called from vdev_expand(), we can't call into the DMU as - * we are holding the spa_config_lock as a writer and we would - * deadlock [see relevant comment in vdev_metaslab_init()]. in - * that case, the object parameter is zero though, so we won't - * call into the DMU. - */ - if (object != 0) { - error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, - ms->ms_size, vd->vdev_ashift); - - if (error != 0) { - kmem_free(ms, sizeof (metaslab_t)); - return (error); - } - - ASSERT(ms->ms_sm != NULL); - ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); - ms->ms_allocated_space = space_map_allocated(ms->ms_sm); - } - - /* - * We create the ms_allocatable here, but we don't create the - * other range trees until metaslab_sync_done(). This serves - * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that - * we'd data fault on any attempt to use this metaslab before - * it's ready. - */ - ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size, - metaslab_rangesize_compare, 0); - metaslab_group_add(mg, ms); - - metaslab_set_fragmentation(ms); - - /* - * If we're opening an existing pool (txg == 0) or creating - * a new one (txg == TXG_INITIAL), all space is available now. - * If we're adding space to an existing pool, the new space - * does not become available until after this txg has synced. - * The metaslab's weight will also be initialized when we sync - * out this txg. This ensures that we don't attempt to allocate - * from it before we have initialized it completely. - */ - if (txg <= TXG_INITIAL) { - metaslab_sync_done(ms, 0); - metaslab_space_update(vd, mg->mg_class, - metaslab_allocated_space(ms), 0, 0); - } - - /* - * If metaslab_debug_load is set and we're initializing a metaslab - * that has an allocated space map object then load the space map - * so that we can verify frees. - */ - if (metaslab_debug_load && ms->ms_sm != NULL) { - mutex_enter(&ms->ms_lock); - VERIFY0(metaslab_load(ms)); - mutex_exit(&ms->ms_lock); - } - - if (txg != 0) { - vdev_dirty(vd, 0, NULL, txg); - vdev_dirty(vd, VDD_METASLAB, ms, txg); - } - - *msp = ms; - - return (0); -} - -void -metaslab_fini(metaslab_t *msp) -{ - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - - metaslab_group_remove(mg, msp); - - mutex_enter(&msp->ms_lock); - VERIFY(msp->ms_group == NULL); - metaslab_space_update(vd, mg->mg_class, - -metaslab_allocated_space(msp), 0, -msp->ms_size); - - space_map_close(msp->ms_sm); - - metaslab_unload(msp); - - range_tree_destroy(msp->ms_allocatable); - range_tree_destroy(msp->ms_freeing); - range_tree_destroy(msp->ms_freed); - - for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_allocating[t]); - } - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defer[t]); - } - ASSERT0(msp->ms_deferspace); - - range_tree_destroy(msp->ms_checkpointing); - - for (int t = 0; t < TXG_SIZE; t++) - ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); - - mutex_exit(&msp->ms_lock); - cv_destroy(&msp->ms_load_cv); - mutex_destroy(&msp->ms_lock); - mutex_destroy(&msp->ms_sync_lock); - ASSERT3U(msp->ms_allocator, ==, -1); - - kmem_free(msp, sizeof (metaslab_t)); -} - -#define FRAGMENTATION_TABLE_SIZE 17 - -/* - * This table defines a segment size based fragmentation metric that will - * allow each metaslab to derive its own fragmentation value. 
This is done - * by calculating the space in each bucket of the spacemap histogram and - * multiplying that by the fragmentation metric in this table. Doing - * this for all buckets and dividing it by the total amount of free - * space in this metaslab (i.e. the total free space in all buckets) gives - * us the fragmentation metric. This means that a high fragmentation metric - * equates to most of the free space being comprised of small segments. - * Conversely, if the metric is low, then most of the free space is in - * large segments. A 10% change in fragmentation equates to approximately - * double the number of segments. - * - * This table defines 0% fragmented space using 16MB segments. Testing has - * shown that segments that are greater than or equal to 16MB do not suffer - * from drastic performance problems. Using this value, we derive the rest - * of the table. Since the fragmentation value is never stored on disk, it - * is possible to change these calculations in the future. - */ -int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { - 100, /* 512B */ - 100, /* 1K */ - 98, /* 2K */ - 95, /* 4K */ - 90, /* 8K */ - 80, /* 16K */ - 70, /* 32K */ - 60, /* 64K */ - 50, /* 128K */ - 40, /* 256K */ - 30, /* 512K */ - 20, /* 1M */ - 15, /* 2M */ - 10, /* 4M */ - 5, /* 8M */ - 0 /* 16M */ -}; - -/* - * Calculate the metaslab's fragmentation metric and set ms_fragmentation. - * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not - * been upgraded and does not support this metric. Otherwise, the return - * value should be in the range [0, 100]. - */ -static void -metaslab_set_fragmentation(metaslab_t *msp) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t fragmentation = 0; - uint64_t total = 0; - boolean_t feature_enabled = spa_feature_is_enabled(spa, - SPA_FEATURE_SPACEMAP_HISTOGRAM); - - if (!feature_enabled) { - msp->ms_fragmentation = ZFS_FRAG_INVALID; - return; - } - - /* - * A null space map means that the entire metaslab is free - * and thus is not fragmented. - */ - if (msp->ms_sm == NULL) { - msp->ms_fragmentation = 0; - return; - } - - /* - * If this metaslab's space map has not been upgraded, flag it - * so that we upgrade next time we encounter it. - */ - if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { - uint64_t txg = spa_syncing_txg(spa); - vdev_t *vd = msp->ms_group->mg_vd; - - /* - * If we've reached the final dirty txg, then we must - * be shutting down the pool. We don't want to dirty - * any data past this point so skip setting the condense - * flag. We can retry this action the next time the pool - * is imported. 
- */ - if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { - msp->ms_condense_wanted = B_TRUE; - vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); - zfs_dbgmsg("txg %llu, requesting force condense: " - "ms_id %llu, vdev_id %llu", txg, msp->ms_id, - vd->vdev_id); - } - msp->ms_fragmentation = ZFS_FRAG_INVALID; - return; - } - - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { - uint64_t space = 0; - uint8_t shift = msp->ms_sm->sm_shift; - - int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, - FRAGMENTATION_TABLE_SIZE - 1); - - if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) - continue; - - space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); - total += space; - - ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); - fragmentation += space * zfs_frag_table[idx]; - } - - if (total > 0) - fragmentation /= total; - ASSERT3U(fragmentation, <=, 100); - - msp->ms_fragmentation = fragmentation; -} - -/* - * Compute a weight -- a selection preference value -- for the given metaslab. - * This is based on the amount of free space, the level of fragmentation, - * the LBA range, and whether the metaslab is loaded. - */ -static uint64_t -metaslab_space_weight(metaslab_t *msp) -{ - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - uint64_t weight, space; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(!vd->vdev_removing); - - /* - * The baseline weight is the metaslab's free space. - */ - space = msp->ms_size - metaslab_allocated_space(msp); - - if (metaslab_fragmentation_factor_enabled && - msp->ms_fragmentation != ZFS_FRAG_INVALID) { - /* - * Use the fragmentation information to inversely scale - * down the baseline weight. We need to ensure that we - * don't exclude this metaslab completely when it's 100% - * fragmented. To avoid this we reduce the fragmented value - * by 1. - */ - space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; - - /* - * If space < SPA_MINBLOCKSIZE, then we will not allocate from - * this metaslab again. The fragmentation metric may have - * decreased the space to something smaller than - * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE - * so that we can consume any remaining space. - */ - if (space > 0 && space < SPA_MINBLOCKSIZE) - space = SPA_MINBLOCKSIZE; - } - weight = space; - - /* - * Modern disks have uniform bit density and constant angular velocity. - * Therefore, the outer recording zones are faster (higher bandwidth) - * than the inner zones by the ratio of outer to inner track diameter, - * which is typically around 2:1. We account for this by assigning - * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). - * In effect, this means that we'll select the metaslab with the most - * free bandwidth rather than simply the one with the most free space. - */ - if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { - weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; - ASSERT(weight >= space && weight <= 2 * space); - } - - /* - * If this metaslab is one we're actively using, adjust its - * weight to make it preferable to any inactive metaslab so - * we'll polish it off. If the fragmentation on this metaslab - * has exceed our threshold, then don't mark it active. 
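The histogram loop in metaslab_set_fragmentation() above implements the metric described with zfs_frag_table: each space map histogram bucket contributes its free space weighted by the table entry for that segment size, and the sum is divided by the total free space. A minimal standalone sketch of that arithmetic (not part of the patch; the table and histogram values below are made up):

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: 'table' plays the role of zfs_frag_table, 'shift' the
 * space map's sm_shift and 'minshift' SPA_MINBLOCKSHIFT.
 */
static uint64_t
frag_from_histogram(const uint64_t *hist, int nbuckets, const int *table,
    int tablesize, int shift, int minshift)
{
    uint64_t frag = 0, total = 0;

    for (int i = 0; i < nbuckets; i++) {
        if (hist[i] == 0)
            continue;
        /* Bucket i holds hist[i] free segments of ~2^(i + shift) bytes. */
        uint64_t space = hist[i] << (i + shift);
        int idx = shift - minshift + i;
        if (idx > tablesize - 1)
            idx = tablesize - 1;
        total += space;
        frag += space * table[idx];
    }
    return (total > 0 ? frag / total : 0);
}

int
main(void)
{
    const int table[] = { 100, 100, 98, 95, 90, 80, 70, 60, 50, 40,
        30, 20, 15, 10, 5, 0, 0 };
    uint64_t hist[24] = { 0 };

    hist[4] = 1000;     /* many segments in the 8K bucket */
    hist[11] = 4;       /* a few segments in the 1M bucket */
    /* Mostly small segments, so the result lands around 66%. */
    printf("fragmentation = %llu%%\n", (unsigned long long)
        frag_from_histogram(hist, 24, table, 17, 9, 9));
    return (0);
}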
- */ - if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && - msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); - } - - WEIGHT_SET_SPACEBASED(weight); - return (weight); -} - -/* - * Return the weight of the specified metaslab, according to the segment-based - * weighting algorithm. The metaslab must be loaded. This function can - * be called within a sync pass since it relies only on the metaslab's - * range tree which is always accurate when the metaslab is loaded. - */ -static uint64_t -metaslab_weight_from_range_tree(metaslab_t *msp) -{ - uint64_t weight = 0; - uint32_t segments = 0; - - ASSERT(msp->ms_loaded); - - for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; - i--) { - uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; - int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; - - segments <<= 1; - segments += msp->ms_allocatable->rt_histogram[i]; - - /* - * The range tree provides more precision than the space map - * and must be downgraded so that all values fit within the - * space map's histogram. This allows us to compare loaded - * vs. unloaded metaslabs to determine which metaslab is - * considered "best". - */ - if (i > max_idx) - continue; - - if (segments != 0) { - WEIGHT_SET_COUNT(weight, segments); - WEIGHT_SET_INDEX(weight, i); - WEIGHT_SET_ACTIVE(weight, 0); - break; - } - } - return (weight); -} - -/* - * Calculate the weight based on the on-disk histogram. This should only - * be called after a sync pass has completely finished since the on-disk - * information is updated in metaslab_sync(). - */ -static uint64_t -metaslab_weight_from_spacemap(metaslab_t *msp) -{ - space_map_t *sm = msp->ms_sm; - ASSERT(!msp->ms_loaded); - ASSERT(sm != NULL); - ASSERT3U(space_map_object(sm), !=, 0); - ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); - - /* - * Create a joint histogram from all the segments that have made - * it to the metaslab's space map histogram, that are not yet - * available for allocation because they are still in the freeing - * pipeline (e.g. freeing, freed, and defer trees). Then subtract - * these segments from the space map's histogram to get a more - * accurate weight. - */ - uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) - deferspace_histogram[i] += msp->ms_synchist[i]; - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { - deferspace_histogram[i] += msp->ms_deferhist[t][i]; - } - } - - uint64_t weight = 0; - for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { - ASSERT3U(sm->sm_phys->smp_histogram[i], >=, - deferspace_histogram[i]); - uint64_t count = - sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; - if (count != 0) { - WEIGHT_SET_COUNT(weight, count); - WEIGHT_SET_INDEX(weight, i + sm->sm_shift); - WEIGHT_SET_ACTIVE(weight, 0); - break; - } - } - return (weight); -} - -/* - * Compute a segment-based weight for the specified metaslab. The weight - * is determined by highest bucket in the histogram. The information - * for the highest bucket is encoded into the weight value. - */ -static uint64_t -metaslab_segment_weight(metaslab_t *msp) -{ - metaslab_group_t *mg = msp->ms_group; - uint64_t weight = 0; - uint8_t shift = mg->mg_vd->vdev_ashift; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - /* - * The metaslab is completely free. 
- */ - if (metaslab_allocated_space(msp) == 0) { - int idx = highbit64(msp->ms_size) - 1; - int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; - - if (idx < max_idx) { - WEIGHT_SET_COUNT(weight, 1ULL); - WEIGHT_SET_INDEX(weight, idx); - } else { - WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); - WEIGHT_SET_INDEX(weight, max_idx); - } - WEIGHT_SET_ACTIVE(weight, 0); - ASSERT(!WEIGHT_IS_SPACEBASED(weight)); - - return (weight); - } - - ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); - - /* - * If the metaslab is fully allocated then just make the weight 0. - */ - if (metaslab_allocated_space(msp) == msp->ms_size) - return (0); - /* - * If the metaslab is already loaded, then use the range tree to - * determine the weight. Otherwise, we rely on the space map information - * to generate the weight. - */ - if (msp->ms_loaded) { - weight = metaslab_weight_from_range_tree(msp); - } else { - weight = metaslab_weight_from_spacemap(msp); - } - - /* - * If the metaslab was active the last time we calculated its weight - * then keep it active. We want to consume the entire region that - * is associated with this weight. - */ - if (msp->ms_activation_weight != 0 && weight != 0) - WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); - return (weight); -} - -/* - * Determine if we should attempt to allocate from this metaslab. If the - * metaslab has a maximum size then we can quickly determine if the desired - * allocation size can be satisfied. Otherwise, if we're using segment-based - * weighting then we can determine the maximum allocation that this metaslab - * can accommodate based on the index encoded in the weight. If we're using - * space-based weights then rely on the entire weight (excluding the weight - * type bit). - */ -boolean_t -metaslab_should_allocate(metaslab_t *msp, uint64_t asize) -{ - boolean_t should_allocate; - - if (msp->ms_max_size != 0) - return (msp->ms_max_size >= asize); - - if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - /* - * The metaslab segment weight indicates segments in the - * range [2^i, 2^(i+1)), where i is the index in the weight. - * Since the asize might be in the middle of the range, we - * should attempt the allocation if asize < 2^(i+1). - */ - should_allocate = (asize < - 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); - } else { - should_allocate = (asize <= - (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); - } - return (should_allocate); -} - -static uint64_t -metaslab_weight(metaslab_t *msp) -{ - vdev_t *vd = msp->ms_group->mg_vd; - spa_t *spa = vd->vdev_spa; - uint64_t weight; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - /* - * If this vdev is in the process of being removed, there is nothing - * for us to do here. - */ - if (vd->vdev_removing) - return (0); - - metaslab_set_fragmentation(msp); - - /* - * Update the maximum size if the metaslab is loaded. This will - * ensure that we get an accurate maximum size if newly freed space - * has been added back into the free tree. - */ - if (msp->ms_loaded) - msp->ms_max_size = metaslab_block_maxsize(msp); - else - ASSERT0(msp->ms_max_size); - - /* - * Segment-based weighting requires space map histogram support. 
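metaslab_should_allocate() above boils down to two comparisons, one per weight type. A small standalone sketch (not part of the patch; the segment index and the space-based free-space value are passed in directly instead of being decoded from the packed weight with WEIGHT_GET_INDEX() and the type bit):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* A segment weight with index i promises a free segment in [2^i, 2^(i+1)). */
static bool
segment_weight_allows(uint64_t asize, int index)
{
    return (asize < (1ULL << (index + 1)));
}

/* A space-based weight is simply the (scaled) amount of free space. */
static bool
space_weight_allows(uint64_t asize, uint64_t free_space)
{
    return (asize <= free_space);
}

int
main(void)
{
    /* A 96K request: allowed when the top bucket is 128K (index 17),
     * refused when a space-based weight reports only 64K free. */
    printf("segment: %d\n", segment_weight_allows(96 << 10, 17));
    printf("space:   %d\n", space_weight_allows(96 << 10, 64 << 10));
    return (0);
}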
- */ - if (zfs_metaslab_segment_weight_enabled && - spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && - (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == - sizeof (space_map_phys_t))) { - weight = metaslab_segment_weight(msp); - } else { - weight = metaslab_space_weight(msp); - } - return (weight); -} - -void -metaslab_recalculate_weight_and_sort(metaslab_t *msp) -{ - /* note: we preserve the mask (e.g. indication of primary, etc..) */ - uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - metaslab_group_sort(msp->ms_group, msp, - metaslab_weight(msp) | was_active); -} - -static int -metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, - int allocator, uint64_t activation_weight) -{ - /* - * If we're activating for the claim code, we don't want to actually - * set the metaslab up for a specific allocator. - */ - if (activation_weight == METASLAB_WEIGHT_CLAIM) - return (0); - metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? - mg->mg_primaries : mg->mg_secondaries); - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - mutex_enter(&mg->mg_lock); - if (arr[allocator] != NULL) { - mutex_exit(&mg->mg_lock); - return (EEXIST); - } - - arr[allocator] = msp; - ASSERT3S(msp->ms_allocator, ==, -1); - msp->ms_allocator = allocator; - msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); - mutex_exit(&mg->mg_lock); - - return (0); -} - -static int -metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = metaslab_load(msp); - if (error != 0) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); - } - if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { - /* - * The metaslab was activated for another allocator - * while we were waiting, we should reselect. - */ - return (EBUSY); - } - if ((error = metaslab_activate_allocator(msp->ms_group, msp, - allocator, activation_weight)) != 0) { - return (error); - } - - msp->ms_activation_weight = msp->ms_weight; - metaslab_group_sort(msp->ms_group, msp, - msp->ms_weight | activation_weight); - } - ASSERT(msp->ms_loaded); - ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); - - return (0); -} - -static void -metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, - uint64_t weight) -{ - ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { - metaslab_group_sort(mg, msp, weight); - return; - } - - mutex_enter(&mg->mg_lock); - ASSERT3P(msp->ms_group, ==, mg); - if (msp->ms_primary) { - ASSERT3U(0, <=, msp->ms_allocator); - ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); - ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); - ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); - mg->mg_primaries[msp->ms_allocator] = NULL; - } else { - ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); - ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); - mg->mg_secondaries[msp->ms_allocator] = NULL; - } - msp->ms_allocator = -1; - metaslab_group_sort_impl(mg, msp, weight); - mutex_exit(&mg->mg_lock); -} - -static void -metaslab_passivate(metaslab_t *msp, uint64_t weight) -{ - uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; - - /* - * If size < SPA_MINBLOCKSIZE, then we will not allocate from - * this metaslab again. In that case, it had better be empty, - * or we would be leaving space on the table. 
- */ - ASSERT(size >= SPA_MINBLOCKSIZE || - range_tree_is_empty(msp->ms_allocatable)); - ASSERT0(weight & METASLAB_ACTIVE_MASK); - - msp->ms_activation_weight = 0; - metaslab_passivate_allocator(msp->ms_group, msp, weight); - ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); -} - -/* - * Segment-based metaslabs are activated once and remain active until - * we either fail an allocation attempt (similar to space-based metaslabs) - * or have exhausted the free space in zfs_metaslab_switch_threshold - * buckets since the metaslab was activated. This function checks to see - * if we've exhaused the zfs_metaslab_switch_threshold buckets in the - * metaslab and passivates it proactively. This will allow us to select a - * metaslabs with larger contiguous region if any remaining within this - * metaslab group. If we're in sync pass > 1, then we continue using this - * metaslab so that we don't dirty more block and cause more sync passes. - */ -void -metaslab_segment_may_passivate(metaslab_t *msp) -{ - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - - if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) - return; - - /* - * Since we are in the middle of a sync pass, the most accurate - * information that is accessible to us is the in-core range tree - * histogram; calculate the new weight based on that information. - */ - uint64_t weight = metaslab_weight_from_range_tree(msp); - int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); - int current_idx = WEIGHT_GET_INDEX(weight); - - if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) - metaslab_passivate(msp, weight); -} - -static void -metaslab_preload(void *arg) -{ - metaslab_t *msp = arg; - spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - - ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); - - mutex_enter(&msp->ms_lock); - (void) metaslab_load(msp); - msp->ms_selected_txg = spa_syncing_txg(spa); - mutex_exit(&msp->ms_lock); -} - -static void -metaslab_group_preload(metaslab_group_t *mg) -{ - spa_t *spa = mg->mg_vd->vdev_spa; - metaslab_t *msp; - avl_tree_t *t = &mg->mg_metaslab_tree; - int m = 0; - - if (spa_shutting_down(spa) || !metaslab_preload_enabled) { - taskq_wait(mg->mg_taskq); - return; - } - - mutex_enter(&mg->mg_lock); - - /* - * Load the next potential metaslabs - */ - for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { - ASSERT3P(msp->ms_group, ==, mg); - - /* - * We preload only the maximum number of metaslabs specified - * by metaslab_preload_limit. If a metaslab is being forced - * to condense then we preload it too. This will ensure - * that force condensing happens in the next txg. - */ - if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { - continue; - } - - VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, - msp, TQ_SLEEP) != 0); - } - mutex_exit(&mg->mg_lock); -} - -/* - * Determine if the space map's on-disk footprint is past our tolerance - * for inefficiency. We would like to use the following criteria to make - * our decision: - * - * 1. The size of the space map object should not dramatically increase as a - * result of writing out the free space range tree. - * - * 2. The minimal on-disk space map representation is zfs_condense_pct/100 - * times the size than the free space range tree representation - * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). - * - * 3. The on-disk size of the space map should actually decrease. 
- * - * Unfortunately, we cannot compute the on-disk size of the space map in this - * context because we cannot accurately compute the effects of compression, etc. - * Instead, we apply the heuristic described in the block comment for - * zfs_metaslab_condense_block_threshold - we only condense if the space used - * is greater than a threshold number of blocks. - */ -static boolean_t -metaslab_should_condense(metaslab_t *msp) -{ - space_map_t *sm = msp->ms_sm; - vdev_t *vd = msp->ms_group->mg_vd; - uint64_t vdev_blocksize = 1 << vd->vdev_ashift; - uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(msp->ms_loaded); - - /* - * Allocations and frees in early passes are generally more space - * efficient (in terms of blocks described in space map entries) - * than the ones in later passes (e.g. we don't compress after - * sync pass 5) and condensing a metaslab multiple times in a txg - * could degrade performance. - * - * Thus we prefer condensing each metaslab at most once every txg at - * the earliest sync pass possible. If a metaslab is eligible for - * condensing again after being considered for condensing within the - * same txg, it will hopefully be dirty in the next txg where it will - * be condensed at an earlier pass. - */ - if (msp->ms_condense_checked_txg == current_txg) - return (B_FALSE); - msp->ms_condense_checked_txg = current_txg; - - /* - * We always condense metaslabs that are empty and metaslabs for - * which a condense request has been made. - */ - if (avl_is_empty(&msp->ms_allocatable_by_size) || - msp->ms_condense_wanted) - return (B_TRUE); - - uint64_t object_size = space_map_length(msp->ms_sm); - uint64_t optimal_size = space_map_estimate_optimal_size(sm, - msp->ms_allocatable, SM_NO_VDEVID); - - dmu_object_info_t doi; - dmu_object_info_from_db(sm->sm_dbuf, &doi); - uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - - return (object_size >= (optimal_size * zfs_condense_pct / 100) && - object_size > zfs_metaslab_condense_block_threshold * record_size); -} - -/* - * Condense the on-disk space map representation to its minimized form. - * The minimized form consists of a small number of allocations followed by - * the entries of the free range tree. - */ -static void -metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) -{ - range_tree_t *condense_tree; - space_map_t *sm = msp->ms_sm; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT(msp->ms_loaded); - - zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " - "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, - msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, - msp->ms_group->mg_vd->vdev_spa->spa_name, - space_map_length(msp->ms_sm), - avl_numnodes(&msp->ms_allocatable->rt_root), - msp->ms_condense_wanted ? "TRUE" : "FALSE"); - - msp->ms_condense_wanted = B_FALSE; - - /* - * Create an range tree that is 100% allocated. We remove segments - * that have been freed in this txg, any deferred frees that exist, - * and any allocation in the future. Removing segments should be - * a relatively inexpensive operation since we expect these trees to - * have a small number of nodes. 
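metaslab_should_condense() above reduces to a size comparison once the always-condense cases (an empty allocatable tree or an explicit condense request) are out of the way. A minimal standalone sketch of that test (not part of the patch; condense_pct stands in for zfs_condense_pct, block_threshold for zfs_metaslab_condense_block_threshold, and the numbers are made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
worth_condensing(uint64_t object_size, uint64_t optimal_size,
    uint64_t record_size, uint64_t condense_pct, uint64_t block_threshold)
{
    /* Condense only when the on-disk form is both relatively and
     * absolutely larger than the minimal representation would be. */
    return (object_size >= optimal_size * condense_pct / 100 &&
        object_size > block_threshold * record_size);
}

int
main(void)
{
    /* 2M on disk vs. a ~1M optimal form at pct=110 and 128K records. */
    printf("%d\n", worth_condensing(2 << 20, 1 << 20, 128 << 10, 110, 4));
    return (0);
}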
- */ - condense_tree = range_tree_create(NULL, NULL); - range_tree_add(condense_tree, msp->ms_start, msp->ms_size); - - range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); - range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defer[t], - range_tree_remove, condense_tree); - } - - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], - range_tree_remove, condense_tree); - } - - /* - * We're about to drop the metaslab's lock thus allowing - * other consumers to change it's content. Set the - * metaslab's ms_condensing flag to ensure that - * allocations on this metaslab do not occur while we're - * in the middle of committing it to disk. This is only critical - * for ms_allocatable as all other range trees use per txg - * views of their content. - */ - msp->ms_condensing = B_TRUE; - - mutex_exit(&msp->ms_lock); - space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); - - /* - * While we would ideally like to create a space map representation - * that consists only of allocation records, doing so can be - * prohibitively expensive because the in-core free tree can be - * large, and therefore computationally expensive to subtract - * from the condense_tree. Instead we sync out two trees, a cheap - * allocation only tree followed by the in-core free tree. While not - * optimal, this is typically close to optimal, and much cheaper to - * compute. - */ - space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); - range_tree_vacate(condense_tree, NULL, NULL); - range_tree_destroy(condense_tree); - - space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); - mutex_enter(&msp->ms_lock); - msp->ms_condensing = B_FALSE; -} - -/* - * Write a metaslab to disk in the context of the specified transaction group. - */ -void -metaslab_sync(metaslab_t *msp, uint64_t txg) -{ - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa_meta_objset(spa); - range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; - dmu_tx_t *tx; - uint64_t object = space_map_object(msp->ms_sm); - - ASSERT(!vd->vdev_ishole); - - /* - * This metaslab has just been added so there's no work to do now. - */ - if (msp->ms_freeing == NULL) { - ASSERT3P(alloctree, ==, NULL); - return; - } - - ASSERT3P(alloctree, !=, NULL); - ASSERT3P(msp->ms_freeing, !=, NULL); - ASSERT3P(msp->ms_freed, !=, NULL); - ASSERT3P(msp->ms_checkpointing, !=, NULL); - - /* - * Normally, we don't want to process a metaslab if there are no - * allocations or frees to perform. However, if the metaslab is being - * forced to condense and it's loaded, we need to let it through. - */ - if (range_tree_is_empty(alloctree) && - range_tree_is_empty(msp->ms_freeing) && - range_tree_is_empty(msp->ms_checkpointing) && - !(msp->ms_loaded && msp->ms_condense_wanted)) - return; - - - VERIFY(txg <= spa_final_dirty_txg(spa)); - - /* - * The only state that can actually be changing concurrently - * with metaslab_sync() is the metaslab's ms_allocatable. No - * other thread can be modifying this txg's alloc, freeing, - * freed, or space_map_phys_t. We drop ms_lock whenever we - * could call into the DMU, because the DMU can call down to - * us (e.g. via zio_free()) at any time. - * - * The spa_vdev_remove_thread() can be reading metaslab state - * concurrently, and it is locked out by the ms_sync_lock. 
- * Note that the ms_lock is insufficient for this, because it - * is dropped by space_map_write(). - */ - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - - if (msp->ms_sm == NULL) { - uint64_t new_object; - - new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); - VERIFY3U(new_object, !=, 0); - - VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, - msp->ms_start, msp->ms_size, vd->vdev_ashift)); - - ASSERT(msp->ms_sm != NULL); - ASSERT0(metaslab_allocated_space(msp)); - } - - if (!range_tree_is_empty(msp->ms_checkpointing) && - vd->vdev_checkpoint_sm == NULL) { - ASSERT(spa_has_checkpoint(spa)); - - uint64_t new_object = space_map_alloc(mos, - vdev_standard_sm_blksz, tx); - VERIFY3U(new_object, !=, 0); - - VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, - mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); - ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - - /* - * We save the space map object as an entry in vdev_top_zap - * so it can be retrieved when the pool is reopened after an - * export or through zdb. - */ - VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, - vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, - sizeof (new_object), 1, &new_object, tx)); - } - - mutex_enter(&msp->ms_sync_lock); - mutex_enter(&msp->ms_lock); - - /* - * Note: metaslab_condense() clears the space map's histogram. - * Therefore we must verify and remove this histogram before - * condensing. - */ - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - metaslab_group_histogram_remove(mg, msp); - - if (msp->ms_loaded && metaslab_should_condense(msp)) { - metaslab_condense(msp, txg, tx); - } else { - mutex_exit(&msp->ms_lock); - space_map_write(msp->ms_sm, alloctree, SM_ALLOC, - SM_NO_VDEVID, tx); - space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, - SM_NO_VDEVID, tx); - mutex_enter(&msp->ms_lock); - } - - msp->ms_allocated_space += range_tree_space(alloctree); - ASSERT3U(msp->ms_allocated_space, >=, - range_tree_space(msp->ms_freeing)); - msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); - - if (!range_tree_is_empty(msp->ms_checkpointing)) { - ASSERT(spa_has_checkpoint(spa)); - ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - - /* - * Since we are doing writes to disk and the ms_checkpointing - * tree won't be changing during that time, we drop the - * ms_lock while writing to the checkpoint space map. - */ - mutex_exit(&msp->ms_lock); - space_map_write(vd->vdev_checkpoint_sm, - msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); - mutex_enter(&msp->ms_lock); - - spa->spa_checkpoint_info.sci_dspace += - range_tree_space(msp->ms_checkpointing); - vd->vdev_stat.vs_checkpoint_space += - range_tree_space(msp->ms_checkpointing); - ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, - -space_map_allocated(vd->vdev_checkpoint_sm)); - - range_tree_vacate(msp->ms_checkpointing, NULL, NULL); - } - - if (msp->ms_loaded) { - /* - * When the space map is loaded, we have an accurate - * histogram in the range tree. This gives us an opportunity - * to bring the space map's histogram up-to-date so we clear - * it first before updating it. - */ - space_map_histogram_clear(msp->ms_sm); - space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); - - /* - * Since we've cleared the histogram we need to add back - * any free space that has already been processed, plus - * any deferred space. This allows the on-disk histogram - * to accurately reflect all free space even if some space - * is not yet available for allocation (i.e. deferred). 
- */ - space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); - - /* - * Add back any deferred free space that has not been - * added back into the in-core free tree yet. This will - * ensure that we don't end up with a space map histogram - * that is completely empty unless the metaslab is fully - * allocated. - */ - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - space_map_histogram_add(msp->ms_sm, - msp->ms_defer[t], tx); - } - } - - /* - * Always add the free space from this sync pass to the space - * map histogram. We want to make sure that the on-disk histogram - * accounts for all free space. If the space map is not loaded, - * then we will lose some accuracy but will correct it the next - * time we load the space map. - */ - space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); - metaslab_aux_histograms_update(msp); - - metaslab_group_histogram_add(mg, msp); - metaslab_group_histogram_verify(mg); - metaslab_class_histogram_verify(mg->mg_class); - - /* - * For sync pass 1, we avoid traversing this txg's free range tree - * and instead will just swap the pointers for freeing and freed. - * We can safely do this since the freed_tree is guaranteed to be - * empty on the initial pass. - */ - if (spa_sync_pass(spa) == 1) { - range_tree_swap(&msp->ms_freeing, &msp->ms_freed); - ASSERT0(msp->ms_allocated_this_txg); - } else { - range_tree_vacate(msp->ms_freeing, - range_tree_add, msp->ms_freed); - } - msp->ms_allocated_this_txg += range_tree_space(alloctree); - range_tree_vacate(alloctree, NULL, NULL); - - ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) - & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); - - mutex_exit(&msp->ms_lock); - - if (object != space_map_object(msp->ms_sm)) { - object = space_map_object(msp->ms_sm); - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - msp->ms_id, sizeof (uint64_t), &object, tx); - } - mutex_exit(&msp->ms_sync_lock); - dmu_tx_commit(tx); -} - -/* - * Called after a transaction group has completely synced to mark - * all of the metaslab's free space as usable. - */ -void -metaslab_sync_done(metaslab_t *msp, uint64_t txg) -{ - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - spa_t *spa = vd->vdev_spa; - range_tree_t **defer_tree; - int64_t alloc_delta, defer_delta; - boolean_t defer_allowed = B_TRUE; - - ASSERT(!vd->vdev_ishole); - - mutex_enter(&msp->ms_lock); - - /* - * If this metaslab is just becoming available, initialize its - * range trees and add its capacity to the vdev. 
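For a loaded metaslab, the code above clears the space map histogram and rebuilds it from the in-core trees (allocatable, freed, the defer trees and, unconditionally, this pass's freeing tree), so the on-disk histogram accounts for all free space even when part of it is not yet allocatable. A standalone sketch of that accumulation (not part of the patch; plain arrays stand in for the space map and range tree histograms, HIST_SIZE and NDEFER for SPACE_MAP_HISTOGRAM_SIZE and TXG_DEFER_SIZE):

#include <stdint.h>
#include <string.h>

#define HIST_SIZE   32
#define NDEFER      2

static void
rebuild_histogram(uint64_t ondisk[HIST_SIZE],
    const uint64_t allocatable[HIST_SIZE], const uint64_t freed[HIST_SIZE],
    const uint64_t freeing[HIST_SIZE], uint64_t defer[NDEFER][HIST_SIZE])
{
    memset(ondisk, 0, HIST_SIZE * sizeof (uint64_t));   /* histogram_clear */

    for (int i = 0; i < HIST_SIZE; i++) {
        ondisk[i] += allocatable[i];        /* space free right now */
        ondisk[i] += freed[i];              /* freed earlier this txg */
        ondisk[i] += freeing[i];            /* freed in this sync pass */
        for (int t = 0; t < NDEFER; t++)
            ondisk[i] += defer[t][i];       /* still held in deferral */
    }
}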
- */ - if (msp->ms_freed == NULL) { - for (int t = 0; t < TXG_SIZE; t++) { - ASSERT(msp->ms_allocating[t] == NULL); - - msp->ms_allocating[t] = range_tree_create(NULL, NULL); - } - - ASSERT3P(msp->ms_freeing, ==, NULL); - msp->ms_freeing = range_tree_create(NULL, NULL); - - ASSERT3P(msp->ms_freed, ==, NULL); - msp->ms_freed = range_tree_create(NULL, NULL); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT(msp->ms_defer[t] == NULL); - - msp->ms_defer[t] = range_tree_create(NULL, NULL); - } - - ASSERT3P(msp->ms_checkpointing, ==, NULL); - msp->ms_checkpointing = range_tree_create(NULL, NULL); - - metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); - } - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); - - defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; - - uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { - defer_allowed = B_FALSE; - } - - defer_delta = 0; - alloc_delta = msp->ms_allocated_this_txg - - range_tree_space(msp->ms_freed); - if (defer_allowed) { - defer_delta = range_tree_space(msp->ms_freed) - - range_tree_space(*defer_tree); - } else { - defer_delta -= range_tree_space(*defer_tree); - } - - metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, - defer_delta, 0); - - /* - * If there's a metaslab_load() in progress, wait for it to complete - * so that we have a consistent view of the in-core space map. - */ - metaslab_load_wait(msp); - - /* - * Move the frees from the defer_tree back to the free - * range tree (if it's loaded). Swap the freed_tree and - * the defer_tree -- this is safe to do because we've - * just emptied out the defer_tree. - */ - range_tree_vacate(*defer_tree, - msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); - if (defer_allowed) { - range_tree_swap(&msp->ms_freed, defer_tree); - } else { - range_tree_vacate(msp->ms_freed, - msp->ms_loaded ? range_tree_add : NULL, - msp->ms_allocatable); - } - - msp->ms_synced_length = space_map_length(msp->ms_sm); - - msp->ms_deferspace += defer_delta; - ASSERT3S(msp->ms_deferspace, >=, 0); - ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); - if (msp->ms_deferspace != 0) { - /* - * Keep syncing this metaslab until all deferred frees - * are back in circulation. - */ - vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); - } - metaslab_aux_histograms_update_done(msp, defer_allowed); - - if (msp->ms_new) { - msp->ms_new = B_FALSE; - mutex_enter(&mg->mg_lock); - mg->mg_ms_ready++; - mutex_exit(&mg->mg_lock); - } - - /* - * Re-sort metaslab within its group now that we've adjusted - * its allocatable space. - */ - metaslab_recalculate_weight_and_sort(msp); - - /* - * If the metaslab is loaded and we've not tried to load or allocate - * from it in 'metaslab_unload_delay' txgs, then unload it. 
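metaslab_sync_done() above derives two deltas from this txg's activity: alloc_delta charges the txg's allocations net of its frees, and defer_delta tracks space entering deferral versus deferred space being released back. A standalone worked example with made-up byte counts (not part of the patch; the real code feeds alloc_delta + defer_delta and defer_delta into metaslab_space_update()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    int64_t allocated_this_txg = 40 << 20;  /* 40M newly allocated */
    int64_t freed_this_txg = 10 << 20;      /* 10M freed this txg */
    int64_t old_defer_space = 6 << 20;      /* 6M leaving the defer tree */
    bool defer_allowed = true;              /* pool not low on space */

    int64_t alloc_delta = allocated_this_txg - freed_this_txg;
    int64_t defer_delta = defer_allowed ?
        freed_this_txg - old_defer_space : -old_defer_space;

    /* Prints alloc_delta = 30M, defer_delta = 4M for these inputs. */
    printf("alloc_delta = %lldM, defer_delta = %lldM\n",
        (long long)(alloc_delta >> 20), (long long)(defer_delta >> 20));
    return (0);
}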
- */ - if (msp->ms_loaded && - msp->ms_initializing == 0 && - msp->ms_selected_txg + metaslab_unload_delay < txg) { - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - VERIFY0(range_tree_space( - msp->ms_allocating[(txg + t) & TXG_MASK])); - } - if (msp->ms_allocator != -1) { - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - } - - if (!metaslab_debug_unload) - metaslab_unload(msp); - } - - ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_freed)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); - - msp->ms_allocated_this_txg = 0; - mutex_exit(&msp->ms_lock); -} - -void -metaslab_sync_reassess(metaslab_group_t *mg) -{ - spa_t *spa = mg->mg_class->mc_spa; - - spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - metaslab_group_alloc_update(mg); - mg->mg_fragmentation = metaslab_group_fragmentation(mg); - - /* - * Preload the next potential metaslabs but only on active - * metaslab groups. We can get into a state where the metaslab - * is no longer active since we dirty metaslabs as we remove a - * a device, thus potentially making the metaslab group eligible - * for preloading. - */ - if (mg->mg_activation_count > 0) { - metaslab_group_preload(mg); - } - spa_config_exit(spa, SCL_ALLOC, FTAG); -} - -/* - * When writing a ditto block (i.e. more than one DVA for a given BP) on - * the same vdev as an existing DVA of this BP, then try to allocate it - * on a different metaslab than existing DVAs (i.e. a unique metaslab). - */ -static boolean_t -metaslab_is_unique(metaslab_t *msp, dva_t *dva) -{ - uint64_t dva_ms_id; - - if (DVA_GET_ASIZE(dva) == 0) - return (B_TRUE); - - if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) - return (B_TRUE); - - dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; - - return (msp->ms_id != dva_ms_id); -} - -/* - * ========================================================================== - * Metaslab allocation tracing facility - * ========================================================================== - */ -#ifdef _METASLAB_TRACING -kstat_t *metaslab_trace_ksp; -kstat_named_t metaslab_trace_over_limit; - -void -metaslab_alloc_trace_init(void) -{ - ASSERT(metaslab_alloc_trace_cache == NULL); - metaslab_alloc_trace_cache = kmem_cache_create( - "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", - "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); - if (metaslab_trace_ksp != NULL) { - metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; - kstat_named_init(&metaslab_trace_over_limit, - "metaslab_trace_over_limit", KSTAT_DATA_UINT64); - kstat_install(metaslab_trace_ksp); - } -} - -void -metaslab_alloc_trace_fini(void) -{ - if (metaslab_trace_ksp != NULL) { - kstat_delete(metaslab_trace_ksp); - metaslab_trace_ksp = NULL; - } - kmem_cache_destroy(metaslab_alloc_trace_cache); - metaslab_alloc_trace_cache = NULL; -} - -/* - * Add an allocation trace element to the allocation tracing list. - */ -static void -metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, - metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, - int allocator) -{ - if (!metaslab_trace_enabled) - return; - - /* - * When the tracing list reaches its maximum we remove - * the second element in the list before adding a new one. 
- * By removing the second element we preserve the original - * entry as a clue to what allocations steps have already been - * performed. - */ - if (zal->zal_size == metaslab_trace_max_entries) { - metaslab_alloc_trace_t *mat_next; -#ifdef DEBUG - panic("too many entries in allocation list"); -#endif - atomic_inc_64(&metaslab_trace_over_limit.value.ui64); - zal->zal_size--; - mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); - list_remove(&zal->zal_list, mat_next); - kmem_cache_free(metaslab_alloc_trace_cache, mat_next); - } - - metaslab_alloc_trace_t *mat = - kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); - list_link_init(&mat->mat_list_node); - mat->mat_mg = mg; - mat->mat_msp = msp; - mat->mat_size = psize; - mat->mat_dva_id = dva_id; - mat->mat_offset = offset; - mat->mat_weight = 0; - mat->mat_allocator = allocator; - - if (msp != NULL) - mat->mat_weight = msp->ms_weight; - - /* - * The list is part of the zio so locking is not required. Only - * a single thread will perform allocations for a given zio. - */ - list_insert_tail(&zal->zal_list, mat); - zal->zal_size++; - - ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); -} - -void -metaslab_trace_init(zio_alloc_list_t *zal) -{ - list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), - offsetof(metaslab_alloc_trace_t, mat_list_node)); - zal->zal_size = 0; -} - -void -metaslab_trace_fini(zio_alloc_list_t *zal) -{ - metaslab_alloc_trace_t *mat; - - while ((mat = list_remove_head(&zal->zal_list)) != NULL) - kmem_cache_free(metaslab_alloc_trace_cache, mat); - list_destroy(&zal->zal_list); - zal->zal_size = 0; -} - -#else - -#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) - -void -metaslab_alloc_trace_init(void) -{ -} - -void -metaslab_alloc_trace_fini(void) -{ -} - -void -metaslab_trace_init(zio_alloc_list_t *zal) -{ -} - -void -metaslab_trace_fini(zio_alloc_list_t *zal) -{ -} - -#endif /* _METASLAB_TRACING */ - -/* - * ========================================================================== - * Metaslab block operations - * ========================================================================== - */ - -static void -metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, - int allocator) -{ - if (!(flags & METASLAB_ASYNC_ALLOC) || - (flags & METASLAB_DONT_THROTTLE)) - return; - - metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - if (!mg->mg_class->mc_alloc_throttle_enabled) - return; - - (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); -} - -static void -metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) -{ - uint64_t max = mg->mg_max_alloc_queue_depth; - uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; - while (cur < max) { - if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], - cur, cur + 1) == cur) { - atomic_inc_64( - &mg->mg_class->mc_alloc_max_slots[allocator]); - return; - } - cur = mg->mg_cur_max_alloc_queue_depth[allocator]; - } -} - -void -metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, - int allocator, boolean_t io_complete) -{ - if (!(flags & METASLAB_ASYNC_ALLOC) || - (flags & METASLAB_DONT_THROTTLE)) - return; - - metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - if (!mg->mg_class->mc_alloc_throttle_enabled) - return; - - (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); - if (io_complete) - metaslab_group_increment_qdepth(mg, allocator); -} - -void -metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, 
void *tag, - int allocator) -{ -#ifdef ZFS_DEBUG - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - - for (int d = 0; d < ndvas; d++) { - uint64_t vdev = DVA_GET_VDEV(&dva[d]); - metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - VERIFY(zfs_refcount_not_held( - &mg->mg_alloc_queue_depth[allocator], tag)); - } -#endif -} - -static uint64_t -metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) -{ - uint64_t start; - range_tree_t *rt = msp->ms_allocatable; - metaslab_class_t *mc = msp->ms_group->mg_class; - - VERIFY(!msp->ms_condensing); - VERIFY0(msp->ms_initializing); - - start = mc->mc_ops->msop_alloc(msp, size); - if (start != -1ULL) { - metaslab_group_t *mg = msp->ms_group; - vdev_t *vd = mg->mg_vd; - - VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); - range_tree_remove(rt, start, size); - - if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - - range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); - - /* Track the last successful allocation */ - msp->ms_alloc_txg = txg; - metaslab_verify_space(msp, txg); - } - - /* - * Now that we've attempted the allocation we need to update the - * metaslab's maximum block size since it may have changed. - */ - msp->ms_max_size = metaslab_block_maxsize(msp); - return (start); -} - -/* - * Find the metaslab with the highest weight that is less than what we've - * already tried. In the common case, this means that we will examine each - * metaslab at most once. Note that concurrent callers could reorder metaslabs - * by activation/passivation once we have dropped the mg_lock. If a metaslab is - * activated by another thread, and we fail to allocate from the metaslab we - * have selected, we may not try the newly-activated metaslab, and instead - * activate another metaslab. This is not optimal, but generally does not cause - * any problems (a possible exception being if every metaslab is completely full - * except for the the newly-activated metaslab which we fail to examine). - */ -static metaslab_t * -find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, - dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, - zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) -{ - avl_index_t idx; - avl_tree_t *t = &mg->mg_metaslab_tree; - metaslab_t *msp = avl_find(t, search, &idx); - if (msp == NULL) - msp = avl_nearest(t, idx, AVL_AFTER); - - for (; msp != NULL; msp = AVL_NEXT(t, msp)) { - int i; - if (!metaslab_should_allocate(msp, asize)) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL, allocator); - continue; - } - - /* - * If the selected metaslab is condensing or being - * initialized, skip it. - */ - if (msp->ms_condensing || msp->ms_initializing > 0) - continue; - - *was_active = msp->ms_allocator != -1; - /* - * If we're activating as primary, this is our first allocation - * from this disk, so we don't need to check how close we are. - * If the metaslab under consideration was already active, - * we're getting desperate enough to steal another allocator's - * metaslab, so we still don't care about distances. 
- */ - if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) - break; - - for (i = 0; i < d; i++) { - if (want_unique && - !metaslab_is_unique(msp, &dva[i])) - break; /* try another metaslab */ - } - if (i == d) - break; - } - - if (msp != NULL) { - search->ms_weight = msp->ms_weight; - search->ms_start = msp->ms_start + 1; - search->ms_allocator = msp->ms_allocator; - search->ms_primary = msp->ms_primary; - } - return (msp); -} - -/* ARGSUSED */ -static uint64_t -metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) -{ - metaslab_t *msp = NULL; - uint64_t offset = -1ULL; - uint64_t activation_weight; - - activation_weight = METASLAB_WEIGHT_PRIMARY; - for (int i = 0; i < d; i++) { - if (activation_weight == METASLAB_WEIGHT_PRIMARY && - DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { - activation_weight = METASLAB_WEIGHT_SECONDARY; - } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && - DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { - activation_weight = METASLAB_WEIGHT_CLAIM; - break; - } - } - - /* - * If we don't have enough metaslabs active to fill the entire array, we - * just use the 0th slot. - */ - if (mg->mg_ms_ready < mg->mg_allocators * 3) - allocator = 0; - - ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); - - metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); - search->ms_weight = UINT64_MAX; - search->ms_start = 0; - /* - * At the end of the metaslab tree are the already-active metaslabs, - * first the primaries, then the secondaries. When we resume searching - * through the tree, we need to consider ms_allocator and ms_primary so - * we start in the location right after where we left off, and don't - * accidentally loop forever considering the same metaslabs. - */ - search->ms_allocator = -1; - search->ms_primary = B_TRUE; - for (;;) { - boolean_t was_active = B_FALSE; - - mutex_enter(&mg->mg_lock); - - if (activation_weight == METASLAB_WEIGHT_PRIMARY && - mg->mg_primaries[allocator] != NULL) { - msp = mg->mg_primaries[allocator]; - was_active = B_TRUE; - } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && - mg->mg_secondaries[allocator] != NULL) { - msp = mg->mg_secondaries[allocator]; - was_active = B_TRUE; - } else { - msp = find_valid_metaslab(mg, activation_weight, dva, d, - want_unique, asize, allocator, zal, search, - &was_active); - } - - mutex_exit(&mg->mg_lock); - if (msp == NULL) { - kmem_free(search, sizeof (*search)); - return (-1ULL); - } - - mutex_enter(&msp->ms_lock); - /* - * Ensure that the metaslab we have selected is still - * capable of handling our request. It's possible that - * another thread may have changed the weight while we - * were blocked on the metaslab lock. We check the - * active status first to see if we need to reselect - * a new metaslab. - */ - if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { - mutex_exit(&msp->ms_lock); - continue; - } - - /* - * If the metaslab is freshly activated for an allocator that - * isn't the one we're allocating from, or if it's a primary and - * we're seeking a secondary (or vice versa), we go back and - * select a new metaslab. 
- */ - if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && - (msp->ms_allocator != -1) && - (msp->ms_allocator != allocator || ((activation_weight == - METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { - mutex_exit(&msp->ms_lock); - continue; - } - - if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && - activation_weight != METASLAB_WEIGHT_CLAIM) { - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_WEIGHT_CLAIM); - mutex_exit(&msp->ms_lock); - continue; - } - - if (metaslab_activate(msp, allocator, activation_weight) != 0) { - mutex_exit(&msp->ms_lock); - continue; - } - - msp->ms_selected_txg = txg; - - /* - * Now that we have the lock, recheck to see if we should - * continue to use this metaslab for this allocation. The - * the metaslab is now loaded so metaslab_should_allocate() can - * accurately determine if the allocation attempt should - * proceed. - */ - if (!metaslab_should_allocate(msp, asize)) { - /* Passivate this metaslab and select a new one. */ - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL, allocator); - goto next; - } - - /* - * If this metaslab is currently condensing then pick again as - * we can't manipulate this metaslab until it's committed - * to disk. If this metaslab is being initialized, we shouldn't - * allocate from it since the allocated region might be - * overwritten after allocation. - */ - if (msp->ms_condensing) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_CONDENSING, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - mutex_exit(&msp->ms_lock); - continue; - } else if (msp->ms_initializing > 0) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_INITIALIZING, allocator); - metaslab_passivate(msp, msp->ms_weight & - ~METASLAB_ACTIVE_MASK); - mutex_exit(&msp->ms_lock); - continue; - } - - offset = metaslab_block_alloc(msp, asize, txg); - metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); - - if (offset != -1ULL) { - /* Proactively passivate the metaslab, if needed */ - metaslab_segment_may_passivate(msp); - break; - } -next: - ASSERT(msp->ms_loaded); - - /* - * We were unable to allocate from this metaslab so determine - * a new weight for this metaslab. Now that we have loaded - * the metaslab we can provide a better hint to the metaslab - * selector. - * - * For space-based metaslabs, we use the maximum block size. - * This information is only available when the metaslab - * is loaded and is more accurate than the generic free - * space weight that was calculated by metaslab_weight(). - * This information allows us to quickly compare the maximum - * available allocation in the metaslab to the allocation - * size being requested. - * - * For segment-based metaslabs, determine the new weight - * based on the highest bucket in the range tree. We - * explicitly use the loaded segment weight (i.e. the range - * tree histogram) since it contains the space that is - * currently available for allocation and is accurate - * even within a sync pass. - */ - if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - uint64_t weight = metaslab_block_maxsize(msp); - WEIGHT_SET_SPACEBASED(weight); - metaslab_passivate(msp, weight); - } else { - metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); - } - - /* - * We have just failed an allocation attempt, check - * that metaslab_should_allocate() agrees. Otherwise, - * we may end up in an infinite loop retrying the same - * metaslab. 
- */ - ASSERT(!metaslab_should_allocate(msp, asize)); - - mutex_exit(&msp->ms_lock); - } - mutex_exit(&msp->ms_lock); - kmem_free(search, sizeof (*search)); - return (offset); -} - -static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, - int d, int allocator) -{ - uint64_t offset; - ASSERT(mg->mg_initialized); - - offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, - dva, d, allocator); - - mutex_enter(&mg->mg_lock); - if (offset == -1ULL) { - mg->mg_failed_allocations++; - metaslab_trace_add(zal, mg, NULL, asize, d, - TRACE_GROUP_FAILURE, allocator); - if (asize == SPA_GANGBLOCKSIZE) { - /* - * This metaslab group was unable to allocate - * the minimum gang block size so it must be out of - * space. We must notify the allocation throttle - * to start skipping allocation attempts to this - * metaslab group until more space becomes available. - * Note: this failure cannot be caused by the - * allocation throttle since the allocation throttle - * is only responsible for skipping devices and - * not failing block allocations. - */ - mg->mg_no_free_space = B_TRUE; - } - } - mg->mg_allocations++; - mutex_exit(&mg->mg_lock); - return (offset); -} - -/* - * Allocate a block for the specified i/o. - */ -int -metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, - zio_alloc_list_t *zal, int allocator) -{ - metaslab_group_t *mg, *rotor; - vdev_t *vd; - boolean_t try_hard = B_FALSE; - - ASSERT(!DVA_IS_VALID(&dva[d])); - - /* - * For testing, make some blocks above a certain size be gang blocks. - * This will also test spilling from special to normal. - */ - if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { - metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, - allocator); - return (SET_ERROR(ENOSPC)); - } - - /* - * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_aliquot because - * nothing actually breaks if we miss a few updates -- we just won't - * allocate quite as evenly. It all balances out over time. - * - * If we are doing ditto or log blocks, try to spread them across - * consecutive vdevs. If we're forced to reuse a vdev before we've - * allocated all of our ditto blocks, then try and spread them out on - * that vdev as much as possible. If it turns out to not be possible, - * gradually lower our standards until anything becomes acceptable. - * Also, allocating on consecutive vdevs (as opposed to random vdevs) - * gives us hope of containing our fault domains to something we're - * able to reason about. Otherwise, any two top-level vdev failures - * will guarantee the loss of data. With consecutive allocation, - * only two adjacent top-level vdev failures will result in data loss. - * - * If we are doing gang blocks (hintdva is non-NULL), try to keep - * ourselves on the same vdev as our gang block header. That - * way, we can hope for locality in vdev_cache, plus it makes our - * fault domains something tractable. - */ - if (hintdva) { - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - - /* - * It's possible the vdev we're using as the hint no - * longer exists or its mg has been closed (e.g. by - * device removal). Consult the rotor when - * all else fails. 
- */ - if (vd != NULL && vd->vdev_mg != NULL) { - mg = vd->vdev_mg; - - if (flags & METASLAB_HINTBP_AVOID && - mg->mg_next != NULL) - mg = mg->mg_next; - } else { - mg = mc->mc_rotor; - } - } else if (d != 0) { - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); - mg = vd->vdev_mg->mg_next; - } else { - ASSERT(mc->mc_rotor != NULL); - mg = mc->mc_rotor; - } - - /* - * If the hint put us into the wrong metaslab class, or into a - * metaslab group that has been passivated, just follow the rotor. - */ - if (mg->mg_class != mc || mg->mg_activation_count <= 0) - mg = mc->mc_rotor; - - rotor = mg; -top: - do { - boolean_t allocatable; - - ASSERT(mg->mg_activation_count == 1); - vd = mg->mg_vd; - - /* - * Don't allocate from faulted devices. - */ - if (try_hard) { - spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); - allocatable = vdev_allocatable(vd); - spa_config_exit(spa, SCL_ZIO, FTAG); - } else { - allocatable = vdev_allocatable(vd); - } - - /* - * Determine if the selected metaslab group is eligible - * for allocations. If we're ganging then don't allow - * this metaslab group to skip allocations since that would - * inadvertently return ENOSPC and suspend the pool - * even though space is still available. - */ - if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { - allocatable = metaslab_group_allocatable(mg, rotor, - psize, allocator, d); - } - - if (!allocatable) { - metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_NOT_ALLOCATABLE, allocator); - goto next; - } - - ASSERT(mg->mg_initialized); - - /* - * Avoid writing single-copy data to a failing, - * non-redundant vdev, unless we've already tried all - * other vdevs. - */ - if ((vd->vdev_stat.vs_write_errors > 0 || - vd->vdev_state < VDEV_STATE_HEALTHY) && - d == 0 && !try_hard && vd->vdev_children == 0) { - metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_VDEV_ERROR, allocator); - goto next; - } - - ASSERT(mg->mg_class == mc); - - uint64_t asize = vdev_psize_to_asize(vd, psize); - ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - - /* - * If we don't need to try hard, then require that the - * block be on an different metaslab from any other DVAs - * in this BP (unique=true). If we are trying hard, then - * allow any metaslab to be used (unique=false). - */ - uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - !try_hard, dva, d, allocator); - - if (offset != -1ULL) { - /* - * If we've just selected this metaslab group, - * figure out whether the corresponding vdev is - * over- or under-used relative to the pool, - * and set an allocation bias to even it out. - */ - if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { - vdev_stat_t *vs = &vd->vdev_stat; - int64_t vu, cu; - - vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); - cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); - - /* - * Calculate how much more or less we should - * try to allocate from this device during - * this iteration around the rotor. - * For example, if a device is 80% full - * and the pool is 20% full then we should - * reduce allocations by 60% on this device. - * - * mg_bias = (20 - 80) * 512K / 100 = -307K - * - * This reduces allocations by 307K for this - * iteration. 
- */ - mg->mg_bias = ((cu - vu) * - (int64_t)mg->mg_aliquot) / 100; - } else if (!metaslab_bias_enabled) { - mg->mg_bias = 0; - } - - if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= - mg->mg_aliquot + mg->mg_bias) { - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; - } - - DVA_SET_VDEV(&dva[d], vd->vdev_id); - DVA_SET_OFFSET(&dva[d], offset); - DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); - DVA_SET_ASIZE(&dva[d], asize); - - return (0); - } -next: - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; - } while ((mg = mg->mg_next) != rotor); - - /* - * If we haven't tried hard, do so now. - */ - if (!try_hard) { - try_hard = B_TRUE; - goto top; - } - - bzero(&dva[d], sizeof (dva_t)); - - metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); - return (SET_ERROR(ENOSPC)); -} - -void -metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, - boolean_t checkpoint) -{ - metaslab_t *msp; - spa_t *spa = vd->vdev_spa; - - ASSERT(vdev_is_concrete(vd)); - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - VERIFY(!msp->ms_condensing); - VERIFY3U(offset, >=, msp->ms_start); - VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); - - metaslab_check_free_impl(vd, offset, asize); - - mutex_enter(&msp->ms_lock); - if (range_tree_is_empty(msp->ms_freeing) && - range_tree_is_empty(msp->ms_checkpointing)) { - vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); - } - - if (checkpoint) { - ASSERT(spa_has_checkpoint(spa)); - range_tree_add(msp->ms_checkpointing, offset, asize); - } else { - range_tree_add(msp->ms_freeing, offset, asize); - } - mutex_exit(&msp->ms_lock); -} - -/* ARGSUSED */ -void -metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - boolean_t *checkpoint = arg; - - ASSERT3P(checkpoint, !=, NULL); - - if (vd->vdev_ops->vdev_op_remap != NULL) - vdev_indirect_mark_obsolete(vd, offset, size); - else - metaslab_free_impl(vd, offset, size, *checkpoint); -} - -static void -metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, - boolean_t checkpoint) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - - if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) - return; - - if (spa->spa_vdev_removal != NULL && - spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && - vdev_is_concrete(vd)) { - /* - * Note: we check if the vdev is concrete because when - * we complete the removal, we first change the vdev to be - * an indirect vdev (in open context), and then (in syncing - * context) clear spa_vdev_removal. - */ - free_from_removing_vdev(vd, offset, size); - } else if (vd->vdev_ops->vdev_op_remap != NULL) { - vdev_indirect_mark_obsolete(vd, offset, size); - vd->vdev_ops->vdev_op_remap(vd, offset, size, - metaslab_free_impl_cb, &checkpoint); - } else { - metaslab_free_concrete(vd, offset, size, checkpoint); - } -} - -typedef struct remap_blkptr_cb_arg { - blkptr_t *rbca_bp; - spa_remap_cb_t rbca_cb; - vdev_t *rbca_remap_vd; - uint64_t rbca_remap_offset; - void *rbca_cb_arg; -} remap_blkptr_cb_arg_t; - -void -remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - remap_blkptr_cb_arg_t *rbca = arg; - blkptr_t *bp = rbca->rbca_bp; - - /* We can not remap split blocks. 
*/ - if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) - return; - ASSERT0(inner_offset); - - if (rbca->rbca_cb != NULL) { - /* - * At this point we know that we are not handling split - * blocks and we invoke the callback on the previous - * vdev which must be indirect. - */ - ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); - - rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, - rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); - - /* set up remap_blkptr_cb_arg for the next call */ - rbca->rbca_remap_vd = vd; - rbca->rbca_remap_offset = offset; - } - - /* - * The phys birth time is that of dva[0]. This ensures that we know - * when each dva was written, so that resilver can determine which - * blocks need to be scrubbed (i.e. those written during the time - * the vdev was offline). It also ensures that the key used in - * the ARC hash table is unique (i.e. dva[0] + phys_birth). If - * we didn't change the phys_birth, a lookup in the ARC for a - * remapped BP could find the data that was previously stored at - * this vdev + offset. - */ - vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, - DVA_GET_VDEV(&bp->blk_dva[0])); - vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; - bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, - DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); - - DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); - DVA_SET_OFFSET(&bp->blk_dva[0], offset); -} - -/* - * If the block pointer contains any indirect DVAs, modify them to refer to - * concrete DVAs. Note that this will sometimes not be possible, leaving - * the indirect DVA in place. This happens if the indirect DVA spans multiple - * segments in the mapping (i.e. it is a "split block"). - * - * If the BP was remapped, calls the callback on the original dva (note the - * callback can be called multiple times if the original indirect DVA refers - * to another indirect DVA, etc). - * - * Returns TRUE if the BP was remapped. - */ -boolean_t -spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) -{ - remap_blkptr_cb_arg_t rbca; - - if (!zfs_remap_blkptr_enable) - return (B_FALSE); - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) - return (B_FALSE); - - /* - * Dedup BP's can not be remapped, because ddt_phys_select() depends - * on DVA[0] being the same in the BP as in the DDT (dedup table). - */ - if (BP_GET_DEDUP(bp)) - return (B_FALSE); - - /* - * Gang blocks can not be remapped, because - * zio_checksum_gang_verifier() depends on the DVA[0] that's in - * the BP used to read the gang block header (GBH) being the same - * as the DVA[0] that we allocated for the GBH. - */ - if (BP_IS_GANG(bp)) - return (B_FALSE); - - /* - * Embedded BP's have no DVA to remap. - */ - if (BP_GET_NDVAS(bp) < 1) - return (B_FALSE); - - /* - * Note: we only remap dva[0]. If we remapped other dvas, we - * would no longer know what their phys birth txg is. - */ - dva_t *dva = &bp->blk_dva[0]; - - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); - - if (vd->vdev_ops->vdev_op_remap == NULL) - return (B_FALSE); - - rbca.rbca_bp = bp; - rbca.rbca_cb = callback; - rbca.rbca_remap_vd = vd; - rbca.rbca_remap_offset = offset; - rbca.rbca_cb_arg = arg; - - /* - * remap_blkptr_cb() will be called in order for each level of - * indirection, until a concrete vdev is reached or a split block is - * encountered. 
old_vd and old_offset are updated within the callback - * as we go from the one indirect vdev to the next one (either concrete - * or indirect again) in that order. - */ - vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); - - /* Check if the DVA wasn't remapped because it is a split block */ - if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) - return (B_FALSE); - - return (B_TRUE); -} - -/* - * Undo the allocation of a DVA which happened in the given transaction group. - */ -void -metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) -{ - metaslab_t *msp; - vdev_t *vd; - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - - ASSERT(DVA_IS_VALID(dva)); - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - - if (txg > spa_freeze_txg(spa)) - return; - - if ((vd = vdev_lookup_top(spa, vdev)) == NULL || - (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { - cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", - (u_longlong_t)vdev, (u_longlong_t)offset); - ASSERT(0); - return; - } - - ASSERT(!vd->vdev_removing); - ASSERT(vdev_is_concrete(vd)); - ASSERT0(vd->vdev_indirect_config.vic_mapping_object); - ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); - - if (DVA_GET_GANG(dva)) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - mutex_enter(&msp->ms_lock); - range_tree_remove(msp->ms_allocating[txg & TXG_MASK], - offset, size); - - VERIFY(!msp->ms_condensing); - VERIFY3U(offset, >=, msp->ms_start); - VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); - VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, - msp->ms_size); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - range_tree_add(msp->ms_allocatable, offset, size); - mutex_exit(&msp->ms_lock); -} - -/* - * Free the block represented by the given DVA. - */ -void -metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) -{ - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd = vdev_lookup_top(spa, vdev); - - ASSERT(DVA_IS_VALID(dva)); - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - - if (DVA_GET_GANG(dva)) { - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - } - - metaslab_free_impl(vd, offset, size, checkpoint); -} - -/* - * Reserve some allocation slots. The reservation system must be called - * before we call into the allocator. If there aren't any available slots - * then the I/O will be throttled until an I/O completes and its slots are - * freed up. The function returns true if it was successful in placing - * the reservation. - */ -boolean_t -metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, - zio_t *zio, int flags) -{ - uint64_t available_slots = 0; - boolean_t slot_reserved = B_FALSE; - uint64_t max = mc->mc_alloc_max_slots[allocator]; - - ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); - - uint64_t reserved_slots = - zfs_refcount_count(&mc->mc_alloc_slots[allocator]); - if (reserved_slots < max) - available_slots = max - reserved_slots; - - if (slots <= available_slots || GANG_ALLOCATION(flags) || - flags & METASLAB_MUST_RESERVE) { - /* - * We reserve the slots individually so that we can unreserve - * them individually when an I/O completes. 
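
The reservation logic described here boils down to a bounded counter that is taken and released one slot at a time, so each completing I/O can later give back exactly its own slots. A minimal single-threaded model of that behaviour, with hypothetical names and the refcounts and per-allocator arrays elided:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Minimal model of the allocation throttle: a fixed number of slots,
 * reserved one at a time so they can be released one at a time as
 * individual I/Os complete.
 */
struct throttle {
	uint64_t max_slots;
	uint64_t reserved;
};

static bool
throttle_reserve(struct throttle *t, int slots, bool force)
{
	uint64_t available = 0;

	if (t->reserved < t->max_slots)
		available = t->max_slots - t->reserved;

	/* 'force' stands in for gang / must-reserve allocations. */
	if ((uint64_t)slots > available && !force)
		return (false);

	for (int i = 0; i < slots; i++)
		t->reserved++;
	return (true);
}

static void
throttle_unreserve(struct throttle *t, int slots)
{
	for (int i = 0; i < slots; i++)
		t->reserved--;
}

int
main(void)
{
	struct throttle t = { .max_slots = 4, .reserved = 0 };

	printf("reserve 3: %d\n", throttle_reserve(&t, 3, false)); /* ok */
	printf("reserve 3: %d\n", throttle_reserve(&t, 3, false)); /* full */
	printf("reserve 3 forced: %d\n", throttle_reserve(&t, 3, true));
	throttle_unreserve(&t, 6);
	printf("reserved after unreserve: %llu\n",
	    (unsigned long long)t.reserved);
	return (0);
}
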
- */ - for (int d = 0; d < slots; d++) { - reserved_slots = - zfs_refcount_add(&mc->mc_alloc_slots[allocator], - zio); - } - zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; - slot_reserved = B_TRUE; - } - - mutex_exit(&mc->mc_lock); - return (slot_reserved); -} - -void -metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, - int allocator, zio_t *zio) -{ - ASSERT(mc->mc_alloc_throttle_enabled); - mutex_enter(&mc->mc_lock); - for (int d = 0; d < slots; d++) { - (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], - zio); - } - mutex_exit(&mc->mc_lock); -} - -static int -metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, - uint64_t txg) -{ - metaslab_t *msp; - spa_t *spa = vd->vdev_spa; - int error = 0; - - if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) - return (ENXIO); - - ASSERT3P(vd->vdev_ms, !=, NULL); - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - mutex_enter(&msp->ms_lock); - - if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); - /* - * No need to fail in that case; someone else has activated the - * metaslab, but that doesn't preclude us from using it. - */ - if (error == EBUSY) - error = 0; - - if (error == 0 && - !range_tree_contains(msp->ms_allocatable, offset, size)) - error = SET_ERROR(ENOENT); - - if (error || txg == 0) { /* txg == 0 indicates dry run */ - mutex_exit(&msp->ms_lock); - return (error); - } - - VERIFY(!msp->ms_condensing); - VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); - VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, - msp->ms_size); - range_tree_remove(msp->ms_allocatable, offset, size); - - if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) - vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_allocating[txg & TXG_MASK], - offset, size); - } - - mutex_exit(&msp->ms_lock); - - return (0); -} - -typedef struct metaslab_claim_cb_arg_t { - uint64_t mcca_txg; - int mcca_error; -} metaslab_claim_cb_arg_t; - -/* ARGSUSED */ -static void -metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - metaslab_claim_cb_arg_t *mcca_arg = arg; - - if (mcca_arg->mcca_error == 0) { - mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, - size, mcca_arg->mcca_txg); - } -} - -int -metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) -{ - if (vd->vdev_ops->vdev_op_remap != NULL) { - metaslab_claim_cb_arg_t arg; - - /* - * Only zdb(1M) can claim on indirect vdevs. This is used - * to detect leaks of mapped space (that are not accounted - * for in the obsolete counts, spacemap, or bpobj). - */ - ASSERT(!spa_writeable(vd->vdev_spa)); - arg.mcca_error = 0; - arg.mcca_txg = txg; - - vd->vdev_ops->vdev_op_remap(vd, offset, size, - metaslab_claim_impl_cb, &arg); - - if (arg.mcca_error == 0) { - arg.mcca_error = metaslab_claim_concrete(vd, - offset, size, txg); - } - return (arg.mcca_error); - } else { - return (metaslab_claim_concrete(vd, offset, size, txg)); - } -} - -/* - * Intent log support: upon opening the pool after a crash, notify the SPA - * of blocks that the intent log has allocated for immediate write, but - * which are still considered free by the SPA because the last transaction - * group didn't commit yet. 
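
The claim path below follows a validate-then-commit shape: metaslab_claim() with a non-zero txg first calls itself with txg 0 as a dry run, so a DVA that cannot be claimed never leaves earlier DVAs half-claimed. A standalone sketch of that shape, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

/*
 * Pass 1 only checks that every item can be claimed; pass 2 actually
 * claims them, so a failure never leaves a partially-claimed set behind.
 */
#define	NITEMS	3

static bool
claim_one(int item, bool dry_run)
{
	bool ok = (item != 2);	/* pretend item 2 is not claimable */

	if (ok && !dry_run)
		printf("claimed item %d\n", item);
	return (ok);
}

static bool
claim_all(bool dry_run)
{
	for (int i = 0; i < NITEMS; i++) {
		if (!claim_one(i, dry_run))
			return (false);
	}
	return (true);
}

int
main(void)
{
	/* Dry run first; only commit if every item passed. */
	if (claim_all(true) && claim_all(false))
		printf("all items claimed\n");
	else
		printf("claim refused, nothing was modified\n");
	return (0);
}
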
- */ -static int -metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) -{ - uint64_t vdev = DVA_GET_VDEV(dva); - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t size = DVA_GET_ASIZE(dva); - vdev_t *vd; - - if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { - return (SET_ERROR(ENXIO)); - } - - ASSERT(DVA_IS_VALID(dva)); - - if (DVA_GET_GANG(dva)) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - return (metaslab_claim_impl(vd, offset, size, txg)); -} - -int -metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, - zio_alloc_list_t *zal, zio_t *zio, int allocator) -{ - dva_t *dva = bp->blk_dva; - dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; - int error = 0; - - ASSERT(bp->blk_birth == 0); - ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); - - spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - - if (mc->mc_rotor == NULL) { /* no vdevs in this class */ - spa_config_exit(spa, SCL_ALLOC, FTAG); - return (SET_ERROR(ENOSPC)); - } - - ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); - ASSERT(BP_GET_NDVAS(bp) == 0); - ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); - ASSERT3P(zal, !=, NULL); - - for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags, zal, allocator); - if (error != 0) { - for (d--; d >= 0; d--) { - metaslab_unalloc_dva(spa, &dva[d], txg); - metaslab_group_alloc_decrement(spa, - DVA_GET_VDEV(&dva[d]), zio, flags, - allocator, B_FALSE); - bzero(&dva[d], sizeof (dva_t)); - } - spa_config_exit(spa, SCL_ALLOC, FTAG); - return (error); - } else { - /* - * Update the metaslab group's queue depth - * based on the newly allocated dva. - */ - metaslab_group_alloc_increment(spa, - DVA_GET_VDEV(&dva[d]), zio, flags, allocator); - } - - } - ASSERT(error == 0); - ASSERT(BP_GET_NDVAS(bp) == ndvas); - - spa_config_exit(spa, SCL_ALLOC, FTAG); - - BP_SET_BIRTH(bp, txg, txg); - - return (0); -} - -void -metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); - - /* - * If we have a checkpoint for the pool we need to make sure that - * the blocks that we free that are part of the checkpoint won't be - * reused until the checkpoint is discarded or we revert to it. - * - * The checkpoint flag is passed down the metaslab_free code path - * and is set whenever we want to add a block to the checkpoint's - * accounting. That is, we "checkpoint" blocks that existed at the - * time the checkpoint was created and are therefore referenced by - * the checkpointed uberblock. - * - * Note that, we don't checkpoint any blocks if the current - * syncing txg <= spa_checkpoint_txg. We want these frees to sync - * normally as they will be referenced by the checkpointed uberblock. - */ - boolean_t checkpoint = B_FALSE; - if (bp->blk_birth <= spa->spa_checkpoint_txg && - spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { - /* - * At this point, if the block is part of the checkpoint - * there is no way it was created in the current txg. 
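
The condition just shown decides whether a freed block is charged to the pool checkpoint. In isolation, and with hypothetical names, it amounts to:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A freed block is accounted to the checkpoint only if it was born no
 * later than the checkpoint txg and the current sync is already past
 * the checkpoint txg; otherwise it frees through the normal path.
 */
static bool
free_goes_to_checkpoint(uint64_t blk_birth, uint64_t checkpoint_txg,
    uint64_t syncing_txg)
{
	return (blk_birth <= checkpoint_txg && syncing_txg > checkpoint_txg);
}

int
main(void)
{
	/* Checkpoint taken at txg 100, currently syncing txg 120. */
	printf("block born at txg  90: %d\n",
	    free_goes_to_checkpoint(90, 100, 120));	/* 1: checkpointed */
	printf("block born at txg 110: %d\n",
	    free_goes_to_checkpoint(110, 100, 120));	/* 0: frees normally */
	return (0);
}
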
- */ - ASSERT(!now); - ASSERT3U(spa_syncing_txg(spa), ==, txg); - checkpoint = B_TRUE; - } - - spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); - - for (int d = 0; d < ndvas; d++) { - if (now) { - metaslab_unalloc_dva(spa, &dva[d], txg); - } else { - ASSERT3U(txg, ==, spa_syncing_txg(spa)); - metaslab_free_dva(spa, &dva[d], checkpoint); - } - } - - spa_config_exit(spa, SCL_FREE, FTAG); -} - -int -metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - int error = 0; - - ASSERT(!BP_IS_HOLE(bp)); - - if (txg != 0) { - /* - * First do a dry run to make sure all DVAs are claimable, - * so we don't have to unwind from partial failures below. - */ - if ((error = metaslab_claim(spa, bp, 0)) != 0) - return (error); - } - - spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - - for (int d = 0; d < ndvas; d++) { - error = metaslab_claim_dva(spa, &dva[d], txg); - if (error != 0) - break; - } - - spa_config_exit(spa, SCL_ALLOC, FTAG); - - ASSERT(error == 0 || txg == 0); - - return (error); -} - -/* ARGSUSED */ -static void -metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - if (vd->vdev_ops == &vdev_indirect_ops) - return; - - metaslab_check_free_impl(vd, offset, size); -} - -static void -metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) -{ - metaslab_t *msp; - spa_t *spa = vd->vdev_spa; - - if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) - return; - - if (vd->vdev_ops->vdev_op_remap != NULL) { - vd->vdev_ops->vdev_op_remap(vd, offset, size, - metaslab_check_free_impl_cb, NULL); - return; - } - - ASSERT(vdev_is_concrete(vd)); - ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); - ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); - - msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - - mutex_enter(&msp->ms_lock); - if (msp->ms_loaded) { - range_tree_verify_not_present(msp->ms_allocatable, - offset, size); - } - - range_tree_verify_not_present(msp->ms_freeing, offset, size); - range_tree_verify_not_present(msp->ms_checkpointing, offset, size); - range_tree_verify_not_present(msp->ms_freed, offset, size); - for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify_not_present(msp->ms_defer[j], offset, size); - mutex_exit(&msp->ms_lock); -} - -void -metaslab_check_free(spa_t *spa, const blkptr_t *bp) -{ - if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) - return; - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int i = 0; i < BP_GET_NDVAS(bp); i++) { - uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); - vdev_t *vd = vdev_lookup_top(spa, vdev); - uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); - uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); - - if (DVA_GET_GANG(&bp->blk_dva[i])) - size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - - ASSERT3P(vd, !=, NULL); - - metaslab_check_free_impl(vd, offset, size); - } - spa_config_exit(spa, SCL_VDEV, FTAG); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c +++ /dev/null @@ -1,750 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2017 by Lawrence Livermore National Security, LLC. - * Copyright 2019 Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Multi-Modifier Protection (MMP) attempts to prevent a user from importing - * or opening a pool on more than one host at a time. In particular, it - * prevents "zpool import -f" on a host from succeeding while the pool is - * already imported on another host. There are many other ways in which a - * device could be used by two hosts for different purposes at the same time - * resulting in pool damage. This implementation does not attempt to detect - * those cases. - * - * MMP operates by ensuring there are frequent visible changes on disk (a - * "heartbeat") at all times. And by altering the import process to check - * for these changes and failing the import when they are detected. This - * functionality is enabled by setting the 'multihost' pool property to on. - * - * Uberblocks written by the txg_sync thread always go into the first - * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP. - * They are used to hold uberblocks which are exactly the same as the last - * synced uberblock except that the ub_timestamp and mmp_config are frequently - * updated. Like all other uberblocks, the slot is written with an embedded - * checksum, and slots with invalid checksums are ignored. This provides the - * "heartbeat", with no risk of overwriting good uberblocks that must be - * preserved, e.g. previous txgs and associated block pointers. - * - * Three optional fields are added to uberblock structure; ub_mmp_magic, - * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell - * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells - * the importing host the settings of zfs_multihost_interval and - * zfs_multihost_fail_intervals on the host which last had (or currently has) - * the pool imported. These determine how long a host must wait to detect - * activity in the pool, before concluding the pool is not in use. The - * mmp_delay field is a decaying average of the amount of time between - * completion of successive MMP writes, in nanoseconds. It indicates whether - * MMP is enabled. - * - * During import an activity test may now be performed to determine if - * the pool is in use. The activity test is typically required if the - * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is - * POOL_STATE_ACTIVE, and the pool is not a root pool. - * - * The activity test finds the "best" uberblock (highest txg, timestamp, and, if - * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits - * some time, and finds the "best" uberblock again. If any of the mentioned - * fields have different values in the newly read uberblock, the pool is in use - * by another host and the import fails. 
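
The comparison at the heart of the activity test can be shown in isolation: take the best uberblock, wait, take it again, and treat any change in the compared fields as proof that another host is writing. A simplified standalone sketch with hypothetical field names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Only the fields the activity test compares, in a hypothetical struct. */
struct best_ub {
	uint64_t txg;
	uint64_t timestamp;
	uint64_t mmp_seq;	/* 0 if ub_mmp_magic was not valid */
};

/*
 * Core of the activity test: any change between the snapshots taken
 * before and after the wait means another host is using the pool.
 */
static bool
pool_is_active(const struct best_ub *before, const struct best_ub *after)
{
	return (before->txg != after->txg ||
	    before->timestamp != after->timestamp ||
	    before->mmp_seq != after->mmp_seq);
}

int
main(void)
{
	struct best_ub t0 = { 500, 1000, 7 };
	struct best_ub idle = { 500, 1000, 7 };
	struct best_ub busy = { 500, 1003, 9 };	/* heartbeat advanced */

	printf("idle pool active?  %d\n", pool_is_active(&t0, &idle));
	printf("busy pool active?  %d\n", pool_is_active(&t0, &busy));
	return (0);
}
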
In order to assure the accuracy of the - * activity test, the default values result in an activity test duration of 20x - * the mmp write interval. - * - * The duration of the "zpool import" activity test depends on the information - * available in the "best" uberblock: - * - * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0: - * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2 - * - * In this case, a weak guarantee is provided. Since the host which last had - * the pool imported will suspend the pool if no mmp writes land within - * fail_intervals * multihost_interval ms, the absence of writes during that - * time means either the pool is not imported, or it is imported but the pool - * is suspended and no further writes will occur. - * - * Note that resuming the suspended pool on the remote host would invalidate - * this guarantee, and so it is not allowed. - * - * The factor of 2 provides a conservative safety factor and derives from - * MMP_IMPORT_SAFETY_FACTOR; - * - * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0: - * (ub_mmp_config.multihost_interval + ub_mmp_delay) * - * zfs_multihost_import_intervals - * - * In this case no guarantee can provided. However, as long as some devices - * are healthy and connected, it is likely that at least one write will land - * within (multihost_interval + mmp_delay) because multihost_interval is - * enough time for a write to be attempted to each leaf vdev, and mmp_delay - * is enough for one to land, based on past delays. Multiplying by - * zfs_multihost_import_intervals provides a conservative safety factor. - * - * 3) If uberblock was written by zfs-0.7: - * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals - * - * The same logic as case #2 applies, but we do not know remote tunables. - * - * We use the local value for zfs_multihost_interval because the original MMP - * did not record this value in the uberblock. - * - * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host - * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect - * that. We will have waited enough time for zfs_multihost_import_intervals - * writes to be issued and all but one to land. - * - * single device pool example delays - * - * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay - * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay - * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval, - * no I/O delay - * 100 device pool example delays - * - * import_delay = (1 + .01) * 20 = 20s #defaults, no I/O delay - * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay - * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval, - * no I/O delay - * - * 4) Otherwise, this uberblock was written by a pre-MMP zfs: - * zfs_multihost_import_intervals * zfs_multihost_interval - * - * In this case local tunables are used. By default this product = 10s, long - * enough for a pool with any activity at all to write at least one - * uberblock. No guarantee can be provided. - * - * Additionally, the duration is then extended by a random 25% to attempt to to - * detect simultaneous imports. For example, if both partner hosts are rebooted - * at the same time and automatically attempt to import the pool. - */ - -/* - * Used to control the frequency of mmp writes which are performed when the - * 'multihost' pool property is on. This is one factor used to determine the - * length of the activity check during import. 
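
The example delays listed above can be reproduced directly from the case 2/3 formula. A small standalone sketch, using seconds instead of nanoseconds and hypothetical names:

#include <stdio.h>

/*
 * Simplified activity-test duration for case 2/3 above (fail_intervals
 * is 0, or the uberblock was written by zfs-0.7):
 *
 *     import_delay = (multihost_interval + mmp_delay) * import_intervals
 */
static double
import_delay(double multihost_interval, double mmp_delay,
    unsigned import_intervals)
{
	return ((multihost_interval + mmp_delay) * import_intervals);
}

int
main(void)
{
	/* The single-device examples from the comment above (20 intervals). */
	printf("defaults, no I/O delay:  %.0fs\n", import_delay(1, 1, 20));
	printf("defaults, 10s I/O delay: %.0fs\n", import_delay(1, 10, 20));
	/* One leaf: mmp_delay is at least the full 10s interval. */
	printf("10s interval, no delay:  %.0fs\n", import_delay(10, 10, 20));
	return (0);
}
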
- * - * On average an mmp write will be issued for each leaf vdev every - * zfs_multihost_interval milliseconds. In practice, the observed period can - * vary with the I/O load and this observed value is the ub_mmp_delay which is - * stored in the uberblock. The minimum allowed value is 100 ms. - */ -ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; -#ifdef __FreeBSD__ -SYSCTL_DECL(_vfs_zfs); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, multihost_interval, CTLFLAG_RWTUN, - &zfs_multihost_interval, 0, "Interval between MMP writes, milliseconds"); -#endif - -/* - * Used to control the duration of the activity test on import. Smaller values - * of zfs_multihost_import_intervals will reduce the import time but increase - * the risk of failing to detect an active pool. The total activity check time - * is never allowed to drop below one second. A value of 0 is ignored and - * treated as if it was set to 1. - */ -uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS; -#ifdef __FreeBSD__ -SYSCTL_UINT(_vfs_zfs, OID_AUTO, multihost_import_intervals, CTLFLAG_RWTUN, - &zfs_multihost_import_intervals, 0, - "MMP activity check period for pool import, " - "in units of multihost_interval"); -#endif - -/* - * Controls the behavior of the pool when mmp write failures or delays are - * detected. - * - * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are - * ignored. The failures will still be reported to the ZED which depending on - * its configuration may take action such as suspending the pool or taking a - * device offline. - * - * When zfs_multihost_fail_intervals > 0, the pool will be suspended if - * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass - * without a successful mmp write. This guarantees the activity test will see - * mmp writes if the pool is imported. A value of 1 is ignored and treated as - * if it was set to 2, because a single leaf vdev pool will issue a write once - * per multihost_interval and thus any variation in latency would cause the - * pool to be suspended. - */ -uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS; -#ifdef __FreeBSD__ -SYSCTL_UINT(_vfs_zfs, OID_AUTO, multihost_fail_intervals, CTLFLAG_RWTUN, - &zfs_multihost_fail_intervals, 0, - "How long to tolerate MMP write failures before suspending a pool, " - "in units of multihost_interval"); -#endif - -char *mmp_tag = "mmp_write_uberblock"; -static void mmp_thread(void *arg); - -void -mmp_init(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL); - mmp->mmp_kstat_id = 1; - - /* - * mmp_write_done() calculates mmp_delay based on prior mmp_delay and - * the elapsed time since the last write. For the first mmp write, - * there is no "last write", so we start with fake non-zero values. 
- */ - mmp->mmp_last_write = gethrtime(); - mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); -} - -void -mmp_fini(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_destroy(&mmp->mmp_thread_lock); - cv_destroy(&mmp->mmp_thread_cv); - mutex_destroy(&mmp->mmp_io_lock); -} - -static void -mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr) -{ - CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG); - mutex_enter(&mmp->mmp_thread_lock); -} - -static void -mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr) -{ - ASSERT(*mpp != NULL); - *mpp = NULL; - cv_broadcast(&mmp->mmp_thread_cv); - CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */ - thread_exit(); -} - -void -mmp_thread_start(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - if (spa_writeable(spa)) { - mutex_enter(&mmp->mmp_thread_lock); - if (!mmp->mmp_thread) { - mmp->mmp_thread = thread_create(NULL, 0, mmp_thread, - spa, 0, &p0, TS_RUN, minclsyspri); - zfs_dbgmsg("MMP thread started pool '%s' " - "gethrtime %llu", spa_name(spa), gethrtime()); - } - mutex_exit(&mmp->mmp_thread_lock); - } -} - -void -mmp_thread_stop(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_enter(&mmp->mmp_thread_lock); - mmp->mmp_thread_exiting = 1; - cv_broadcast(&mmp->mmp_thread_cv); - - while (mmp->mmp_thread) { - cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock); - } - mutex_exit(&mmp->mmp_thread_lock); - zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu", - spa_name(spa), gethrtime()); - - ASSERT(mmp->mmp_thread == NULL); - mmp->mmp_thread_exiting = 0; -} - -typedef enum mmp_vdev_state_flag { - MMP_FAIL_NOT_WRITABLE = (1 << 0), - MMP_FAIL_WRITE_PENDING = (1 << 1), -} mmp_vdev_state_flag_t; - -/* - * Find a leaf vdev to write an MMP block to. It must not have an outstanding - * mmp write (if so a new write will also likely block). If there is no usable - * leaf, a nonzero error value is returned. The error value returned is a bit - * field. - * - * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an - * outstanding MMP write. - * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable. - */ - -static int -mmp_next_leaf(spa_t *spa) -{ - vdev_t *leaf; - vdev_t *starting_leaf; - int fail_mask = 0; - - ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock)); - ASSERT(spa_config_held(spa, SCL_STATE, RW_READER)); - ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE); - ASSERT(!list_is_empty(&spa->spa_leaf_list)); - - if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) { - spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list); - spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen; - } - - leaf = spa->spa_mmp.mmp_last_leaf; - if (leaf == NULL) - leaf = list_head(&spa->spa_leaf_list); - starting_leaf = leaf; - - do { - leaf = list_next(&spa->spa_leaf_list, leaf); - if (leaf == NULL) - leaf = list_head(&spa->spa_leaf_list); - - if (!vdev_writeable(leaf)) { - fail_mask |= MMP_FAIL_NOT_WRITABLE; - } else if (leaf->vdev_mmp_pending != 0) { - fail_mask |= MMP_FAIL_WRITE_PENDING; - } else { - spa->spa_mmp.mmp_last_leaf = leaf; - return (0); - } - } while (leaf != starting_leaf); - - ASSERT(fail_mask); - - return (fail_mask); -} - -/* - * MMP writes are issued on a fixed schedule, but may complete at variable, - * much longer, intervals. The mmp_delay captures long periods between - * successful writes for any reason, including disk latency, scheduling delays, - * etc. 
- * - * The mmp_delay is usually calculated as a decaying average, but if the latest - * delay is higher we do not average it, so that we do not hide sudden spikes - * which the importing host must wait for. - * - * If writes are occurring frequently, such as due to a high rate of txg syncs, - * the mmp_delay could become very small. Since those short delays depend on - * activity we cannot count on, we never allow mmp_delay to get lower than rate - * expected if only mmp_thread writes occur. - * - * If an mmp write was skipped or fails, and we have already waited longer than - * mmp_delay, we need to update it so the next write reflects the longer delay. - * - * Do not set mmp_delay if the multihost property is not on, so as not to - * trigger an activity check on import. - */ -static void -mmp_delay_update(spa_t *spa, boolean_t write_completed) -{ - mmp_thread_t *mts = &spa->spa_mmp; - hrtime_t delay = gethrtime() - mts->mmp_last_write; - - ASSERT(MUTEX_HELD(&mts->mmp_io_lock)); - - if (spa_multihost(spa) == B_FALSE) { - mts->mmp_delay = 0; - return; - } - - if (delay > mts->mmp_delay) - mts->mmp_delay = delay; - - if (write_completed == B_FALSE) - return; - - mts->mmp_last_write = gethrtime(); - - /* - * strictly less than, in case delay was changed above. - */ - if (delay < mts->mmp_delay) { - hrtime_t min_delay = - MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) / - MAX(1, vdev_count_leaves(spa)); - mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128), - min_delay); - } -} - -static void -mmp_write_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - vdev_t *vd = zio->io_vd; - mmp_thread_t *mts = zio->io_private; - - mutex_enter(&mts->mmp_io_lock); - uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id; - hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending; - - mmp_delay_update(spa, (zio->io_error == 0)); - - vd->vdev_mmp_pending = 0; - vd->vdev_mmp_kstat_id = 0; - - mutex_exit(&mts->mmp_io_lock); - spa_config_exit(spa, SCL_STATE, mmp_tag); - - abd_free(zio->io_abd); -} - -/* - * When the uberblock on-disk is updated by a spa_sync, - * creating a new "best" uberblock, update the one stored - * in the mmp thread state, used for mmp writes. - */ -void -mmp_update_uberblock(spa_t *spa, uberblock_t *ub) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_enter(&mmp->mmp_io_lock); - mmp->mmp_ub = *ub; - mmp->mmp_seq = 1; - mmp->mmp_ub.ub_timestamp = gethrestime_sec(); - mmp_delay_update(spa, B_TRUE); - mutex_exit(&mmp->mmp_io_lock); -} - -/* - * Choose a random vdev, label, and MMP block, and write over it - * with a copy of the last-synced uberblock, whose timestamp - * has been updated to reflect that the pool is in use. - */ -static void -mmp_write_uberblock(spa_t *spa) -{ - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - mmp_thread_t *mmp = &spa->spa_mmp; - uberblock_t *ub; - vdev_t *vd = NULL; - int label, error; - uint64_t offset; - - hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER); - lock_acquire_time = gethrtime() - lock_acquire_time; - if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) - zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " - "gethrtime %llu", spa_name(spa), lock_acquire_time, - gethrtime()); - - mutex_enter(&mmp->mmp_io_lock); - - error = mmp_next_leaf(spa); - - /* - * spa_mmp_history has two types of entries: - * Issued MMP write: records time issued, error status, etc. - * Skipped MMP write: an MMP write could not be issued because no - * suitable leaf vdev was available. 
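
The update performed by mmp_delay_update() above can be modelled on its own: spikes are adopted immediately, shorter delays are blended in with a 1/128 weight, and the result is floored at the per-leaf write period. A standalone sketch with hypothetical names:

#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/*
 * Decaying average of the observed write-to-write delay: a longer delay
 * replaces the average outright, a shorter one decays in slowly, and
 * the average never drops below min_delay (the per-leaf write period).
 */
static uint64_t
delay_update(uint64_t mmp_delay, uint64_t observed, uint64_t min_delay)
{
	if (observed > mmp_delay)
		return (observed);
	return (MAX((observed + mmp_delay * 127) / 128, min_delay));
}

int
main(void)
{
	uint64_t delay = 1000;		/* arbitrary units */
	uint64_t min_delay = 250;

	delay = delay_update(delay, 5000, min_delay);	/* spike: adopted */
	printf("after spike:      %llu\n", (unsigned long long)delay);

	for (int i = 0; i < 1000; i++)	/* fast writes: decays toward 300 */
		delay = delay_update(delay, 300, min_delay);
	printf("after fast burst: %llu\n", (unsigned long long)delay);
	return (0);
}
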
See comment above struct - * spa_mmp_history for details. - */ - - if (error) { - mmp_delay_update(spa, B_FALSE); - if (mmp->mmp_skip_error == error) { - /* - * ZoL porting note: the following is TBD - * spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1); - */ - } else { - mmp->mmp_skip_error = error; - /* - * ZoL porting note: the following is TBD - * spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg, - * gethrestime_sec(), mmp->mmp_delay, NULL, 0, - * mmp->mmp_kstat_id++, error); - */ - zfs_dbgmsg("MMP error choosing leaf pool '%s' " - "gethrtime %llu fail_mask %#x", spa_name(spa), - gethrtime(), error); - } - mutex_exit(&mmp->mmp_io_lock); - spa_config_exit(spa, SCL_STATE, mmp_tag); - return; - } - - vd = spa->spa_mmp.mmp_last_leaf; - if (mmp->mmp_skip_error != 0) { - mmp->mmp_skip_error = 0; - zfs_dbgmsg("MMP write after skipping due to unavailable " - "leaves, pool '%s' gethrtime %llu leaf %#llu", - spa_name(spa), gethrtime(), vd->vdev_guid); - } - - if (mmp->mmp_zio_root == NULL) - mmp->mmp_zio_root = zio_root(spa, NULL, NULL, - flags | ZIO_FLAG_GODFATHER); - - if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) { - /* - * Want to reset mmp_seq when timestamp advances because after - * an mmp_seq wrap new values will not be chosen by - * uberblock_compare() as the "best". - */ - mmp->mmp_ub.ub_timestamp = gethrestime_sec(); - mmp->mmp_seq = 1; - } - - ub = &mmp->mmp_ub; - ub->ub_mmp_magic = MMP_MAGIC; - ub->ub_mmp_delay = mmp->mmp_delay; - ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) | - MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) | - MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK( - zfs_multihost_fail_intervals)); - vd->vdev_mmp_pending = gethrtime(); - vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id; - - zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags); - abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); - abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); - - mmp->mmp_seq++; - mmp->mmp_kstat_id++; - mutex_exit(&mmp->mmp_io_lock); - - offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) - - MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL)); - - label = spa_get_random(VDEV_LABELS); - vdev_label_write(zio, vd, label, ub_abd, offset, - VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp, - flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * ZoL porting note: the following is TBD - * (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp, - * ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0); - */ - - zio_nowait(zio); -} - -static void -mmp_thread(void *arg) -{ - spa_t *spa = (spa_t *)arg; - mmp_thread_t *mmp = &spa->spa_mmp; - boolean_t suspended = spa_suspended(spa); - boolean_t multihost = spa_multihost(spa); - uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( - zfs_multihost_interval)); - uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK( - zfs_multihost_fail_intervals); - hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval; - boolean_t last_spa_suspended = suspended; - boolean_t last_spa_multihost = multihost; - uint64_t last_mmp_interval = mmp_interval; - uint32_t last_mmp_fail_intervals = mmp_fail_intervals; - hrtime_t last_mmp_fail_ns = mmp_fail_ns; - callb_cpr_t cpr; - int skip_wait = 0; - - mmp_thread_enter(mmp, &cpr); - - while (!mmp->mmp_thread_exiting) { - hrtime_t next_time = gethrtime() + - MSEC2NSEC(MMP_DEFAULT_INTERVAL); - int leaves = MAX(vdev_count_leaves(spa), 1); - - /* Detect changes in tunables or state */ - - last_spa_suspended = suspended; - last_spa_multihost = 
multihost; - suspended = spa_suspended(spa); - multihost = spa_multihost(spa); - - last_mmp_interval = mmp_interval; - last_mmp_fail_intervals = mmp_fail_intervals; - last_mmp_fail_ns = mmp_fail_ns; - mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( - zfs_multihost_interval)); - mmp_fail_intervals = MMP_FAIL_INTVS_OK( - zfs_multihost_fail_intervals); - - /* Smooth so pool is not suspended when reducing tunables */ - if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) { - mmp_fail_ns = (mmp_fail_ns * 31 + - mmp_fail_intervals * mmp_interval) / 32; - } else { - mmp_fail_ns = mmp_fail_intervals * - mmp_interval; - } - - if (mmp_interval != last_mmp_interval || - mmp_fail_intervals != last_mmp_fail_intervals) { - /* - * We want other hosts to see new tunables as quickly as - * possible. Write out at higher frequency than usual. - */ - skip_wait += leaves; - } - - if (multihost) - next_time = gethrtime() + mmp_interval / leaves; - - if (mmp_fail_ns != last_mmp_fail_ns) { - zfs_dbgmsg("MMP interval change pool '%s' " - "gethrtime %llu last_mmp_interval %llu " - "mmp_interval %llu last_mmp_fail_intervals %u " - "mmp_fail_intervals %u mmp_fail_ns %llu " - "skip_wait %d leaves %d next_time %llu", - spa_name(spa), gethrtime(), last_mmp_interval, - mmp_interval, last_mmp_fail_intervals, - mmp_fail_intervals, mmp_fail_ns, skip_wait, leaves, - next_time); - } - - /* - * MMP off => on, or suspended => !suspended: - * No writes occurred recently. Update mmp_last_write to give - * us some time to try. - */ - if ((!last_spa_multihost && multihost) || - (last_spa_suspended && !suspended)) { - zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu " - "last_spa_multihost %u multihost %u " - "last_spa_suspended %u suspended %u", - spa_name(spa), last_spa_multihost, multihost, - last_spa_suspended, suspended); - mutex_enter(&mmp->mmp_io_lock); - mmp->mmp_last_write = gethrtime(); - mmp->mmp_delay = mmp_interval; - mutex_exit(&mmp->mmp_io_lock); - } - - /* - * MMP on => off: - * mmp_delay == 0 tells importing node to skip activity check. - */ - if (last_spa_multihost && !multihost) { - mutex_enter(&mmp->mmp_io_lock); - mmp->mmp_delay = 0; - mutex_exit(&mmp->mmp_io_lock); - } - - /* - * Suspend the pool if no MMP write has succeeded in over - * mmp_interval * mmp_fail_intervals nanoseconds. - */ - if (multihost && !suspended && mmp_fail_intervals && - (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { - zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " - "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", - spa_name(spa), (u_longlong_t)gethrtime(), - (u_longlong_t)mmp->mmp_last_write, - (u_longlong_t)mmp_interval, - (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); - cmn_err(CE_WARN, "MMP writes to pool '%s' have not " - "succeeded in over %llu ms; suspending pool. 
" - "Hrtime %llu", - spa_name(spa), - NSEC2MSEC(gethrtime() - mmp->mmp_last_write), - gethrtime()); - zio_suspend(spa, NULL, ZIO_SUSPEND_MMP); - } - - if (multihost && !suspended) - mmp_write_uberblock(spa); - - if (skip_wait > 0) { - next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) / - leaves; - skip_wait--; - } - - CALLB_CPR_SAFE_BEGIN(&cpr); -#if defined(illumos) - (void) cv_timedwait_sig_hrtime(&mmp->mmp_thread_cv, - &mmp->mmp_thread_lock, next_time); -#elif defined(_KERNEL) - (void) cv_timedwait_sig_sbt(&mmp->mmp_thread_cv, - &mmp->mmp_thread_lock, nstosbt(next_time), - 100 * SBT_1US, C_ABSOLUTE); -#else - (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv, - &mmp->mmp_thread_lock, next_time, USEC2NSEC(100), - CALLOUT_FLAG_ABSOLUTE); -#endif - CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); - } - - /* Outstanding writes are allowed to complete. */ - if (mmp->mmp_zio_root) - zio_wait(mmp->mmp_zio_root); - - mmp->mmp_zio_root = NULL; - mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); -} - -/* - * Signal the MMP thread to wake it, when it is sleeping on - * its cv. Used when some module parameter has changed and - * we want the thread to know about it. - * Only signal if the pool is active and mmp thread is - * running, otherwise there is no thread to wake. - */ -static void -mmp_signal_thread(spa_t *spa) -{ - mmp_thread_t *mmp = &spa->spa_mmp; - - mutex_enter(&mmp->mmp_thread_lock); - if (mmp->mmp_thread) - cv_broadcast(&mmp->mmp_thread_cv); - mutex_exit(&mmp->mmp_thread_lock); -} - -void -mmp_signal_all_threads(void) -{ - spa_t *spa = NULL; - - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa))) { - if (spa->spa_state == POOL_STATE_ACTIVE) - mmp_signal_thread(spa); - } - mutex_exit(&spa_namespace_lock); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#include -#include - -/* needed for spa_get_random() */ -#include - -/* - * This overrides the number of sublists in each multilist_t, which defaults - * to the number of CPUs in the system (see multilist_create()). - */ -int zfs_multilist_num_sublists = 0; - -/* - * Given the object contained on the list, return a pointer to the - * object's multilist_node_t structure it contains. - */ -static multilist_node_t * -multilist_d2l(multilist_t *ml, void *obj) -{ - return ((multilist_node_t *)((char *)obj + ml->ml_offset)); -} - -/* - * Initialize a new mutlilist using the parameters specified. - * - * - 'size' denotes the size of the structure containing the - * multilist_node_t. - * - 'offset' denotes the byte offset of the mutlilist_node_t within - * the structure that contains it. - * - 'num' specifies the number of internal sublists to create. 
- * - 'index_func' is used to determine which sublist to insert into - * when the multilist_insert() function is called; as well as which - * sublist to remove from when multilist_remove() is called. The - * requirements this function must meet, are the following: - * - * - It must always return the same value when called on the same - * object (to ensure the object is removed from the list it was - * inserted into). - * - * - It must return a value in the range [0, number of sublists). - * The multilist_get_num_sublists() function may be used to - * determine the number of sublists in the multilist. - * - * Also, in order to reduce internal contention between the sublists - * during insertion and removal, this function should choose evenly - * between all available sublists when inserting. This isn't a hard - * requirement, but a general rule of thumb in order to garner the - * best multi-threaded performance out of the data structure. - */ -static multilist_t * -multilist_create_impl(size_t size, size_t offset, - unsigned int num, multilist_sublist_index_func_t *index_func) -{ - ASSERT3U(size, >, 0); - ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); - ASSERT3U(num, >, 0); - ASSERT3P(index_func, !=, NULL); - - multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP); - ml->ml_offset = offset; - ml->ml_num_sublists = num; - ml->ml_index_func = index_func; - - ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * - ml->ml_num_sublists, KM_SLEEP); - - ASSERT3P(ml->ml_sublists, !=, NULL); - - for (int i = 0; i < ml->ml_num_sublists; i++) { - multilist_sublist_t *mls = &ml->ml_sublists[i]; - mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&mls->mls_list, size, offset); - } - return (ml); -} - -/* - * Allocate a new multilist, using the default number of sublists - * (the number of CPUs, or at least 4, or the tunable - * zfs_multilist_num_sublists). - */ -multilist_t * -multilist_create(size_t size, size_t offset, - multilist_sublist_index_func_t *index_func) -{ - int num_sublists; - - if (zfs_multilist_num_sublists > 0) { - num_sublists = zfs_multilist_num_sublists; - } else { - num_sublists = MAX(max_ncpus, 4); - } - - return (multilist_create_impl(size, offset, num_sublists, index_func)); -} - -/* - * Destroy the given multilist object, and free up any memory it holds. - */ -void -multilist_destroy(multilist_t *ml) -{ - ASSERT(multilist_is_empty(ml)); - - for (int i = 0; i < ml->ml_num_sublists; i++) { - multilist_sublist_t *mls = &ml->ml_sublists[i]; - - ASSERT(list_is_empty(&mls->mls_list)); - - list_destroy(&mls->mls_list); - mutex_destroy(&mls->mls_lock); - } - - ASSERT3P(ml->ml_sublists, !=, NULL); - kmem_free(ml->ml_sublists, - sizeof (multilist_sublist_t) * ml->ml_num_sublists); - - ml->ml_num_sublists = 0; - ml->ml_offset = 0; - kmem_free(ml, sizeof (multilist_t)); -} - -/* - * Insert the given object into the multilist. - * - * This function will insert the object specified into the sublist - * determined using the function given at multilist creation time. - * - * The sublist locks are automatically acquired if not already held, to - * ensure consistency when inserting and removing from multiple threads. 
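
The requirements listed above for 'index_func' (deterministic for a given object, always in range, ideally evenly spread) are easiest to see with a toy example. The sketch below is hypothetical and much simpler than the real callers, which typically hash a field of the object:

#include <stdint.h>
#include <stdio.h>

struct obj {
	uint64_t id;
};

/*
 * Deterministic for a given object, always in [0, num_sublists), and
 * evenly spread as long as the ids are.
 */
static unsigned int
example_index_func(const struct obj *o, unsigned int num_sublists)
{
	return ((unsigned int)(o->id % num_sublists));
}

int
main(void)
{
	for (uint64_t i = 0; i < 8; i++) {
		struct obj o = { .id = i };

		printf("obj %llu -> sublist %u\n", (unsigned long long)i,
		    example_index_func(&o, 4));
	}
	return (0);
}
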
- */ -void -multilist_insert(multilist_t *ml, void *obj) -{ - unsigned int sublist_idx = ml->ml_index_func(ml, obj); - multilist_sublist_t *mls; - boolean_t need_lock; - - DTRACE_PROBE3(multilist__insert, multilist_t *, ml, - unsigned int, sublist_idx, void *, obj); - - ASSERT3U(sublist_idx, <, ml->ml_num_sublists); - - mls = &ml->ml_sublists[sublist_idx]; - - /* - * Note: Callers may already hold the sublist lock by calling - * multilist_sublist_lock(). Here we rely on MUTEX_HELD() - * returning TRUE if and only if the current thread holds the - * lock. While it's a little ugly to make the lock recursive in - * this way, it works and allows the calling code to be much - * simpler -- otherwise it would have to pass around a flag - * indicating that it already has the lock. - */ - need_lock = !MUTEX_HELD(&mls->mls_lock); - - if (need_lock) - mutex_enter(&mls->mls_lock); - - ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); - - multilist_sublist_insert_head(mls, obj); - - if (need_lock) - mutex_exit(&mls->mls_lock); -} - -/* - * Remove the given object from the multilist. - * - * This function will remove the object specified from the sublist - * determined using the function given at multilist creation time. - * - * The necessary sublist locks are automatically acquired, to ensure - * consistency when inserting and removing from multiple threads. - */ -void -multilist_remove(multilist_t *ml, void *obj) -{ - unsigned int sublist_idx = ml->ml_index_func(ml, obj); - multilist_sublist_t *mls; - boolean_t need_lock; - - DTRACE_PROBE3(multilist__remove, multilist_t *, ml, - unsigned int, sublist_idx, void *, obj); - - ASSERT3U(sublist_idx, <, ml->ml_num_sublists); - - mls = &ml->ml_sublists[sublist_idx]; - /* See comment in multilist_insert(). */ - need_lock = !MUTEX_HELD(&mls->mls_lock); - - if (need_lock) - mutex_enter(&mls->mls_lock); - - ASSERT(multilist_link_active(multilist_d2l(ml, obj))); - - multilist_sublist_remove(mls, obj); - - if (need_lock) - mutex_exit(&mls->mls_lock); -} - -/* - * Check to see if this multilist object is empty. - * - * This will return TRUE if it finds all of the sublists of this - * multilist to be empty, and FALSE otherwise. Each sublist lock will be - * automatically acquired as necessary. - * - * If concurrent insertions and removals are occurring, the semantics - * of this function become a little fuzzy. Instead of locking all - * sublists for the entire call time of the function, each sublist is - * only locked as it is individually checked for emptiness. Thus, it's - * possible for this function to return TRUE with non-empty sublists at - * the time the function returns. This would be due to another thread - * inserting into a given sublist, after that specific sublist was check - * and deemed empty, but before all sublists have been checked. - */ -int -multilist_is_empty(multilist_t *ml) -{ - for (int i = 0; i < ml->ml_num_sublists; i++) { - multilist_sublist_t *mls = &ml->ml_sublists[i]; - /* See comment in multilist_insert(). 
*/ - boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock); - - if (need_lock) - mutex_enter(&mls->mls_lock); - - if (!list_is_empty(&mls->mls_list)) { - if (need_lock) - mutex_exit(&mls->mls_lock); - - return (FALSE); - } - - if (need_lock) - mutex_exit(&mls->mls_lock); - } - - return (TRUE); -} - -/* Return the number of sublists composing this multilist */ -unsigned int -multilist_get_num_sublists(multilist_t *ml) -{ - return (ml->ml_num_sublists); -} - -/* Return a randomly selected, valid sublist index for this multilist */ -unsigned int -multilist_get_random_index(multilist_t *ml) -{ - return (spa_get_random(ml->ml_num_sublists)); -} - -/* Lock and return the sublist specified at the given index */ -multilist_sublist_t * -multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) -{ - multilist_sublist_t *mls; - - ASSERT3U(sublist_idx, <, ml->ml_num_sublists); - mls = &ml->ml_sublists[sublist_idx]; - mutex_enter(&mls->mls_lock); - - return (mls); -} - -/* Lock and return the sublist that would be used to store the specified obj */ -multilist_sublist_t * -multilist_sublist_lock_obj(multilist_t *ml, void *obj) -{ - return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); -} - -void -multilist_sublist_unlock(multilist_sublist_t *mls) -{ - mutex_exit(&mls->mls_lock); -} - -/* - * We're allowing any object to be inserted into this specific sublist, - * but this can lead to trouble if multilist_remove() is called to - * remove this object. Specifically, if calling ml_index_func on this - * object returns an index for sublist different than what is passed as - * a parameter here, any call to multilist_remove() with this newly - * inserted object is undefined! (the call to multilist_remove() will - * remove the object from a list that it isn't contained in) - */ -void -multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - list_insert_head(&mls->mls_list, obj); -} - -/* please see comment above multilist_sublist_insert_head */ -void -multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - list_insert_tail(&mls->mls_list, obj); -} - -/* - * Move the object one element forward in the list. - * - * This function will move the given object forward in the list (towards - * the head) by one object. So, in essence, it will swap its position in - * the list with its "prev" pointer. If the given object is already at the - * head of the list, it cannot be moved forward any more than it already - * is, so no action is taken. - * - * NOTE: This function **must not** remove any object from the list other - * than the object given as the parameter. This is relied upon in - * arc_evict_state_impl(). 
- */ -void -multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) -{ - void *prev = list_prev(&mls->mls_list, obj); - - ASSERT(MUTEX_HELD(&mls->mls_lock)); - ASSERT(!list_is_empty(&mls->mls_list)); - - /* 'obj' must be at the head of the list, nothing to do */ - if (prev == NULL) - return; - - list_remove(&mls->mls_list, obj); - list_insert_before(&mls->mls_list, prev, obj); -} - -void -multilist_sublist_remove(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - list_remove(&mls->mls_list, obj); -} - -int -multilist_sublist_is_empty(multilist_sublist_t *mls) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_is_empty(&mls->mls_list)); -} - -int -multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx) -{ - multilist_sublist_t *mls; - int empty; - - ASSERT3U(sublist_idx, <, ml->ml_num_sublists); - mls = &ml->ml_sublists[sublist_idx]; - ASSERT(!MUTEX_HELD(&mls->mls_lock)); - mutex_enter(&mls->mls_lock); - empty = list_is_empty(&mls->mls_list); - mutex_exit(&mls->mls_lock); - return (empty); -} - -void * -multilist_sublist_head(multilist_sublist_t *mls) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_head(&mls->mls_list)); -} - -void * -multilist_sublist_tail(multilist_sublist_t *mls) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_tail(&mls->mls_list)); -} - -void * -multilist_sublist_next(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_next(&mls->mls_list, obj)); -} - -void * -multilist_sublist_prev(multilist_sublist_t *mls, void *obj) -{ - ASSERT(MUTEX_HELD(&mls->mls_lock)); - return (list_prev(&mls->mls_list, obj)); -} - -void -multilist_link_init(multilist_node_t *link) -{ - list_link_init(link); -} - -int -multilist_link_active(multilist_node_t *link) -{ - return (list_link_active(link)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c +++ /dev/null @@ -1,670 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Range trees are tree-based data structures that can be used to - * track free space or generally any space allocation information. - * A range tree keeps track of individual segments and automatically - * provides facilities such as adjacent extent merging and extent - * splitting in response to range add/remove requests. 
- * - * A range tree starts out completely empty, with no segments in it. - * Adding an allocation via range_tree_add to the range tree can either: - * 1) create a new extent - * 2) extend an adjacent extent - * 3) merge two adjacent extents - * Conversely, removing an allocation via range_tree_remove can: - * 1) completely remove an extent - * 2) shorten an extent (if the allocation was near one of its ends) - * 3) split an extent into two extents, in effect punching a hole - * - * A range tree is also capable of 'bridging' gaps when adding - * allocations. This is useful for cases when close proximity of - * allocations is an important detail that needs to be represented - * in the range tree. See range_tree_set_gap(). The default behavior - * is not to bridge gaps (i.e. the maximum allowed gap size is 0). - * - * In order to traverse a range tree, use either the range_tree_walk() - * or range_tree_vacate() functions. - * - * To obtain more accurate information on individual segment - * operations that the range tree performs "under the hood", you can - * specify a set of callbacks by passing a range_tree_ops_t structure - * to the range_tree_create function. Any callbacks that are non-NULL - * are then called at the appropriate times. - * - * The range tree code also supports a special variant of range trees - * that can bridge small gaps between segments. This kind of tree is used - * by the dsl scanning code to group I/Os into mostly sequential chunks to - * optimize disk performance. The code here attempts to do this with as - * little memory and computational overhead as possible. One limitation of - * this implementation is that segments of range trees with gaps can only - * support removing complete segments. - */ - -kmem_cache_t *range_seg_cache; - -/* Generic ops for managing an AVL tree alongside a range tree */ -struct range_tree_ops rt_avl_ops = { - .rtop_create = rt_avl_create, - .rtop_destroy = rt_avl_destroy, - .rtop_add = rt_avl_add, - .rtop_remove = rt_avl_remove, - .rtop_vacate = rt_avl_vacate, -}; - -void -range_tree_init(void) -{ - ASSERT(range_seg_cache == NULL); - range_seg_cache = kmem_cache_create("range_seg_cache", - sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -range_tree_fini(void) -{ - kmem_cache_destroy(range_seg_cache); - range_seg_cache = NULL; -} - -void -range_tree_stat_verify(range_tree_t *rt) -{ - range_seg_t *rs; - uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; - int i; - - for (rs = avl_first(&rt->rt_root); rs != NULL; - rs = AVL_NEXT(&rt->rt_root, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; - int idx = highbit64(size) - 1; - - hist[idx]++; - ASSERT3U(hist[idx], !=, 0); - } - - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - if (hist[i] != rt->rt_histogram[i]) { - zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu", - i, hist, hist[i], rt->rt_histogram[i]); - } - VERIFY3U(hist[i], ==, rt->rt_histogram[i]); - } -} - -static void -range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) -{ - uint64_t size = rs->rs_end - rs->rs_start; - int idx = highbit64(size) - 1; - - ASSERT(size != 0); - ASSERT3U(idx, <, - sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); - - rt->rt_histogram[idx]++; - ASSERT3U(rt->rt_histogram[idx], !=, 0); -} - -static void -range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) -{ - uint64_t size = rs->rs_end - rs->rs_start; - int idx = highbit64(size) - 1; - - ASSERT(size != 0); - ASSERT3U(idx, <, - sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); - - 
ASSERT3U(rt->rt_histogram[idx], !=, 0); - rt->rt_histogram[idx]--; -} - -/* - * NOTE: caller is responsible for all locking. - */ -static int -range_tree_seg_compare(const void *x1, const void *x2) -{ - const range_seg_t *r1 = (const range_seg_t *)x1; - const range_seg_t *r2 = (const range_seg_t *)x2; - - ASSERT3U(r1->rs_start, <=, r1->rs_end); - ASSERT3U(r2->rs_start, <=, r2->rs_end); - - return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); -} - -range_tree_t * -range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), uint64_t gap) -{ - range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); - - avl_create(&rt->rt_root, range_tree_seg_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); - - rt->rt_ops = ops; - rt->rt_arg = arg; - rt->rt_gap = gap; - rt->rt_avl_compare = avl_compare; - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) - rt->rt_ops->rtop_create(rt, rt->rt_arg); - - return (rt); -} - -range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg) -{ - return (range_tree_create_impl(ops, arg, NULL, 0)); -} - -void -range_tree_destroy(range_tree_t *rt) -{ - VERIFY0(rt->rt_space); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) - rt->rt_ops->rtop_destroy(rt, rt->rt_arg); - - avl_destroy(&rt->rt_root); - kmem_free(rt, sizeof (*rt)); -} - -void -range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) -{ - ASSERT3U(rs->rs_fill + delta, !=, 0); - ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_fill += delta; - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); -} - -static void -range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) -{ - range_tree_t *rt = arg; - avl_index_t where; - range_seg_t rsearch, *rs_before, *rs_after, *rs; - uint64_t end = start + size, gap = rt->rt_gap; - uint64_t bridge_size = 0; - boolean_t merge_before, merge_after; - - ASSERT3U(size, !=, 0); - ASSERT3U(fill, <=, size); - - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); - - if (gap == 0 && rs != NULL && - rs->rs_start <= start && rs->rs_end >= end) { - zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); - return; - } - - /* - * If this is a gap-supporting range tree, it is possible that we - * are inserting into an existing segment. In this case simply - * bump the fill count and call the remove / add callbacks. If the - * new range will extend an existing segment, we remove the - * existing one, apply the new extent to it and re-insert it using - * the normal code paths. 
- */ - if (rs != NULL) { - ASSERT3U(gap, !=, 0); - if (rs->rs_start <= start && rs->rs_end >= end) { - range_tree_adjust_fill(rt, rs, fill); - return; - } - - avl_remove(&rt->rt_root, rs); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - - range_tree_stat_decr(rt, rs); - rt->rt_space -= rs->rs_end - rs->rs_start; - - fill += rs->rs_fill; - start = MIN(start, rs->rs_start); - end = MAX(end, rs->rs_end); - size = end - start; - - range_tree_add_impl(rt, start, size, fill); - - kmem_cache_free(range_seg_cache, rs); - return; - } - - ASSERT3P(rs, ==, NULL); - - /* - * Determine whether or not we will have to merge with our neighbors. - * If gap != 0, we might need to merge with our neighbors even if we - * aren't directly touching. - */ - rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); - rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); - - merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); - merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); - - if (merge_before && gap != 0) - bridge_size += start - rs_before->rs_end; - if (merge_after && gap != 0) - bridge_size += rs_after->rs_start - end; - - if (merge_before && merge_after) { - avl_remove(&rt->rt_root, rs_before); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { - rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); - rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); - } - - range_tree_stat_decr(rt, rs_before); - range_tree_stat_decr(rt, rs_after); - - rs_after->rs_fill += rs_before->rs_fill + fill; - rs_after->rs_start = rs_before->rs_start; - kmem_cache_free(range_seg_cache, rs_before); - rs = rs_after; - } else if (merge_before) { - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); - - range_tree_stat_decr(rt, rs_before); - - rs_before->rs_fill += fill; - rs_before->rs_end = end; - rs = rs_before; - } else if (merge_after) { - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); - - range_tree_stat_decr(rt, rs_after); - - rs_after->rs_fill += fill; - rs_after->rs_start = start; - rs = rs_after; - } else { - rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); - - rs->rs_fill = fill; - rs->rs_start = start; - rs->rs_end = end; - avl_insert(&rt->rt_root, rs, where); - } - - if (gap != 0) - ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); - else - ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); - - range_tree_stat_incr(rt, rs); - rt->rt_space += size + bridge_size; -} - -void -range_tree_add(void *arg, uint64_t start, uint64_t size) -{ - range_tree_add_impl(arg, start, size, size); -} - -static void -range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, - boolean_t do_fill) -{ - avl_index_t where; - range_seg_t rsearch, *rs, *newseg; - uint64_t end = start + size; - boolean_t left_over, right_over; - - VERIFY3U(size, !=, 0); - VERIFY3U(size, <=, rt->rt_space); - - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); - - /* Make sure we completely overlap with someone */ - if (rs == NULL) { - zfs_panic_recover("zfs: freeing free segment " - "(offset=%llu size=%llu)", - (longlong_t)start, (longlong_t)size); - return; - } - - /* - * Range trees with gap support must only remove complete segments - * from the tree. 
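A minimal sketch of the add/remove semantics described above, assuming the usual ZFS kernel context (range_tree_init() already called); the offsets are arbitrary and only APIs from this file are used:

static void
range_tree_example(void)
{
	/* No ops callbacks, no gap bridging (gap == 0). */
	range_tree_t *rt = range_tree_create(NULL, NULL);

	range_tree_add(rt, 0x1000, 0x1000);	/* new segment  [0x1000, 0x2000) */
	range_tree_add(rt, 0x2000, 0x1000);	/* merges into  [0x1000, 0x3000) */

	/* Punch a hole in the middle: the one segment becomes two. */
	range_tree_remove(rt, 0x1800, 0x400);

	ASSERT(range_tree_contains(rt, 0x1000, 0x800));
	ASSERT(!range_tree_contains(rt, 0x1800, 0x400));
	ASSERT3U(range_tree_space(rt), ==, 0x2000 - 0x400);

	range_tree_vacate(rt, NULL, NULL);
	range_tree_destroy(rt);
}

Removing the middle of a segment is the splitting case: the tracked space drops by exactly the removed size while the two leftover pieces stay in the tree.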
This allows us to maintain accurate fill accounting - * and to ensure that bridged sections are not leaked. If we need to - * remove less than the full segment, we can only adjust the fill count. - */ - if (rt->rt_gap != 0) { - if (do_fill) { - if (rs->rs_fill == size) { - start = rs->rs_start; - end = rs->rs_end; - size = end - start; - } else { - range_tree_adjust_fill(rt, rs, -size); - return; - } - } else if (rs->rs_start != start || rs->rs_end != end) { - zfs_panic_recover("zfs: freeing partial segment of " - "gap tree (offset=%llu size=%llu) of " - "(offset=%llu size=%llu)", - (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); - return; - } - } - - VERIFY3U(rs->rs_start, <=, start); - VERIFY3U(rs->rs_end, >=, end); - - left_over = (rs->rs_start != start); - right_over = (rs->rs_end != end); - - range_tree_stat_decr(rt, rs); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - - if (left_over && right_over) { - newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); - newseg->rs_start = end; - newseg->rs_end = rs->rs_end; - newseg->rs_fill = newseg->rs_end - newseg->rs_start; - range_tree_stat_incr(rt, newseg); - - rs->rs_end = start; - - avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); - } else if (left_over) { - rs->rs_end = start; - } else if (right_over) { - rs->rs_start = end; - } else { - avl_remove(&rt->rt_root, rs); - kmem_cache_free(range_seg_cache, rs); - rs = NULL; - } - - if (rs != NULL) { - /* - * The fill of the leftover segment will always be equal to - * the size, since we do not support removing partial segments - * of range trees with gaps. 
- */ - rs->rs_fill = rs->rs_end - rs->rs_start; - range_tree_stat_incr(rt, rs); - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); - } - - rt->rt_space -= size; -} - -void -range_tree_remove(void *arg, uint64_t start, uint64_t size) -{ - range_tree_remove_impl(arg, start, size, B_FALSE); -} - -void -range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_tree_remove_impl(rt, start, size, B_TRUE); -} - -void -range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, - uint64_t newstart, uint64_t newsize) -{ - int64_t delta = newsize - (rs->rs_end - rs->rs_start); - - range_tree_stat_decr(rt, rs); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) - rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - - rs->rs_start = newstart; - rs->rs_end = newstart + newsize; - - range_tree_stat_incr(rt, rs); - if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); - - rt->rt_space += delta; -} - -static range_seg_t * -range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_seg_t rsearch; - uint64_t end = start + size; - - VERIFY(size != 0); - - rsearch.rs_start = start; - rsearch.rs_end = end; - return (avl_find(&rt->rt_root, &rsearch, NULL)); -} - -range_seg_t * -range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_seg_t *rs = range_tree_find_impl(rt, start, size); - if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) - return (rs); - return (NULL); -} - -void -range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size) -{ - range_seg_t *rs = range_tree_find(rt, off, size); - if (rs != NULL) - panic("segment already in tree; rs=%p", (void *)rs); -} - -boolean_t -range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) -{ - return (range_tree_find(rt, start, size) != NULL); -} - -/* - * Ensure that this range is not in the tree, regardless of whether - * it is currently in the tree. 
- */ -void -range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) -{ - range_seg_t *rs; - - if (size == 0) - return; - - while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { - uint64_t free_start = MAX(rs->rs_start, start); - uint64_t free_end = MIN(rs->rs_end, start + size); - range_tree_remove(rt, free_start, free_end - free_start); - } -} - -void -range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) -{ - range_tree_t *rt; - - ASSERT0(range_tree_space(*rtdst)); - ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); - - rt = *rtsrc; - *rtsrc = *rtdst; - *rtdst = rt; -} - -void -range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) -{ - range_seg_t *rs; - void *cookie = NULL; - - - if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) - rt->rt_ops->rtop_vacate(rt, rt->rt_arg); - - while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { - if (func != NULL) - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); - kmem_cache_free(range_seg_cache, rs); - } - - bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); - rt->rt_space = 0; -} - -void -range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) -{ - range_seg_t *rs; - - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); -} - -range_seg_t * -range_tree_first(range_tree_t *rt) -{ - return (avl_first(&rt->rt_root)); -} - -uint64_t -range_tree_space(range_tree_t *rt) -{ - return (rt->rt_space); -} - -/* Generic range tree functions for maintaining segments in an AVL tree. */ -void -rt_avl_create(range_tree_t *rt, void *arg) -{ - avl_tree_t *tree = arg; - - avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), - offsetof(range_seg_t, rs_pp_node)); -} - -void -rt_avl_destroy(range_tree_t *rt, void *arg) -{ - avl_tree_t *tree = arg; - - ASSERT0(avl_numnodes(tree)); - avl_destroy(tree); -} - -void -rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - avl_tree_t *tree = arg; - avl_add(tree, rs); -} - -void -rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - avl_tree_t *tree = arg; - avl_remove(tree, rs); -} - -void -rt_avl_vacate(range_tree_t *rt, void *arg) -{ - /* - * Normally one would walk the tree freeing nodes along the way. - * Since the nodes are shared with the range trees we can avoid - * walking all nodes and just reinitialize the avl tree. The nodes - * will be freed by the range tree, so we don't want to free them here. - */ - rt_avl_create(rt, arg); -} - -boolean_t -range_tree_is_empty(range_tree_t *rt) -{ - ASSERT(rt != NULL); - return (range_tree_space(rt) == 0); -} - -uint64_t -range_tree_min(range_tree_t *rt) -{ - range_seg_t *rs = avl_first(&rt->rt_root); - return (rs != NULL ? rs->rs_start : 0); -} - -uint64_t -range_tree_max(range_tree_t *rt) -{ - range_seg_t *rs = avl_last(&rt->rt_root); - return (rs != NULL ? rs->rs_end : 0); -} - -uint64_t -range_tree_span(range_tree_t *rt) -{ - return (range_tree_max(rt) - range_tree_min(rt)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ /dev/null @@ -1,321 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - */ - -#include -#include - -#ifdef ZFS_DEBUG - -#ifdef _KERNEL -int reference_tracking_enable = FALSE; /* runs out of memory too easily */ -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN, - &reference_tracking_enable, 0, - "Track reference holders to refcount_t objects, used mostly by ZFS"); -#else -int reference_tracking_enable = TRUE; -#endif -int reference_history = 3; /* tunable */ - -static kmem_cache_t *reference_cache; -static kmem_cache_t *reference_history_cache; - -void -zfs_refcount_init(void) -{ - reference_cache = kmem_cache_create("reference_cache", - sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - reference_history_cache = kmem_cache_create("reference_history_cache", - sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -zfs_refcount_fini(void) -{ - kmem_cache_destroy(reference_cache); - kmem_cache_destroy(reference_history_cache); -} - -void -zfs_refcount_create(zfs_refcount_t *rc) -{ - mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&rc->rc_list, sizeof (reference_t), - offsetof(reference_t, ref_link)); - list_create(&rc->rc_removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); - rc->rc_count = 0; - rc->rc_removed_count = 0; - rc->rc_tracked = reference_tracking_enable; -} - -void -zfs_refcount_create_tracked(zfs_refcount_t *rc) -{ - zfs_refcount_create(rc); - rc->rc_tracked = B_TRUE; -} - -void -zfs_refcount_create_untracked(zfs_refcount_t *rc) -{ - zfs_refcount_create(rc); - rc->rc_tracked = B_FALSE; -} - -void -zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) -{ - reference_t *ref; - - ASSERT(rc->rc_count == number); - while (ref = list_head(&rc->rc_list)) { - list_remove(&rc->rc_list, ref); - kmem_cache_free(reference_cache, ref); - } - list_destroy(&rc->rc_list); - - while (ref = list_head(&rc->rc_removed)) { - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, ref->ref_removed); - kmem_cache_free(reference_cache, ref); - } - list_destroy(&rc->rc_removed); - mutex_destroy(&rc->rc_mtx); -} - -void -zfs_refcount_destroy(zfs_refcount_t *rc) -{ - zfs_refcount_destroy_many(rc, 0); -} - -int -zfs_refcount_is_zero(zfs_refcount_t *rc) -{ - return (rc->rc_count == 0); -} - -int64_t -zfs_refcount_count(zfs_refcount_t *rc) -{ - return (rc->rc_count); -} - -int64_t -zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, void *holder) -{ - reference_t *ref = NULL; - int64_t count; - - if (rc->rc_tracked) { - ref = kmem_cache_alloc(reference_cache, KM_SLEEP); - ref->ref_holder = holder; - ref->ref_number = number; - } - mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= 0); - if (rc->rc_tracked) - list_insert_head(&rc->rc_list, ref); - rc->rc_count += number; - count = 
rc->rc_count; - mutex_exit(&rc->rc_mtx); - - return (count); -} - -int64_t -zfs_refcount_add(zfs_refcount_t *rc, void *holder) -{ - return (zfs_refcount_add_many(rc, 1, holder)); -} - -int64_t -zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, void *holder) -{ - reference_t *ref; - int64_t count; - - mutex_enter(&rc->rc_mtx); - ASSERT(rc->rc_count >= number); - - if (!rc->rc_tracked) { - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); - } - - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder && ref->ref_number == number) { - list_remove(&rc->rc_list, ref); - if (reference_history > 0) { - ref->ref_removed = - kmem_cache_alloc(reference_history_cache, - KM_SLEEP); - list_insert_head(&rc->rc_removed, ref); - rc->rc_removed_count++; - if (rc->rc_removed_count > reference_history) { - ref = list_tail(&rc->rc_removed); - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, - ref->ref_removed); - kmem_cache_free(reference_cache, ref); - rc->rc_removed_count--; - } - } else { - kmem_cache_free(reference_cache, ref); - } - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); - } - } - panic("No such hold %p on refcount %llx", holder, - (u_longlong_t)(uintptr_t)rc); - return (-1); -} - -int64_t -zfs_refcount_remove(zfs_refcount_t *rc, void *holder) -{ - return (zfs_refcount_remove_many(rc, 1, holder)); -} - -void -zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) -{ - int64_t count, removed_count; - list_t list, removed; - - list_create(&list, sizeof (reference_t), - offsetof(reference_t, ref_link)); - list_create(&removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); - - mutex_enter(&src->rc_mtx); - count = src->rc_count; - removed_count = src->rc_removed_count; - src->rc_count = 0; - src->rc_removed_count = 0; - list_move_tail(&list, &src->rc_list); - list_move_tail(&removed, &src->rc_removed); - mutex_exit(&src->rc_mtx); - - mutex_enter(&dst->rc_mtx); - dst->rc_count += count; - dst->rc_removed_count += removed_count; - list_move_tail(&dst->rc_list, &list); - list_move_tail(&dst->rc_removed, &removed); - mutex_exit(&dst->rc_mtx); - - list_destroy(&list); - list_destroy(&removed); -} - -void -zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder, - void *new_holder) -{ - reference_t *ref; - boolean_t found = B_FALSE; - - mutex_enter(&rc->rc_mtx); - if (!rc->rc_tracked) { - mutex_exit(&rc->rc_mtx); - return; - } - - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == current_holder) { - ref->ref_holder = new_holder; - found = B_TRUE; - break; - } - } - ASSERT(found); - mutex_exit(&rc->rc_mtx); -} - -/* - * If tracking is enabled, return true if a reference exists that matches - * the "holder" tag. If tracking is disabled, then return true if a reference - * might be held. - */ -boolean_t -zfs_refcount_held(zfs_refcount_t *rc, void *holder) -{ - reference_t *ref; - - mutex_enter(&rc->rc_mtx); - - if (!rc->rc_tracked) { - mutex_exit(&rc->rc_mtx); - return (rc->rc_count > 0); - } - - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_TRUE); - } - } - mutex_exit(&rc->rc_mtx); - return (B_FALSE); -} - -/* - * If tracking is enabled, return true if a reference does not exist that - * matches the "holder" tag. 
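A minimal sketch of the tracked-holder pattern these functions implement, assuming a ZFS_DEBUG build with zfs_refcount_init() already called; foo_t and the holder tag are hypothetical:

typedef struct foo {
	zfs_refcount_t	foo_refcnt;
} foo_t;

static void
foo_refcount_example(foo_t *foo, void *tag)
{
	zfs_refcount_create_tracked(&foo->foo_refcnt);

	/* Every hold is taken, checked, and later dropped under one tag. */
	(void) zfs_refcount_add(&foo->foo_refcnt, tag);
	ASSERT(zfs_refcount_held(&foo->foo_refcnt, tag));

	(void) zfs_refcount_remove(&foo->foo_refcnt, tag);
	ASSERT(zfs_refcount_not_held(&foo->foo_refcnt, tag));

	zfs_refcount_destroy(&foo->foo_refcnt);
}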
If tracking is disabled, always return true - * since the reference might not be held. - */ -boolean_t -zfs_refcount_not_held(zfs_refcount_t *rc, void *holder) -{ - reference_t *ref; - - mutex_enter(&rc->rc_mtx); - - if (!rc->rc_tracked) { - mutex_exit(&rc->rc_mtx); - return (B_TRUE); - } - - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_FALSE); - } - } - mutex_exit(&rc->rc_mtx); - return (B_TRUE); -} -#endif /* ZFS_DEBUG */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c +++ /dev/null @@ -1,396 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#include -#include - -/* - * This file contains the implementation of a re-entrant read - * reader/writer lock (aka "rrwlock"). - * - * This is a normal reader/writer lock with the additional feature - * of allowing threads who have already obtained a read lock to - * re-enter another read lock (re-entrant read) - even if there are - * waiting writers. - * - * Callers who have not obtained a read lock give waiting writers priority. - * - * The rrwlock_t lock does not allow re-entrant writers, nor does it - * allow a re-entrant mix of reads and writes (that is, it does not - * allow a caller who has already obtained a read lock to be able to - * then grab a write lock without first dropping all read locks, and - * vice versa). - * - * The rrwlock_t uses tsd (thread specific data) to keep a list of - * nodes (rrw_node_t), where each node keeps track of which specific - * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering - * should be rare, a thread that grabs multiple reads on the same rrwlock_t - * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the - * tsd list can represent a different rrwlock_t. This allows a thread - * to enter multiple and unique rrwlock_ts for read locks at the same time. - * - * Since using tsd exposes some overhead, the rrwlock_t only needs to - * keep tsd data when writers are waiting. If no writers are waiting, then - * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd - * is needed. Once a writer attempts to grab the lock, readers then - * keep tsd data and bump the linked readers count (rr_linked_rcount). - * - * If there are waiting writers and there are anonymous readers, then a - * reader doesn't know if it is a re-entrant lock. 
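A minimal usage sketch of the re-entrant read behaviour described in this comment, assuming kernel context; FTAG is the usual ZFS holder-tag macro, and the lock itself is hypothetical:

static void
rrw_reentrant_read_example(void)
{
	rrwlock_t rrl;

	rrw_init(&rrl, B_FALSE);

	/* First read hold; the tag identifies the holder. */
	rrw_enter(&rrl, RW_READER, FTAG);

	/*
	 * Re-entering for read from the same thread is allowed even when
	 * writers are waiting; see the description above.
	 */
	rrw_enter(&rrl, RW_READER, FTAG);
	ASSERT(rrw_held(&rrl, RW_READER));

	rrw_exit(&rrl, FTAG);
	rrw_exit(&rrl, FTAG);

	rrw_destroy(&rrl);
}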
But since it may be one, - * we allow the read to proceed (otherwise it could deadlock). Since once - * waiting writers are active, readers no longer bump the anonymous count, - * the anonymous readers will eventually flush themselves out. At this point, - * readers will be able to tell if they are a re-entrant lock (have a - * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then - * we must let the proceed. If they are not, then the reader blocks for the - * waiting writers. Hence, we do not starve writers. - */ - -/* global key for TSD */ -uint_t rrw_tsd_key; - -typedef struct rrw_node { - struct rrw_node *rn_next; - rrwlock_t *rn_rrl; - void *rn_tag; -} rrw_node_t; - -static rrw_node_t * -rrn_find(rrwlock_t *rrl) -{ - rrw_node_t *rn; - - if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0) - return (NULL); - - for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { - if (rn->rn_rrl == rrl) - return (rn); - } - return (NULL); -} - -/* - * Add a node to the head of the singly linked list. - */ -static void -rrn_add(rrwlock_t *rrl, void *tag) -{ - rrw_node_t *rn; - - rn = kmem_alloc(sizeof (*rn), KM_SLEEP); - rn->rn_rrl = rrl; - rn->rn_next = tsd_get(rrw_tsd_key); - rn->rn_tag = tag; - VERIFY(tsd_set(rrw_tsd_key, rn) == 0); -} - -/* - * If a node is found for 'rrl', then remove the node from this - * thread's list and return TRUE; otherwise return FALSE. - */ -static boolean_t -rrn_find_and_remove(rrwlock_t *rrl, void *tag) -{ - rrw_node_t *rn; - rrw_node_t *prev = NULL; - - if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0) - return (B_FALSE); - - for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { - if (rn->rn_rrl == rrl && rn->rn_tag == tag) { - if (prev) - prev->rn_next = rn->rn_next; - else - VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); - kmem_free(rn, sizeof (*rn)); - return (B_TRUE); - } - prev = rn; - } - return (B_FALSE); -} - -void -rrw_init(rrwlock_t *rrl, boolean_t track_all) -{ - mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); - rrl->rr_writer = NULL; - zfs_refcount_create(&rrl->rr_anon_rcount); - zfs_refcount_create(&rrl->rr_linked_rcount); - rrl->rr_writer_wanted = B_FALSE; - rrl->rr_track_all = track_all; -} - -void -rrw_destroy(rrwlock_t *rrl) -{ - mutex_destroy(&rrl->rr_lock); - cv_destroy(&rrl->rr_cv); - ASSERT(rrl->rr_writer == NULL); - zfs_refcount_destroy(&rrl->rr_anon_rcount); - zfs_refcount_destroy(&rrl->rr_linked_rcount); -} - -static void -rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) -{ - mutex_enter(&rrl->rr_lock); -#if !defined(DEBUG) && defined(_KERNEL) - if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && - !rrl->rr_track_all) { - rrl->rr_anon_rcount.rc_count++; - mutex_exit(&rrl->rr_lock); - return; - } - DTRACE_PROBE(zfs__rrwfastpath__rdmiss); -#endif - ASSERT(rrl->rr_writer != curthread); - ASSERT(zfs_refcount_count(&rrl->rr_anon_rcount) >= 0); - - while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && - zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio && - rrn_find(rrl) == NULL)) - cv_wait(&rrl->rr_cv, &rrl->rr_lock); - - if (rrl->rr_writer_wanted || rrl->rr_track_all) { - /* may or may not be a re-entrant enter */ - rrn_add(rrl, tag); - (void) zfs_refcount_add(&rrl->rr_linked_rcount, tag); - } else { - (void) zfs_refcount_add(&rrl->rr_anon_rcount, tag); - } - ASSERT(rrl->rr_writer == NULL); - mutex_exit(&rrl->rr_lock); -} - -void -rrw_enter_read(rrwlock_t *rrl, void *tag) -{ - rrw_enter_read_impl(rrl, B_FALSE, tag); -} - -/* - 
* take a read lock even if there are pending write lock requests. if we want - * to take a lock reentrantly, but from different threads (that have a - * relationship to each other), the normal detection mechanism to overrule - * the pending writer does not work, so we have to give an explicit hint here. - */ -void -rrw_enter_read_prio(rrwlock_t *rrl, void *tag) -{ - rrw_enter_read_impl(rrl, B_TRUE, tag); -} - - -void -rrw_enter_write(rrwlock_t *rrl) -{ - mutex_enter(&rrl->rr_lock); - ASSERT(rrl->rr_writer != curthread); - - while (zfs_refcount_count(&rrl->rr_anon_rcount) > 0 || - zfs_refcount_count(&rrl->rr_linked_rcount) > 0 || - rrl->rr_writer != NULL) { - rrl->rr_writer_wanted = B_TRUE; - cv_wait(&rrl->rr_cv, &rrl->rr_lock); - } - rrl->rr_writer_wanted = B_FALSE; - rrl->rr_writer = curthread; - mutex_exit(&rrl->rr_lock); -} - -void -rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) -{ - if (rw == RW_READER) - rrw_enter_read(rrl, tag); - else - rrw_enter_write(rrl); -} - -void -rrw_exit(rrwlock_t *rrl, void *tag) -{ - mutex_enter(&rrl->rr_lock); -#if !defined(DEBUG) && defined(_KERNEL) - if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { - rrl->rr_anon_rcount.rc_count--; - if (rrl->rr_anon_rcount.rc_count == 0) - cv_broadcast(&rrl->rr_cv); - mutex_exit(&rrl->rr_lock); - return; - } - DTRACE_PROBE(zfs__rrwfastpath__exitmiss); -#endif - ASSERT(!zfs_refcount_is_zero(&rrl->rr_anon_rcount) || - !zfs_refcount_is_zero(&rrl->rr_linked_rcount) || - rrl->rr_writer != NULL); - - if (rrl->rr_writer == NULL) { - int64_t count; - if (rrn_find_and_remove(rrl, tag)) { - count = zfs_refcount_remove( - &rrl->rr_linked_rcount, tag); - } else { - ASSERT(!rrl->rr_track_all); - count = zfs_refcount_remove(&rrl->rr_anon_rcount, tag); - } - if (count == 0) - cv_broadcast(&rrl->rr_cv); - } else { - ASSERT(rrl->rr_writer == curthread); - ASSERT(zfs_refcount_is_zero(&rrl->rr_anon_rcount) && - zfs_refcount_is_zero(&rrl->rr_linked_rcount)); - rrl->rr_writer = NULL; - cv_broadcast(&rrl->rr_cv); - } - mutex_exit(&rrl->rr_lock); -} - -/* - * If the lock was created with track_all, rrw_held(RW_READER) will return - * B_TRUE iff the current thread has the lock for reader. Otherwise it may - * return B_TRUE if any thread has the lock for reader. - */ -boolean_t -rrw_held(rrwlock_t *rrl, krw_t rw) -{ - boolean_t held; - - mutex_enter(&rrl->rr_lock); - if (rw == RW_WRITER) { - held = (rrl->rr_writer == curthread); - } else { - held = (!zfs_refcount_is_zero(&rrl->rr_anon_rcount) || - rrn_find(rrl) != NULL); - } - mutex_exit(&rrl->rr_lock); - - return (held); -} - -void -rrw_tsd_destroy(void *arg) -{ - rrw_node_t *rn = arg; - if (rn != NULL) { - panic("thread %p terminating with rrw lock %p held", - (void *)curthread, (void *)rn->rn_rrl); - } -} - -/* - * A reader-mostly lock implementation, tuning above reader-writer locks - * for hightly parallel read acquisitions, while pessimizing writes. - * - * The idea is to split single busy lock into array of locks, so that - * each reader can lock only one of them for read, depending on result - * of simple hash function. That proportionally reduces lock congestion. - * Writer same time has to sequentially aquire write on all the locks. - * That makes write aquisition proportionally slower, but in places where - * it is used (filesystem unmount) performance is not critical. - * - * All the functions below are direct wrappers around functions above. 
- */ -void -rrm_init(rrmlock_t *rrl, boolean_t track_all) -{ - int i; - - for (i = 0; i < RRM_NUM_LOCKS; i++) - rrw_init(&rrl->locks[i], track_all); -} - -void -rrm_destroy(rrmlock_t *rrl) -{ - int i; - - for (i = 0; i < RRM_NUM_LOCKS; i++) - rrw_destroy(&rrl->locks[i]); -} - -void -rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) -{ - if (rw == RW_READER) - rrm_enter_read(rrl, tag); - else - rrm_enter_write(rrl); -} - -/* - * This maps the current thread to a specific lock. Note that the lock - * must be released by the same thread that acquired it. We do this - * mapping by taking the thread pointer mod a prime number. We examine - * only the low 32 bits of the thread pointer, because 32-bit division - * is faster than 64-bit division, and the high 32 bits have little - * entropy anyway. - */ -#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) - -void -rrm_enter_read(rrmlock_t *rrl, void *tag) -{ - rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); -} - -void -rrm_enter_write(rrmlock_t *rrl) -{ - int i; - - for (i = 0; i < RRM_NUM_LOCKS; i++) - rrw_enter_write(&rrl->locks[i]); -} - -void -rrm_exit(rrmlock_t *rrl, void *tag) -{ - int i; - - if (rrl->locks[0].rr_writer == curthread) { - for (i = 0; i < RRM_NUM_LOCKS; i++) - rrw_exit(&rrl->locks[i], tag); - } else { - rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag); - } -} - -boolean_t -rrm_held(rrmlock_t *rrl, krw_t rw) -{ - if (rw == RW_WRITER) { - return (rrw_held(&rrl->locks[0], rw)); - } else { - return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw)); - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c +++ /dev/null @@ -1,2012 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright 2011 iXsystems, Inc - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ZFS System attributes: - * - * A generic mechanism to allow for arbitrary attributes - * to be stored in a dnode. The data will be stored in the bonus buffer of - * the dnode and if necessary a special "spill" block will be used to handle - * overflow situations. The spill block will be sized to fit the data - * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the - * spill block is stored at the end of the current bonus buffer. 
Any - * attributes that would be in the way of the blkptr_t will be relocated - * into the spill block. - * - * Attribute registration: - * - * Stored persistently on a per dataset basis - * a mapping between attribute "string" names and their actual attribute - * numeric values, length, and byteswap function. The names are only used - * during registration. All attributes are known by their unique attribute - * id value. If an attribute can have a variable size then the value - * 0 will be used to indicate this. - * - * Attribute Layout: - * - * Attribute layouts are a way to compactly store multiple attributes, but - * without taking the overhead associated with managing each attribute - * individually. Since you will typically have the same set of attributes - * stored in the same order a single table will be used to represent that - * layout. The ZPL for example will usually have only about 10 different - * layouts (regular files, device files, symlinks, - * regular files + scanstamp, files/dir with extended attributes, and then - * you have the possibility of all of those minus ACL, because it would - * be kicked out into the spill block) - * - * Layouts are simply an array of the attributes and their - * ordering i.e. [0, 1, 4, 5, 2] - * - * Each distinct layout is given a unique layout number and that is whats - * stored in the header at the beginning of the SA data buffer. - * - * A layout only covers a single dbuf (bonus or spill). If a set of - * attributes is split up between the bonus buffer and a spill buffer then - * two different layouts will be used. This allows us to byteswap the - * spill without looking at the bonus buffer and keeps the on disk format of - * the bonus and spill buffer the same. - * - * Adding a single attribute will cause the entire set of attributes to - * be rewritten and could result in a new layout number being constructed - * as part of the rewrite if no such layout exists for the new set of - * attribues. The new attribute will be appended to the end of the already - * existing attributes. - * - * Both the attribute registration and attribute layout information are - * stored in normal ZAP attributes. Their should be a small number of - * known layouts and the set of attributes is assumed to typically be quite - * small. - * - * The registered attributes and layout "table" information is maintained - * in core and a special "sa_os_t" is attached to the objset_t. - * - * A special interface is provided to allow for quickly applying - * a large set of attributes at once. sa_replace_all_by_template() is - * used to set an array of attributes. This is used by the ZPL when - * creating a brand new file. The template that is passed into the function - * specifies the attribute, size for variable length attributes, location of - * data and special "data locator" function if the data isn't in a contiguous - * location. - * - * Byteswap implications: - * - * Since the SA attributes are not entirely self describing we can't do - * the normal byteswap processing. The special ZAP layout attribute and - * attribute registration attributes define the byteswap function and the - * size of the attributes, unless it is variable sized. - * The normal ZFS byteswapping infrastructure assumes you don't need - * to read any objects in order to do the necessary byteswapping. Whereas - * SA attributes can only be properly byteswapped if the dataset is opened - * and the layout/attribute ZAP attributes are available. 
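A minimal sketch of the bulk interface mentioned above, assuming the SA_ADD_BULK_ATTR()/sa_bulk_lookup() consumer API from sys/sa.h and a ZPL-style per-dataset attribute table; the handle and table are supplied by the caller:

static int
zpl_bulk_lookup_example(sa_handle_t *hdl, sa_attr_type_t *sa_table)
{
	sa_bulk_attr_t bulk[2];
	uint64_t mode, size;
	int count = 0;

	/*
	 * Describe the wanted attributes; the SA code walks this array
	 * and finds each one in the bonus or spill layout.
	 */
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_SIZE], NULL,
	    &size, sizeof (size));

	return (sa_bulk_lookup(hdl, bulk, count));
}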
Because of this - * the SA attributes will be byteswapped when they are first accessed by - * the SA code that will read the SA data. - */ - -typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, - uint16_t length, int length_idx, boolean_t, void *userp); - -static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); -static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); -static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, - sa_hdr_phys_t *hdr); -static void sa_idx_tab_rele(objset_t *os, void *arg); -static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, - int buflen); -static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - sa_data_op_t action, sa_data_locator_t *locator, void *datastart, - uint16_t buflen, dmu_tx_t *tx); - -arc_byteswap_func_t *sa_bswap_table[] = { - byteswap_uint64_array, - byteswap_uint32_array, - byteswap_uint16_array, - byteswap_uint8_array, - zfs_acl_byteswap, -}; - -#define SA_COPY_DATA(f, s, t, l) \ - { \ - if (f == NULL) { \ - if (l == 8) { \ - *(uint64_t *)t = *(uint64_t *)s; \ - } else if (l == 16) { \ - *(uint64_t *)t = *(uint64_t *)s; \ - *(uint64_t *)((uintptr_t)t + 8) = \ - *(uint64_t *)((uintptr_t)s + 8); \ - } else { \ - bcopy(s, t, l); \ - } \ - } else \ - sa_copy_data(f, s, t, l); \ - } - -/* - * This table is fixed and cannot be changed. Its purpose is to - * allow the SA code to work with both old/new ZPL file systems. - * It contains the list of legacy attributes. These attributes aren't - * stored in the "attribute" registry zap objects, since older ZPL file systems - * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will - * use this static table. - */ -sa_attr_reg_t sa_legacy_attrs[] = { - {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, - {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, - {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, - {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, - {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, - {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, - {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, - {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, - {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, - {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, - {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, - {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, - {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, - {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, - {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, - {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, -}; - -/* - * This is only used for objects of type DMU_OT_ZNODE - */ -sa_attr_type_t sa_legacy_zpl_layout[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -}; - -/* - * Special dummy layout used for buffers with no attributes. 
- */ -sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; - -static int sa_legacy_attr_count = 16; -static kmem_cache_t *sa_cache = NULL; - -/*ARGSUSED*/ -static int -sa_cache_constructor(void *buf, void *unused, int kmflag) -{ - sa_handle_t *hdl = buf; - - mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); - return (0); -} - -/*ARGSUSED*/ -static void -sa_cache_destructor(void *buf, void *unused) -{ - sa_handle_t *hdl = buf; - mutex_destroy(&hdl->sa_lock); -} - -void -sa_cache_init(void) -{ - sa_cache = kmem_cache_create("sa_cache", - sizeof (sa_handle_t), 0, sa_cache_constructor, - sa_cache_destructor, NULL, NULL, NULL, 0); -} - -void -sa_cache_fini(void) -{ - if (sa_cache) - kmem_cache_destroy(sa_cache); -} - -static int -layout_num_compare(const void *arg1, const void *arg2) -{ - const sa_lot_t *node1 = (const sa_lot_t *)arg1; - const sa_lot_t *node2 = (const sa_lot_t *)arg2; - - return (AVL_CMP(node1->lot_num, node2->lot_num)); -} - -static int -layout_hash_compare(const void *arg1, const void *arg2) -{ - const sa_lot_t *node1 = (const sa_lot_t *)arg1; - const sa_lot_t *node2 = (const sa_lot_t *)arg2; - - int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); - if (likely(cmp)) - return (cmp); - - return (AVL_CMP(node1->lot_instance, node2->lot_instance)); -} - -boolean_t -sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) -{ - int i; - - if (count != tbf->lot_attr_count) - return (1); - - for (i = 0; i != count; i++) { - if (attrs[i] != tbf->lot_attrs[i]) - return (1); - } - return (0); -} - -#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) - -static uint64_t -sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) -{ - int i; - uint64_t crc = -1ULL; - - for (i = 0; i != attr_count; i++) - crc ^= SA_ATTR_HASH(attrs[i]); - - return (crc); -} - -static int -sa_get_spill(sa_handle_t *hdl) -{ - int rc; - if (hdl->sa_spill == NULL) { - if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, - &hdl->sa_spill)) == 0) - VERIFY(0 == sa_build_index(hdl, SA_SPILL)); - } else { - rc = 0; - } - - return (rc); -} - -/* - * Main attribute lookup/update function - * returns 0 for success or non zero for failures - * - * Operates on bulk array, first failure will abort further processing - */ -int -sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, - sa_data_op_t data_op, dmu_tx_t *tx) -{ - sa_os_t *sa = hdl->sa_os->os_sa; - int i; - int error = 0; - sa_buf_type_t buftypes; - - buftypes = 0; - - ASSERT(count > 0); - for (i = 0; i != count; i++) { - ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); - - bulk[i].sa_addr = NULL; - /* First check the bonus buffer */ - - if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( - hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { - SA_ATTR_INFO(sa, hdl->sa_bonus_tab, - SA_GET_HDR(hdl, SA_BONUS), - bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); - if (tx && !(buftypes & SA_BONUS)) { - dmu_buf_will_dirty(hdl->sa_bonus, tx); - buftypes |= SA_BONUS; - } - } - if (bulk[i].sa_addr == NULL && - ((error = sa_get_spill(hdl)) == 0)) { - if (TOC_ATTR_PRESENT( - hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { - SA_ATTR_INFO(sa, hdl->sa_spill_tab, - SA_GET_HDR(hdl, SA_SPILL), - bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); - if (tx && !(buftypes & SA_SPILL) && - bulk[i].sa_size == bulk[i].sa_length) { - dmu_buf_will_dirty(hdl->sa_spill, tx); - buftypes |= SA_SPILL; - } - } - } - if (error && error != ENOENT) { - return ((error == ECKSUM) ? 
EIO : error); - } - - switch (data_op) { - case SA_LOOKUP: - if (bulk[i].sa_addr == NULL) - return (SET_ERROR(ENOENT)); - if (bulk[i].sa_data) { - SA_COPY_DATA(bulk[i].sa_data_func, - bulk[i].sa_addr, bulk[i].sa_data, - bulk[i].sa_size); - } - continue; - - case SA_UPDATE: - /* existing rewrite of attr */ - if (bulk[i].sa_addr && - bulk[i].sa_size == bulk[i].sa_length) { - SA_COPY_DATA(bulk[i].sa_data_func, - bulk[i].sa_data, bulk[i].sa_addr, - bulk[i].sa_length); - continue; - } else if (bulk[i].sa_addr) { /* attr size change */ - error = sa_modify_attrs(hdl, bulk[i].sa_attr, - SA_REPLACE, bulk[i].sa_data_func, - bulk[i].sa_data, bulk[i].sa_length, tx); - } else { /* adding new attribute */ - error = sa_modify_attrs(hdl, bulk[i].sa_attr, - SA_ADD, bulk[i].sa_data_func, - bulk[i].sa_data, bulk[i].sa_length, tx); - } - if (error) - return (error); - break; - } - } - return (error); -} - -static sa_lot_t * -sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, - uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) -{ - sa_os_t *sa = os->os_sa; - sa_lot_t *tb, *findtb; - int i; - avl_index_t loc; - - ASSERT(MUTEX_HELD(&sa->sa_lock)); - tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); - tb->lot_attr_count = attr_count; - tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, - KM_SLEEP); - bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); - tb->lot_num = lot_num; - tb->lot_hash = hash; - tb->lot_instance = 0; - - if (zapadd) { - char attr_name[8]; - - if (sa->sa_layout_attr_obj == 0) { - sa->sa_layout_attr_obj = zap_create_link(os, - DMU_OT_SA_ATTR_LAYOUTS, - sa->sa_master_obj, SA_LAYOUTS, tx); - } - - (void) snprintf(attr_name, sizeof (attr_name), - "%d", (int)lot_num); - VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, - attr_name, 2, attr_count, attrs, tx)); - } - - list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), - offsetof(sa_idx_tab_t, sa_next)); - - for (i = 0; i != attr_count; i++) { - if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) - tb->lot_var_sizes++; - } - - avl_add(&sa->sa_layout_num_tree, tb); - - /* verify we don't have a hash collision */ - if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { - for (; findtb && findtb->lot_hash == hash; - findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { - if (findtb->lot_instance != tb->lot_instance) - break; - tb->lot_instance++; - } - } - avl_add(&sa->sa_layout_hash_tree, tb); - return (tb); -} - -static void -sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, - int count, dmu_tx_t *tx, sa_lot_t **lot) -{ - sa_lot_t *tb, tbsearch; - avl_index_t loc; - sa_os_t *sa = os->os_sa; - boolean_t found = B_FALSE; - - mutex_enter(&sa->sa_lock); - tbsearch.lot_hash = hash; - tbsearch.lot_instance = 0; - tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); - if (tb) { - for (; tb && tb->lot_hash == hash; - tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { - if (sa_layout_equal(tb, attrs, count) == 0) { - found = B_TRUE; - break; - } - } - } - if (!found) { - tb = sa_add_layout_entry(os, attrs, count, - avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); - } - mutex_exit(&sa->sa_lock); - *lot = tb; -} - -static int -sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) -{ - int error; - uint32_t blocksize; - - if (size == 0) { - blocksize = SPA_MINBLOCKSIZE; - } else if (size > SPA_OLD_MAXBLOCKSIZE) { - ASSERT(0); - return (SET_ERROR(EFBIG)); - } else { - blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); - } - 
- error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); - ASSERT(error == 0); - return (error); -} - -static void -sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) -{ - if (func == NULL) { - bcopy(datastart, target, buflen); - } else { - boolean_t start; - int bytes; - void *dataptr; - void *saptr = target; - uint32_t length; - - start = B_TRUE; - bytes = 0; - while (bytes < buflen) { - func(&dataptr, &length, buflen, start, datastart); - bcopy(dataptr, saptr, length); - saptr = (void *)((caddr_t)saptr + length); - bytes += length; - start = B_FALSE; - } - } -} - -/* - * Determine several different sizes - * first the sa header size - * the number of bytes to be stored - * if spill would occur the index in the attribute array is returned - * - * the boolean will_spill will be set when spilling is necessary. It - * is only set when the buftype is SA_BONUS - */ -static int -sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, - int *total, boolean_t *will_spill) -{ - int var_size = 0; - int i; - int hdrsize; - int extra_hdrsize; - - if (buftype == SA_BONUS && sa->sa_force_spill) { - *total = 0; - *index = 0; - *will_spill = B_TRUE; - return (0); - } - - *index = -1; - *total = 0; - *will_spill = B_FALSE; - - extra_hdrsize = 0; - hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : - sizeof (sa_hdr_phys_t); - - ASSERT(IS_P2ALIGNED(full_space, 8)); - - for (i = 0; i != attr_count; i++) { - boolean_t is_var_sz; - - *total = P2ROUNDUP(*total, 8); - *total += attr_desc[i].sa_length; - if (*will_spill) - continue; - - is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); - if (is_var_sz) { - var_size++; - } - - if (is_var_sz && var_size > 1) { - /* - * Don't worry that the spill block might overflow. - * It will be resized if needed in sa_build_layouts(). - */ - if (buftype == SA_SPILL || - P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + - *total < full_space) { - /* - * Account for header space used by array of - * optional sizes of variable-length attributes. - * Record the extra header size in case this - * increase needs to be reversed due to - * spill-over. - */ - hdrsize += sizeof (uint16_t); - if (*index != -1) - extra_hdrsize += sizeof (uint16_t); - } else { - ASSERT(buftype == SA_BONUS); - if (*index == -1) - *index = i; - *will_spill = B_TRUE; - continue; - } - } - - /* - * find index of where spill *could* occur. - * Then continue to count of remainder attribute - * space. The sum is used later for sizing bonus - * and spill buffer. - */ - if (buftype == SA_BONUS && *index == -1 && - (*total + P2ROUNDUP(hdrsize, 8)) > - (full_space - sizeof (blkptr_t))) { - *index = i; - } - - if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && - buftype == SA_BONUS) - *will_spill = B_TRUE; - } - - if (*will_spill) - hdrsize -= extra_hdrsize; - - hdrsize = P2ROUNDUP(hdrsize, 8); - return (hdrsize); -} - -#define BUF_SPACE_NEEDED(total, header) (total + header) - -/* - * Find layout that corresponds to ordering of attributes - * If not found a new layout number is created and added to - * persistent layout tables. 
- */ -static int -sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, - dmu_tx_t *tx) -{ - sa_os_t *sa = hdl->sa_os->os_sa; - uint64_t hash; - sa_buf_type_t buftype; - sa_hdr_phys_t *sahdr; - void *data_start; - int buf_space; - sa_attr_type_t *attrs, *attrs_start; - int i, lot_count; - int dnodesize; - int hdrsize; - int spillhdrsize = 0; - int used; - dmu_object_type_t bonustype; - sa_lot_t *lot; - int len_idx; - int spill_used; - int bonuslen; - boolean_t spilling; - - dmu_buf_will_dirty(hdl->sa_bonus, tx); - bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); - dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); - bonuslen = DN_BONUS_SIZE(dnodesize); - - dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); - bonuslen = DN_BONUS_SIZE(dnodesize); - - /* first determine bonus header size and sum of all attributes */ - hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, - SA_BONUS, bonuslen, &i, &used, &spilling); - - if (used > SPA_OLD_MAXBLOCKSIZE) - return (SET_ERROR(EFBIG)); - - VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? - MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : - used + hdrsize, tx)); - - ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || - bonustype == DMU_OT_SA); - - /* setup and size spill buffer when needed */ - if (spilling) { - boolean_t dummy; - - if (hdl->sa_spill == NULL) { - VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, - &hdl->sa_spill) == 0); - } - dmu_buf_will_dirty(hdl->sa_spill, tx); - - spillhdrsize = sa_find_sizes(sa, &attr_desc[i], - attr_count - i, hdl->sa_spill, SA_SPILL, - hdl->sa_spill->db_size, &i, &spill_used, &dummy); - - if (spill_used > SPA_OLD_MAXBLOCKSIZE) - return (SET_ERROR(EFBIG)); - - buf_space = hdl->sa_spill->db_size - spillhdrsize; - if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > - hdl->sa_spill->db_size) - VERIFY(0 == sa_resize_spill(hdl, - BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); - } - - /* setup starting pointers to lay down data */ - data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); - sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; - buftype = SA_BONUS; - - if (spilling) - buf_space = (sa->sa_force_spill) ? 
- 0 : SA_BLKPTR_SPACE - hdrsize; - else - buf_space = hdl->sa_bonus->db_size - hdrsize; - - attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, - KM_SLEEP); - lot_count = 0; - - for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { - uint16_t length; - - ASSERT(IS_P2ALIGNED(data_start, 8)); - ASSERT(IS_P2ALIGNED(buf_space, 8)); - attrs[i] = attr_desc[i].sa_attr; - length = SA_REGISTERED_LEN(sa, attrs[i]); - if (length == 0) - length = attr_desc[i].sa_length; - else - VERIFY(length == attr_desc[i].sa_length); - - if (buf_space < length) { /* switch to spill buffer */ - VERIFY(spilling); - VERIFY(bonustype == DMU_OT_SA); - if (buftype == SA_BONUS && !sa->sa_force_spill) { - sa_find_layout(hdl->sa_os, hash, attrs_start, - lot_count, tx, &lot); - SA_SET_HDR(sahdr, lot->lot_num, hdrsize); - } - - buftype = SA_SPILL; - hash = -1ULL; - len_idx = 0; - - sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; - sahdr->sa_magic = SA_MAGIC; - data_start = (void *)((uintptr_t)sahdr + - spillhdrsize); - attrs_start = &attrs[i]; - buf_space = hdl->sa_spill->db_size - spillhdrsize; - lot_count = 0; - } - hash ^= SA_ATTR_HASH(attrs[i]); - attr_desc[i].sa_addr = data_start; - attr_desc[i].sa_size = length; - SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, - data_start, length); - if (sa->sa_attr_table[attrs[i]].sa_length == 0) { - sahdr->sa_lengths[len_idx++] = length; - } - VERIFY((uintptr_t)data_start % 8 == 0); - data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + - length), 8); - buf_space -= P2ROUNDUP(length, 8); - lot_count++; - } - - sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); - - /* - * Verify that old znodes always have layout number 0. - * Must be DMU_OT_SA for arbitrary layouts - */ - VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || - (bonustype == DMU_OT_SA && lot->lot_num > 1)); - - if (bonustype == DMU_OT_SA) { - SA_SET_HDR(sahdr, lot->lot_num, - buftype == SA_BONUS ? hdrsize : spillhdrsize); - } - - kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); - if (hdl->sa_bonus_tab) { - sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); - hdl->sa_bonus_tab = NULL; - } - if (!sa->sa_force_spill) - VERIFY(0 == sa_build_index(hdl, SA_BONUS)); - if (hdl->sa_spill) { - sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); - if (!spilling) { - /* - * remove spill block that is no longer needed. 
- */ - dmu_buf_rele(hdl->sa_spill, NULL); - hdl->sa_spill = NULL; - hdl->sa_spill_tab = NULL; - VERIFY(0 == dmu_rm_spill(hdl->sa_os, - sa_handle_object(hdl), tx)); - } else { - VERIFY(0 == sa_build_index(hdl, SA_SPILL)); - } - } - - return (0); -} - -static void -sa_free_attr_table(sa_os_t *sa) -{ - int i; - - if (sa->sa_attr_table == NULL) - return; - - for (i = 0; i != sa->sa_num_attrs; i++) { - if (sa->sa_attr_table[i].sa_name) - kmem_free(sa->sa_attr_table[i].sa_name, - strlen(sa->sa_attr_table[i].sa_name) + 1); - } - - kmem_free(sa->sa_attr_table, - sizeof (sa_attr_table_t) * sa->sa_num_attrs); - - sa->sa_attr_table = NULL; -} - -static int -sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) -{ - sa_os_t *sa = os->os_sa; - uint64_t sa_attr_count = 0; - uint64_t sa_reg_count = 0; - int error = 0; - uint64_t attr_value; - sa_attr_table_t *tb; - zap_cursor_t zc; - zap_attribute_t za; - int registered_count = 0; - int i; - dmu_objset_type_t ostype = dmu_objset_type(os); - - sa->sa_user_table = - kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); - sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); - - if (sa->sa_reg_attr_obj != 0) { - error = zap_count(os, sa->sa_reg_attr_obj, - &sa_attr_count); - - /* - * Make sure we retrieved a count and that it isn't zero - */ - if (error || (error == 0 && sa_attr_count == 0)) { - if (error == 0) - error = SET_ERROR(EINVAL); - goto bail; - } - sa_reg_count = sa_attr_count; - } - - if (ostype == DMU_OST_ZFS && sa_attr_count == 0) - sa_attr_count += sa_legacy_attr_count; - - /* Allocate attribute numbers for attributes that aren't registered */ - for (i = 0; i != count; i++) { - boolean_t found = B_FALSE; - int j; - - if (ostype == DMU_OST_ZFS) { - for (j = 0; j != sa_legacy_attr_count; j++) { - if (strcmp(reg_attrs[i].sa_name, - sa_legacy_attrs[j].sa_name) == 0) { - sa->sa_user_table[i] = - sa_legacy_attrs[j].sa_attr; - found = B_TRUE; - } - } - } - if (found) - continue; - - if (sa->sa_reg_attr_obj) - error = zap_lookup(os, sa->sa_reg_attr_obj, - reg_attrs[i].sa_name, 8, 1, &attr_value); - else - error = SET_ERROR(ENOENT); - switch (error) { - case ENOENT: - sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; - sa_attr_count++; - break; - case 0: - sa->sa_user_table[i] = ATTR_NUM(attr_value); - break; - default: - goto bail; - } - } - - sa->sa_num_attrs = sa_attr_count; - tb = sa->sa_attr_table = - kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); - - /* - * Attribute table is constructed from requested attribute list, - * previously foreign registered attributes, and also the legacy - * ZPL set of attributes. 
- */ - - if (sa->sa_reg_attr_obj) { - for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); - (error = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - uint64_t value; - value = za.za_first_integer; - - registered_count++; - tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); - tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); - tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); - tb[ATTR_NUM(value)].sa_registered = B_TRUE; - - if (tb[ATTR_NUM(value)].sa_name) { - continue; - } - tb[ATTR_NUM(value)].sa_name = - kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); - (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, - strlen(za.za_name) +1); - } - zap_cursor_fini(&zc); - /* - * Make sure we processed the correct number of registered - * attributes - */ - if (registered_count != sa_reg_count) { - ASSERT(error != 0); - goto bail; - } - - } - - if (ostype == DMU_OST_ZFS) { - for (i = 0; i != sa_legacy_attr_count; i++) { - if (tb[i].sa_name) - continue; - tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; - tb[i].sa_length = sa_legacy_attrs[i].sa_length; - tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; - tb[i].sa_registered = B_FALSE; - tb[i].sa_name = - kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, - KM_SLEEP); - (void) strlcpy(tb[i].sa_name, - sa_legacy_attrs[i].sa_name, - strlen(sa_legacy_attrs[i].sa_name) + 1); - } - } - - for (i = 0; i != count; i++) { - sa_attr_type_t attr_id; - - attr_id = sa->sa_user_table[i]; - if (tb[attr_id].sa_name) - continue; - - tb[attr_id].sa_length = reg_attrs[i].sa_length; - tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; - tb[attr_id].sa_attr = attr_id; - tb[attr_id].sa_name = - kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); - (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, - strlen(reg_attrs[i].sa_name) + 1); - } - - sa->sa_need_attr_registration = - (sa_attr_count != registered_count); - - return (0); -bail: - kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); - sa->sa_user_table = NULL; - sa_free_attr_table(sa); - return ((error != 0) ? 
error : EINVAL); -} - -int -sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - sa_attr_type_t **user_table) -{ - zap_cursor_t zc; - zap_attribute_t za; - sa_os_t *sa; - dmu_objset_type_t ostype = dmu_objset_type(os); - sa_attr_type_t *tb; - int error; - - mutex_enter(&os->os_user_ptr_lock); - if (os->os_sa) { - mutex_enter(&os->os_sa->sa_lock); - mutex_exit(&os->os_user_ptr_lock); - tb = os->os_sa->sa_user_table; - mutex_exit(&os->os_sa->sa_lock); - *user_table = tb; - return (0); - } - - sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); - mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); - sa->sa_master_obj = sa_obj; - - os->os_sa = sa; - mutex_enter(&sa->sa_lock); - mutex_exit(&os->os_user_ptr_lock); - avl_create(&sa->sa_layout_num_tree, layout_num_compare, - sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); - avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, - sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); - - if (sa_obj) { - error = zap_lookup(os, sa_obj, SA_LAYOUTS, - 8, 1, &sa->sa_layout_attr_obj); - if (error != 0 && error != ENOENT) - goto fail; - error = zap_lookup(os, sa_obj, SA_REGISTRY, - 8, 1, &sa->sa_reg_attr_obj); - if (error != 0 && error != ENOENT) - goto fail; - } - - if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) - goto fail; - - if (sa->sa_layout_attr_obj != 0) { - uint64_t layout_count; - - error = zap_count(os, sa->sa_layout_attr_obj, - &layout_count); - - /* - * Layout number count should be > 0 - */ - if (error || (error == 0 && layout_count == 0)) { - if (error == 0) - error = SET_ERROR(EINVAL); - goto fail; - } - - for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); - (error = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - sa_attr_type_t *lot_attrs; - uint64_t lot_num; - - lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * - za.za_num_integers, KM_SLEEP); - - if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, - za.za_name, 2, za.za_num_integers, - lot_attrs))) != 0) { - kmem_free(lot_attrs, sizeof (sa_attr_type_t) * - za.za_num_integers); - break; - } - VERIFY(ddi_strtoull(za.za_name, NULL, 10, - (unsigned long long *)&lot_num) == 0); - - (void) sa_add_layout_entry(os, lot_attrs, - za.za_num_integers, lot_num, - sa_layout_info_hash(lot_attrs, - za.za_num_integers), B_FALSE, NULL); - kmem_free(lot_attrs, sizeof (sa_attr_type_t) * - za.za_num_integers); - } - zap_cursor_fini(&zc); - - /* - * Make sure layout count matches number of entries added - * to AVL tree - */ - if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { - ASSERT(error != 0); - goto fail; - } - } - - /* Add special layout number for old ZNODES */ - if (ostype == DMU_OST_ZFS) { - (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, - sa_legacy_attr_count, 0, - sa_layout_info_hash(sa_legacy_zpl_layout, - sa_legacy_attr_count), B_FALSE, NULL); - - (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, - 0, B_FALSE, NULL); - } - *user_table = os->os_sa->sa_user_table; - mutex_exit(&sa->sa_lock); - return (0); -fail: - os->os_sa = NULL; - sa_free_attr_table(sa); - if (sa->sa_user_table) - kmem_free(sa->sa_user_table, sa->sa_user_table_sz); - mutex_exit(&sa->sa_lock); - avl_destroy(&sa->sa_layout_hash_tree); - avl_destroy(&sa->sa_layout_num_tree); - mutex_destroy(&sa->sa_lock); - kmem_free(sa, sizeof (sa_os_t)); - return ((error == ECKSUM) ? 
EIO : error); -} - -void -sa_tear_down(objset_t *os) -{ - sa_os_t *sa = os->os_sa; - sa_lot_t *layout; - void *cookie; - - kmem_free(sa->sa_user_table, sa->sa_user_table_sz); - - /* Free up attr table */ - - sa_free_attr_table(sa); - - cookie = NULL; - while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { - sa_idx_tab_t *tab; - while (tab = list_head(&layout->lot_idx_tab)) { - ASSERT(zfs_refcount_count(&tab->sa_refcount)); - sa_idx_tab_rele(os, tab); - } - } - - cookie = NULL; - while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) { - kmem_free(layout->lot_attrs, - sizeof (sa_attr_type_t) * layout->lot_attr_count); - kmem_free(layout, sizeof (sa_lot_t)); - } - - avl_destroy(&sa->sa_layout_hash_tree); - avl_destroy(&sa->sa_layout_num_tree); - mutex_destroy(&sa->sa_lock); - - kmem_free(sa, sizeof (sa_os_t)); - os->os_sa = NULL; -} - -void -sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, - uint16_t length, int length_idx, boolean_t var_length, void *userp) -{ - sa_idx_tab_t *idx_tab = userp; - - if (var_length) { - ASSERT(idx_tab->sa_variable_lengths); - idx_tab->sa_variable_lengths[length_idx] = length; - } - TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, - (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); -} - -static void -sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, - sa_iterfunc_t func, sa_lot_t *tab, void *userp) -{ - void *data_start; - sa_lot_t *tb = tab; - sa_lot_t search; - avl_index_t loc; - sa_os_t *sa = os->os_sa; - int i; - uint16_t *length_start = NULL; - uint8_t length_idx = 0; - - if (tab == NULL) { - search.lot_num = SA_LAYOUT_NUM(hdr, type); - tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); - ASSERT(tb); - } - - if (IS_SA_BONUSTYPE(type)) { - data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + - offsetof(sa_hdr_phys_t, sa_lengths) + - (sizeof (uint16_t) * tb->lot_var_sizes)), 8); - length_start = hdr->sa_lengths; - } else { - data_start = hdr; - } - - for (i = 0; i != tb->lot_attr_count; i++) { - int attr_length, reg_length; - uint8_t idx_len; - - reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; - if (reg_length) { - attr_length = reg_length; - idx_len = 0; - } else { - attr_length = length_start[length_idx]; - idx_len = length_idx++; - } - - func(hdr, data_start, tb->lot_attrs[i], attr_length, - idx_len, reg_length == 0 ? 
B_TRUE : B_FALSE, userp); - - data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + - attr_length), 8); - } -} - -/*ARGSUSED*/ -void -sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, - uint16_t length, int length_idx, boolean_t variable_length, void *userp) -{ - sa_handle_t *hdl = userp; - sa_os_t *sa = hdl->sa_os->os_sa; - - sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); -} - -void -sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) -{ - sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); - dmu_buf_impl_t *db; - sa_os_t *sa = hdl->sa_os->os_sa; - int num_lengths = 1; - int i; - - ASSERT(MUTEX_HELD(&sa->sa_lock)); - if (sa_hdr_phys->sa_magic == SA_MAGIC) - return; - - db = SA_GET_DB(hdl, buftype); - - if (buftype == SA_SPILL) { - arc_release(db->db_buf, NULL); - arc_buf_thaw(db->db_buf); - } - - sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); - sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); - - /* - * Determine number of variable lenghts in header - * The standard 8 byte header has one for free and a - * 16 byte header would have 4 + 1; - */ - if (SA_HDR_SIZE(sa_hdr_phys) > 8) - num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; - for (i = 0; i != num_lengths; i++) - sa_hdr_phys->sa_lengths[i] = - BSWAP_16(sa_hdr_phys->sa_lengths[i]); - - sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, - sa_byteswap_cb, NULL, hdl); - - if (buftype == SA_SPILL) - arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); -} - -static int -sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) -{ - sa_hdr_phys_t *sa_hdr_phys; - dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); - dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); - sa_os_t *sa = hdl->sa_os->os_sa; - sa_idx_tab_t *idx_tab; - - sa_hdr_phys = SA_GET_HDR(hdl, buftype); - - mutex_enter(&sa->sa_lock); - - /* Do we need to byteswap? 
*/ - - /* only check if not old znode */ - if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && - sa_hdr_phys->sa_magic != 0) { - VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); - sa_byteswap(hdl, buftype); - } - - idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); - - if (buftype == SA_BONUS) - hdl->sa_bonus_tab = idx_tab; - else - hdl->sa_spill_tab = idx_tab; - - mutex_exit(&sa->sa_lock); - return (0); -} - -/*ARGSUSED*/ -static void -sa_evict_sync(void *dbu) -{ - panic("evicting sa dbuf\n"); -} - -static void -sa_idx_tab_rele(objset_t *os, void *arg) -{ - sa_os_t *sa = os->os_sa; - sa_idx_tab_t *idx_tab = arg; - - if (idx_tab == NULL) - return; - - mutex_enter(&sa->sa_lock); - if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { - list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); - if (idx_tab->sa_variable_lengths) - kmem_free(idx_tab->sa_variable_lengths, - sizeof (uint16_t) * - idx_tab->sa_layout->lot_var_sizes); - zfs_refcount_destroy(&idx_tab->sa_refcount); - kmem_free(idx_tab->sa_idx_tab, - sizeof (uint32_t) * sa->sa_num_attrs); - kmem_free(idx_tab, sizeof (sa_idx_tab_t)); - } - mutex_exit(&sa->sa_lock); -} - -static void -sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) -{ - sa_os_t *sa = os->os_sa; - - ASSERT(MUTEX_HELD(&sa->sa_lock)); - (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL); -} - -void -sa_handle_destroy(sa_handle_t *hdl) -{ - dmu_buf_t *db = hdl->sa_bonus; - - mutex_enter(&hdl->sa_lock); - (void) dmu_buf_remove_user(db, &hdl->sa_dbu); - - if (hdl->sa_bonus_tab) - sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); - - if (hdl->sa_spill_tab) - sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); - - dmu_buf_rele(hdl->sa_bonus, NULL); - - if (hdl->sa_spill) - dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); - mutex_exit(&hdl->sa_lock); - - kmem_cache_free(sa_cache, hdl); -} - -int -sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, - sa_handle_type_t hdl_type, sa_handle_t **handlepp) -{ - int error = 0; - dmu_object_info_t doi; - sa_handle_t *handle = NULL; - -#ifdef ZFS_DEBUG - dmu_object_info_from_db(db, &doi); - ASSERT(doi.doi_bonus_type == DMU_OT_SA || - doi.doi_bonus_type == DMU_OT_ZNODE); -#endif - /* find handle, if it exists */ - /* if one doesn't exist then create a new one, and initialize it */ - - if (hdl_type == SA_HDL_SHARED) - handle = dmu_buf_get_user(db); - - if (handle == NULL) { - sa_handle_t *winner = NULL; - - handle = kmem_cache_alloc(sa_cache, KM_SLEEP); - handle->sa_dbu.dbu_evict_func_sync = NULL; - handle->sa_dbu.dbu_evict_func_async = NULL; - handle->sa_userp = userp; - handle->sa_bonus = db; - handle->sa_os = os; - handle->sa_spill = NULL; - handle->sa_bonus_tab = NULL; - handle->sa_spill_tab = NULL; - - error = sa_build_index(handle, SA_BONUS); - - if (hdl_type == SA_HDL_SHARED) { - dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, - NULL); - winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); - } - - if (winner != NULL) { - kmem_cache_free(sa_cache, handle); - handle = winner; - } - } - *handlepp = handle; - - return (error); -} - -int -sa_handle_get(objset_t *objset, uint64_t objid, void *userp, - sa_handle_type_t hdl_type, sa_handle_t **handlepp) -{ - dmu_buf_t *db; - int error; - - if (error = dmu_bonus_hold(objset, objid, NULL, &db)) - return (error); - - return (sa_handle_get_from_db(objset, db, userp, hdl_type, - handlepp)); -} - -int -sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) -{ - return (dmu_bonus_hold(objset, obj_num, tag, db)); -} - 
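The handle routines above (sa_handle_get_from_db(), sa_handle_get(), sa_buf_hold()) are the consumer-facing entry points: a caller such as the ZPL takes a handle on an object's bonus buffer and then reads or writes attributes through it. A minimal sketch of that pattern, assuming size_attr is a hypothetical attribute number previously obtained from the user table that sa_setup() fills in, could look like:

/*
 * Illustrative sketch: read one fixed-size SA attribute of an object.
 * size_attr is assumed to come from the sa_setup() user table.
 */
static int
read_object_size(objset_t *os, uint64_t obj, sa_attr_type_t size_attr,
    uint64_t *sizep)
{
	sa_handle_t *hdl;
	int error;

	/* Take a private handle on the object's bonus buffer. */
	error = sa_handle_get(os, obj, NULL, SA_HDL_PRIVATE, &hdl);
	if (error != 0)
		return (error);

	/* Fetch one fixed-size attribute into the caller's buffer. */
	error = sa_lookup(hdl, size_attr, sizep, sizeof (*sizep));

	sa_handle_destroy(hdl);
	return (error);
}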
-void -sa_buf_rele(dmu_buf_t *db, void *tag) -{ - dmu_buf_rele(db, tag); -} - -int -sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) -{ - ASSERT(hdl); - ASSERT(MUTEX_HELD(&hdl->sa_lock)); - return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); -} - -int -sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) -{ - int error; - sa_bulk_attr_t bulk; - - bulk.sa_attr = attr; - bulk.sa_data = buf; - bulk.sa_length = buflen; - bulk.sa_data_func = NULL; - - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); - error = sa_lookup_impl(hdl, &bulk, 1); - mutex_exit(&hdl->sa_lock); - return (error); -} - -#ifdef _KERNEL -int -sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) -{ - int error; - sa_bulk_attr_t bulk; - - bulk.sa_data = NULL; - bulk.sa_attr = attr; - bulk.sa_data_func = NULL; - - ASSERT(hdl); - - mutex_enter(&hdl->sa_lock); - if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { - error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, - uio->uio_resid), UIO_READ, uio); - } - mutex_exit(&hdl->sa_lock); - return (error); - -} -#endif - -static sa_idx_tab_t * -sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr) -{ - sa_idx_tab_t *idx_tab; - sa_os_t *sa = os->os_sa; - sa_lot_t *tb, search; - avl_index_t loc; - - /* - * Deterimine layout number. If SA node and header == 0 then - * force the index table to the dummy "1" empty layout. - * - * The layout number would only be zero for a newly created file - * that has not added any attributes yet, or with crypto enabled which - * doesn't write any attributes to the bonus buffer. - */ - - search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); - - tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); - - /* Verify header size is consistent with layout information */ - ASSERT(tb); - ASSERT(IS_SA_BONUSTYPE(bonustype) && - SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) || - (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); - - /* - * See if any of the already existing TOC entries can be reused? 
- */ - - for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; - idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { - boolean_t valid_idx = B_TRUE; - int i; - - if (tb->lot_var_sizes != 0 && - idx_tab->sa_variable_lengths != NULL) { - for (i = 0; i != tb->lot_var_sizes; i++) { - if (hdr->sa_lengths[i] != - idx_tab->sa_variable_lengths[i]) { - valid_idx = B_FALSE; - break; - } - } - } - if (valid_idx) { - sa_idx_tab_hold(os, idx_tab); - return (idx_tab); - } - } - - /* No such luck, create a new entry */ - idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); - idx_tab->sa_idx_tab = - kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); - idx_tab->sa_layout = tb; - zfs_refcount_create(&idx_tab->sa_refcount); - if (tb->lot_var_sizes) - idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * - tb->lot_var_sizes, KM_SLEEP); - - sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, - tb, idx_tab); - sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ - sa_idx_tab_hold(os, idx_tab); /* one for layout */ - list_insert_tail(&tb->lot_idx_tab, idx_tab); - return (idx_tab); -} - -void -sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, - boolean_t start, void *userdata) -{ - ASSERT(start); - - *dataptr = userdata; - *len = total_len; -} - -static void -sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) -{ - uint64_t attr_value = 0; - sa_os_t *sa = hdl->sa_os->os_sa; - sa_attr_table_t *tb = sa->sa_attr_table; - int i; - - mutex_enter(&sa->sa_lock); - - if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { - mutex_exit(&sa->sa_lock); - return; - } - - if (sa->sa_reg_attr_obj == 0) { - sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, - DMU_OT_SA_ATTR_REGISTRATION, - sa->sa_master_obj, SA_REGISTRY, tx); - } - for (i = 0; i != sa->sa_num_attrs; i++) { - if (sa->sa_attr_table[i].sa_registered) - continue; - ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, - tb[i].sa_byteswap); - VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, - tb[i].sa_name, 8, 1, &attr_value, tx)); - tb[i].sa_registered = B_TRUE; - } - sa->sa_need_attr_registration = B_FALSE; - mutex_exit(&sa->sa_lock); -} - -/* - * Replace all attributes with attributes specified in template. - * If dnode had a spill buffer then those attributes will be - * also be replaced, possibly with just an empty spill block - * - * This interface is intended to only be used for bulk adding of - * attributes for a new file. It will also be used by the ZPL - * when converting and old formatted znode to native SA support. - */ -int -sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, - int attr_count, dmu_tx_t *tx) -{ - sa_os_t *sa = hdl->sa_os->os_sa; - - if (sa->sa_need_attr_registration) - sa_attr_register_sync(hdl, tx); - return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); -} - -int -sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, - int attr_count, dmu_tx_t *tx) -{ - int error; - - mutex_enter(&hdl->sa_lock); - error = sa_replace_all_by_template_locked(hdl, attr_desc, - attr_count, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -/* - * Add/remove a single attribute or replace a variable-sized attribute value - * with a value of a different size, and then rewrite the entire set - * of attributes. - * Same-length attribute value replacement (including fixed-length attributes) - * is handled more efficiently by the upper layers. 
- */ -static int -sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - sa_data_op_t action, sa_data_locator_t *locator, void *datastart, - uint16_t buflen, dmu_tx_t *tx) -{ - sa_os_t *sa = hdl->sa_os->os_sa; - dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; - dnode_t *dn; - sa_bulk_attr_t *attr_desc; - void *old_data[2]; - int bonus_attr_count = 0; - int bonus_data_size = 0; - int spill_data_size = 0; - int spill_attr_count = 0; - int error; - uint16_t length, reg_length; - int i, j, k, length_idx; - sa_hdr_phys_t *hdr; - sa_idx_tab_t *idx_tab; - int attr_count; - int count; - - ASSERT(MUTEX_HELD(&hdl->sa_lock)); - - /* First make of copy of the old data */ - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn->dn_bonuslen != 0) { - bonus_data_size = hdl->sa_bonus->db_size; - old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); - bcopy(hdl->sa_bonus->db_data, old_data[0], - hdl->sa_bonus->db_size); - bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; - } else { - old_data[0] = NULL; - } - DB_DNODE_EXIT(db); - - /* Bring spill buffer online if it isn't currently */ - - if ((error = sa_get_spill(hdl)) == 0) { - spill_data_size = hdl->sa_spill->db_size; - old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); - bcopy(hdl->sa_spill->db_data, old_data[1], - hdl->sa_spill->db_size); - spill_attr_count = - hdl->sa_spill_tab->sa_layout->lot_attr_count; - } else if (error && error != ENOENT) { - if (old_data[0]) - kmem_free(old_data[0], bonus_data_size); - return (error); - } else { - old_data[1] = NULL; - } - - /* build descriptor of all attributes */ - - attr_count = bonus_attr_count + spill_attr_count; - if (action == SA_ADD) - attr_count++; - else if (action == SA_REMOVE) - attr_count--; - - attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); - - /* - * loop through bonus and spill buffer if it exists, and - * build up new attr_descriptor to reset the attributes - */ - k = j = 0; - count = bonus_attr_count; - hdr = SA_GET_HDR(hdl, SA_BONUS); - idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); - for (; k != 2; k++) { - /* - * Iterate over each attribute in layout. Fetch the - * size of variable-length attributes needing rewrite - * from sa_lengths[]. - */ - for (i = 0, length_idx = 0; i != count; i++) { - sa_attr_type_t attr; - - attr = idx_tab->sa_layout->lot_attrs[i]; - reg_length = SA_REGISTERED_LEN(sa, attr); - if (reg_length == 0) { - length = hdr->sa_lengths[length_idx]; - length_idx++; - } else { - length = reg_length; - } - if (attr == newattr) { - /* - * There is nothing to do for SA_REMOVE, - * so it is just skipped. - */ - if (action == SA_REMOVE) - continue; - - /* - * Duplicate attributes are not allowed, so the - * action can not be SA_ADD here. - */ - ASSERT3S(action, ==, SA_REPLACE); - - /* - * Only a variable-sized attribute can be - * replaced here, and its size must be changing. 
- */ - ASSERT3U(reg_length, ==, 0); - ASSERT3U(length, !=, buflen); - SA_ADD_BULK_ATTR(attr_desc, j, attr, - locator, datastart, buflen); - } else { - SA_ADD_BULK_ATTR(attr_desc, j, attr, - NULL, (void *) - (TOC_OFF(idx_tab->sa_idx_tab[attr]) + - (uintptr_t)old_data[k]), length); - } - } - if (k == 0 && hdl->sa_spill) { - hdr = SA_GET_HDR(hdl, SA_SPILL); - idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); - count = spill_attr_count; - } else { - break; - } - } - if (action == SA_ADD) { - reg_length = SA_REGISTERED_LEN(sa, newattr); - IMPLY(reg_length != 0, reg_length == buflen); - SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, - datastart, buflen); - } - ASSERT3U(j, ==, attr_count); - - error = sa_build_layouts(hdl, attr_desc, attr_count, tx); - - if (old_data[0]) - kmem_free(old_data[0], bonus_data_size); - if (old_data[1]) - kmem_free(old_data[1], spill_data_size); - kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); - - return (error); -} - -static int -sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, - dmu_tx_t *tx) -{ - int error; - sa_os_t *sa = hdl->sa_os->os_sa; - dmu_object_type_t bonustype; - - bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); - - ASSERT(hdl); - ASSERT(MUTEX_HELD(&hdl->sa_lock)); - - /* sync out registration table if necessary */ - if (sa->sa_need_attr_registration) - sa_attr_register_sync(hdl, tx); - - error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); - if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) - sa->sa_update_cb(hdl, tx); - - return (error); -} - -/* - * update or add new attribute - */ -int -sa_update(sa_handle_t *hdl, sa_attr_type_t type, - void *buf, uint32_t buflen, dmu_tx_t *tx) -{ - int error; - sa_bulk_attr_t bulk; - - bulk.sa_attr = type; - bulk.sa_data_func = NULL; - bulk.sa_length = buflen; - bulk.sa_data = buf; - - mutex_enter(&hdl->sa_lock); - error = sa_bulk_update_impl(hdl, &bulk, 1, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -int -sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, - uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) -{ - int error; - sa_bulk_attr_t bulk; - - bulk.sa_attr = attr; - bulk.sa_data = userdata; - bulk.sa_data_func = locator; - bulk.sa_length = buflen; - - mutex_enter(&hdl->sa_lock); - error = sa_bulk_update_impl(hdl, &bulk, 1, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -/* - * Return size of an attribute - */ - -int -sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) -{ - sa_bulk_attr_t bulk; - int error; - - bulk.sa_data = NULL; - bulk.sa_attr = attr; - bulk.sa_data_func = NULL; - - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); - if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { - mutex_exit(&hdl->sa_lock); - return (error); - } - *size = bulk.sa_size; - - mutex_exit(&hdl->sa_lock); - return (0); -} - -int -sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) -{ - ASSERT(hdl); - ASSERT(MUTEX_HELD(&hdl->sa_lock)); - return (sa_lookup_impl(hdl, attrs, count)); -} - -int -sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) -{ - int error; - - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); - error = sa_bulk_lookup_locked(hdl, attrs, count); - mutex_exit(&hdl->sa_lock); - return (error); -} - -int -sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) -{ - int error; - - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); - error = sa_bulk_update_impl(hdl, attrs, count, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -int 
-sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) -{ - int error; - - mutex_enter(&hdl->sa_lock); - error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, - NULL, 0, tx); - mutex_exit(&hdl->sa_lock); - return (error); -} - -void -sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) -{ - dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); -} - -void -sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) -{ - dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, - blksize, nblocks); -} - -void -sa_set_userp(sa_handle_t *hdl, void *ptr) -{ - hdl->sa_userp = ptr; -} - -dmu_buf_t * -sa_get_db(sa_handle_t *hdl) -{ - return ((dmu_buf_t *)hdl->sa_bonus); -} - -void * -sa_get_userdata(sa_handle_t *hdl) -{ - return (hdl->sa_userp); -} - -void -sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) -{ - ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); - os->os_sa->sa_update_cb = func; -} - -void -sa_register_update_callback(objset_t *os, sa_update_cb_t *func) -{ - - mutex_enter(&os->os_sa->sa_lock); - sa_register_update_callback_locked(os, func); - mutex_exit(&os->os_sa->sa_lock); -} - -uint64_t -sa_handle_object(sa_handle_t *hdl) -{ - return (hdl->sa_bonus->db_object); -} - -boolean_t -sa_enabled(objset_t *os) -{ - return (os->os_sa == NULL); -} - -int -sa_set_sa_object(objset_t *os, uint64_t sa_object) -{ - sa_os_t *sa = os->os_sa; - - if (sa->sa_master_obj) - return (1); - - sa->sa_master_obj = sa_object; - - return (0); -} - -int -sa_hdrsize(void *arg) -{ - sa_hdr_phys_t *hdr = arg; - - return (SA_HDR_SIZE(hdr)); -} - -void -sa_handle_lock(sa_handle_t *hdl) -{ - ASSERT(hdl); - mutex_enter(&hdl->sa_lock); -} - -void -sa_handle_unlock(sa_handle_t *hdl) -{ - ASSERT(hdl); - mutex_exit(&hdl->sa_lock); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. 
- */ -#include -#include -#ifdef _KERNEL -#include -#include -#else -#include -#include -#endif -#include - -static int -sha256_incremental(void *buf, size_t size, void *arg) -{ - SHA256_CTX *ctx = arg; - SHA256_Update(ctx, buf, size); - return (0); -} - -static int -sha512_incremental(void *buf, size_t size, void *arg) -{ - SHA512_CTX *ctx = arg; - SHA512_256_Update(ctx, buf, size); - return (0); -} - -/*ARGSUSED*/ -void -abd_checksum_SHA256(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - SHA256_CTX ctx; - zio_cksum_t tmp; - - SHA256_Init(&ctx); - (void) abd_iterate_func(abd, 0, size, sha256_incremental, &ctx); - SHA256_Final((unsigned char *)&tmp, &ctx); - - /* - * A prior implementation of this function had a - * private SHA256 implementation always wrote things out in - * Big Endian and there wasn't a byteswap variant of it. - * To preserve on disk compatibility we need to force that - * behavior. - */ - zcp->zc_word[0] = BE_64(tmp.zc_word[0]); - zcp->zc_word[1] = BE_64(tmp.zc_word[1]); - zcp->zc_word[2] = BE_64(tmp.zc_word[2]); - zcp->zc_word[3] = BE_64(tmp.zc_word[3]); -} - -/*ARGSUSED*/ -void -abd_checksum_SHA512_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - SHA512_CTX ctx; - - SHA512_256_Init(&ctx); - (void) abd_iterate_func(abd, 0, size, sha512_incremental, &ctx); - SHA512_256_Final((unsigned char *)zcp, &ctx); -} - -/*ARGSUSED*/ -void -abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - zio_cksum_t tmp; - - abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); - zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); - zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); - zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); - zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ -#include -#include -#ifdef _KERNEL -#include -#else -#include -#endif -#include - -static int -skein_incremental(void *buf, size_t size, void *arg) -{ - Skein_512_Ctxt_t *ctx = arg; - (void) Skein_512_Update(ctx, buf, size); - return (0); -} - -/* - * Computes a native 256-bit skein MAC checksum. Please note that this - * function requires the presence of a ctx_template that should be allocated - * using abd_checksum_skein_tmpl_init. 
- */ -/*ARGSUSED*/ -void -abd_checksum_skein_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - Skein_512_Ctxt_t ctx; - - ASSERT(ctx_template != NULL); - bcopy(ctx_template, &ctx, sizeof (ctx)); - (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx); - (void) Skein_512_Final(&ctx, (uint8_t *)zcp); - bzero(&ctx, sizeof (ctx)); -} - -/* - * Byteswapped version of abd_checksum_skein_native. This just invokes - * the native checksum function and byteswaps the resulting checksum (since - * skein is internally endian-insensitive). - */ -void -abd_checksum_skein_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - zio_cksum_t tmp; - - abd_checksum_skein_native(abd, size, ctx_template, &tmp); - zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); - zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); - zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); - zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); -} - -/* - * Allocates a skein MAC template suitable for using in skein MAC checksum - * computations and returns a pointer to it. - */ -void * -abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) -{ - Skein_512_Ctxt_t *ctx; - - ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0, - salt->zcs_bytes, sizeof (salt->zcs_bytes)); - return (ctx); -} - -/* - * Frees a skein context template previously allocated using - * abd_checksum_skein_tmpl_init. - */ -void -abd_checksum_skein_tmpl_free(void *ctx_template) -{ - Skein_512_Ctxt_t *ctx = ctx_template; - - bzero(ctx, sizeof (*ctx)); - kmem_free(ctx, sizeof (*ctx)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ /dev/null @@ -1,8969 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 Martin Matuska . All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Toomas Soome - * Copyright 2018 Joyent, Inc. - * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2017 Datto Inc. - * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. - * Copyright (c) 2016 Actifio, Inc. All rights reserved. - */ - -/* - * SPA: Storage Pool Allocator - * - * This file contains all the routines used when modifying on-disk SPA state. 
- * This includes opening, importing, destroying, exporting a pool, and syncing a - * pool. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#include -#include -#endif /* _KERNEL */ - -#include "zfs_prop.h" -#include "zfs_comutil.h" - -/* Check hostid on import? */ -static int check_hostid = 1; - -/* - * The interval, in seconds, at which failed configuration cache file writes - * should be retried. - */ -int zfs_ccw_retry_interval = 300; - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, - "Check hostid on import?"); -TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval); -SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW, - &zfs_ccw_retry_interval, 0, - "Configuration cache file write, retry after failure, interval (seconds)"); - -typedef enum zti_modes { - ZTI_MODE_FIXED, /* value is # of threads (min 1) */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ - ZTI_MODE_NULL, /* don't create a taskq */ - ZTI_NMODES -} zti_modes_t; - -#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } -#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } -#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } - -#define ZTI_N(n) ZTI_P(n, 1) -#define ZTI_ONE ZTI_N(1) - -typedef struct zio_taskq_info { - zti_modes_t zti_mode; - uint_t zti_value; - uint_t zti_count; -} zio_taskq_info_t; - -static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { - "issue", "issue_high", "intr", "intr_high" -}; - -/* - * This table defines the taskq settings for each ZFS I/O type. When - * initializing a pool, we use this table to create an appropriately sized - * taskq. Some operations are low volume and therefore have a small, static - * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE - * macros. Other operations process a large amount of data; the ZTI_BATCH - * macro causes us to create a taskq oriented for throughput. Some operations - * are so high frequency and short-lived that the taskq itself can become a a - * point of lock contention. The ZTI_P(#, #) macro indicates that we need an - * additional degree of parallelism specified by the number of threads per- - * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. - * - * The different taskq priorities are to handle the different contexts (issue - * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that - * need to be handled with minimum delay. 
- */ -const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { - /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ - { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ - { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ -}; - -static void spa_sync_version(void *arg, dmu_tx_t *tx); -static void spa_sync_props(void *arg, dmu_tx_t *tx); -static boolean_t spa_has_active_shared_spare(spa_t *spa); -static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); -static void spa_vdev_resilver_done(spa_t *spa); - -uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ -#ifdef PSRSET_BIND -id_t zio_taskq_psrset_bind = PS_NONE; -#endif -#ifdef SYSDC -boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ -uint_t zio_taskq_basedc = 80; /* base duty cycle */ -#endif - -#ifdef _KERNEL -#define SPA_PROCESS -#endif -boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ - -extern int zfs_sync_pass_deferred_free; - -/* - * Report any spa_load_verify errors found, but do not fail spa_load. - * This is used by zdb to analyze non-idle pools. - */ -boolean_t spa_load_verify_dryrun = B_FALSE; - -/* - * This (illegal) pool name is used when temporarily importing a spa_t in order - * to get the vdev stats associated with the imported devices. - */ -#define TRYIMPORT_NAME "$import" - -/* - * For debugging purposes: print out vdev tree during pool import. - */ -int spa_load_print_vdev_tree = B_FALSE; - -/* - * A non-zero value for zfs_max_missing_tvds means that we allow importing - * pools with missing top-level vdevs. This is strictly intended for advanced - * pool recovery cases since missing data is almost inevitable. Pools with - * missing devices can only be imported read-only for safety reasons, and their - * fail-mode will be automatically set to "continue". - * - * With 1 missing vdev we should be able to import the pool and mount all - * datasets. User data that was not modified after the missing device has been - * added should be recoverable. This means that snapshots created prior to the - * addition of that device should be completely intact. - * - * With 2 missing vdevs, some datasets may fail to mount since there are - * dataset statistics that are stored as regular metadata. Some data might be - * recoverable if those vdevs were added recently. - * - * With 3 or more missing vdevs, the pool is severely damaged and MOS entries - * may be missing entirely. Chances of data recovery are very low. Note that - * there are also risks of performing an inadvertent rewind as we might be - * missing all the vdevs with the latest uberblocks. - */ -uint64_t zfs_max_missing_tvds = 0; - -/* - * The parameters below are similar to zfs_max_missing_tvds but are only - * intended for a preliminary open of the pool with an untrusted config which - * might be incomplete or out-dated. - * - * We are more tolerant for pools opened from a cachefile since we could have - * an out-dated cachefile where a device removal was not registered. - * We could have set the limit arbitrarily high but in the case where devices - * are really missing we would want to return the proper error codes; we chose - * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available - * and we get a chance to retrieve the trusted config. 
- */ -uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; - -/* - * In the case where config was assembled by scanning device paths (/dev/dsks - * by default) we are less tolerant since all the existing devices should have - * been detected and we want spa_load to return the right error codes. - */ -uint64_t zfs_max_missing_tvds_scan = 0; - - -SYSCTL_DECL(_vfs_zfs_zio); -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_batch_pct, CTLFLAG_RDTUN, - &zio_taskq_batch_pct, 0, - "Percentage of CPUs to run an IO worker thread"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_print_vdev_tree, CTLFLAG_RWTUN, - &spa_load_print_vdev_tree, 0, - "print out vdev tree during pool import"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds, CTLFLAG_RWTUN, - &zfs_max_missing_tvds, 0, - "allow importing pools with missing top-level vdevs"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, - &zfs_max_missing_tvds_cachefile, 0, - "allow importing pools with missing top-level vdevs in cache file"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, - &zfs_max_missing_tvds_scan, 0, - "allow importing pools with missing top-level vdevs during scan"); - -/* - * Debugging aid that pauses spa_sync() towards the end. - */ -boolean_t zfs_pause_spa_sync = B_FALSE; - -/* - * ========================================================================== - * SPA properties routines - * ========================================================================== - */ - -/* - * Add a (source=src, propname=propval) list to an nvlist. - */ -static void -spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, - uint64_t intval, zprop_source_t src) -{ - const char *propname = zpool_prop_to_name(prop); - nvlist_t *propval; - - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); - - if (strval != NULL) - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); - else - VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); - - VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); - nvlist_free(propval); -} - -/* - * Get property values from the spa configuration. 
- */ -static void -spa_prop_get_config(spa_t *spa, nvlist_t **nvp) -{ - vdev_t *rvd = spa->spa_root_vdev; - dsl_pool_t *pool = spa->spa_dsl_pool; - uint64_t size, alloc, cap, version; - zprop_source_t src = ZPROP_SRC_NONE; - spa_config_dirent_t *dp; - metaslab_class_t *mc = spa_normal_class(spa); - - ASSERT(MUTEX_HELD(&spa->spa_props_lock)); - - if (rvd != NULL) { - alloc = metaslab_class_get_alloc(mc); - alloc += metaslab_class_get_alloc(spa_special_class(spa)); - alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); - - size = metaslab_class_get_space(mc); - size += metaslab_class_get_space(spa_special_class(spa)); - size += metaslab_class_get_space(spa_dedup_class(spa)); - - spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, - size - alloc, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, - spa->spa_checkpoint_info.sci_dspace, src); - - spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, - metaslab_class_fragmentation(mc), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, - metaslab_class_expandable_space(mc), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, - (spa_mode(spa) == FREAD), src); - - cap = (size == 0) ? 0 : (alloc * 100 / size); - spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); - - spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, - ddt_get_pool_dedup_ratio(spa), src); - - spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, - rvd->vdev_state, src); - - version = spa_version(spa); - if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) - src = ZPROP_SRC_DEFAULT; - else - src = ZPROP_SRC_LOCAL; - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); - } - - if (pool != NULL) { - /* - * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, - * when opening pools before this version freedir will be NULL. 
- */ - if (pool->dp_free_dir != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, - dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, - src); - } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, - NULL, 0, src); - } - - if (pool->dp_leak_dir != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, - dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, - src); - } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, - NULL, 0, src); - } - } - - spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); - - if (spa->spa_comment != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, - 0, ZPROP_SRC_LOCAL); - } - - if (spa->spa_root != NULL) - spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, - 0, ZPROP_SRC_LOCAL); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, - MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); - } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, - SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); - } - - if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, - DNODE_MAX_SIZE, ZPROP_SRC_NONE); - } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, - DNODE_MIN_SIZE, ZPROP_SRC_NONE); - } - - if ((dp = list_head(&spa->spa_config_list)) != NULL) { - if (dp->scd_path == NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, - "none", 0, ZPROP_SRC_LOCAL); - } else if (strcmp(dp->scd_path, spa_config_path) != 0) { - spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, - dp->scd_path, 0, ZPROP_SRC_LOCAL); - } - } -} - -/* - * Get zpool property values. - */ -int -spa_prop_get(spa_t *spa, nvlist_t **nvp) -{ - objset_t *mos = spa->spa_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; - int err; - - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - mutex_enter(&spa->spa_props_lock); - - /* - * Get properties from the spa config. - */ - spa_prop_get_config(spa, nvp); - - /* If no pool property object, no more prop to get. */ - if (mos == NULL || spa->spa_pool_props_object == 0) { - mutex_exit(&spa->spa_props_lock); - return (0); - } - - /* - * Get properties from the MOS pool property object. 
- */ - for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - uint64_t intval = 0; - char *strval = NULL; - zprop_source_t src = ZPROP_SRC_DEFAULT; - zpool_prop_t prop; - - if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) - continue; - - switch (za.za_integer_length) { - case 8: - /* integer property */ - if (za.za_first_integer != - zpool_prop_default_numeric(prop)) - src = ZPROP_SRC_LOCAL; - - if (prop == ZPOOL_PROP_BOOTFS) { - dsl_pool_t *dp; - dsl_dataset_t *ds = NULL; - - dp = spa_get_dsl(spa); - dsl_pool_config_enter(dp, FTAG); - err = dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &ds); - if (err != 0) { - dsl_pool_config_exit(dp, FTAG); - break; - } - - strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, - KM_SLEEP); - dsl_dataset_name(ds, strval); - dsl_dataset_rele(ds, FTAG); - dsl_pool_config_exit(dp, FTAG); - } else { - strval = NULL; - intval = za.za_first_integer; - } - - spa_prop_add_list(*nvp, prop, strval, intval, src); - - if (strval != NULL) - kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); - - break; - - case 1: - /* string property */ - strval = kmem_alloc(za.za_num_integers, KM_SLEEP); - err = zap_lookup(mos, spa->spa_pool_props_object, - za.za_name, 1, za.za_num_integers, strval); - if (err) { - kmem_free(strval, za.za_num_integers); - break; - } - spa_prop_add_list(*nvp, prop, strval, 0, src); - kmem_free(strval, za.za_num_integers); - break; - - default: - break; - } - } - zap_cursor_fini(&zc); - mutex_exit(&spa->spa_props_lock); -out: - if (err && err != ENOENT) { - nvlist_free(*nvp); - *nvp = NULL; - return (err); - } - - return (0); -} - -/* - * Validate the given pool properties nvlist and modify the list - * for the property values to be set. - */ -static int -spa_prop_validate(spa_t *spa, nvlist_t *props) -{ - nvpair_t *elem; - int error = 0, reset_bootfs = 0; - uint64_t objnum = 0; - boolean_t has_feature = B_FALSE; - - elem = NULL; - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - uint64_t intval; - char *strval, *slash, *check, *fname; - const char *propname = nvpair_name(elem); - zpool_prop_t prop = zpool_name_to_prop(propname); - - switch (prop) { - case ZPOOL_PROP_INVAL: - if (!zpool_prop_feature(propname)) { - error = SET_ERROR(EINVAL); - break; - } - - /* - * Sanitize the input. 
- */ - if (nvpair_type(elem) != DATA_TYPE_UINT64) { - error = SET_ERROR(EINVAL); - break; - } - - if (nvpair_value_uint64(elem, &intval) != 0) { - error = SET_ERROR(EINVAL); - break; - } - - if (intval != 0) { - error = SET_ERROR(EINVAL); - break; - } - - fname = strchr(propname, '@') + 1; - if (zfeature_lookup_name(fname, NULL) != 0) { - error = SET_ERROR(EINVAL); - break; - } - - has_feature = B_TRUE; - break; - - case ZPOOL_PROP_VERSION: - error = nvpair_value_uint64(elem, &intval); - if (!error && - (intval < spa_version(spa) || - intval > SPA_VERSION_BEFORE_FEATURES || - has_feature)) - error = SET_ERROR(EINVAL); - break; - - case ZPOOL_PROP_DELEGATION: - case ZPOOL_PROP_AUTOREPLACE: - case ZPOOL_PROP_LISTSNAPS: - case ZPOOL_PROP_AUTOEXPAND: - error = nvpair_value_uint64(elem, &intval); - if (!error && intval > 1) - error = SET_ERROR(EINVAL); - break; - - case ZPOOL_PROP_MULTIHOST: - error = nvpair_value_uint64(elem, &intval); - if (!error && intval > 1) - error = SET_ERROR(EINVAL); - - if (!error && !spa_get_hostid()) - error = SET_ERROR(ENOTSUP); - - break; - - case ZPOOL_PROP_BOOTFS: - /* - * If the pool version is less than SPA_VERSION_BOOTFS, - * or the pool is still being created (version == 0), - * the bootfs property cannot be set. - */ - if (spa_version(spa) < SPA_VERSION_BOOTFS) { - error = SET_ERROR(ENOTSUP); - break; - } - - /* - * Make sure the vdev config is bootable - */ - if (!vdev_is_bootable(spa->spa_root_vdev)) { - error = SET_ERROR(ENOTSUP); - break; - } - - reset_bootfs = 1; - - error = nvpair_value_string(elem, &strval); - - if (!error) { - objset_t *os; - uint64_t propval; - - if (strval == NULL || strval[0] == '\0') { - objnum = zpool_prop_default_numeric( - ZPOOL_PROP_BOOTFS); - break; - } - - error = dmu_objset_hold(strval, FTAG, &os); - if (error != 0) - break; - - /* - * Must be ZPL, and its property settings - * must be supported. - */ - - if (dmu_objset_type(os) != DMU_OST_ZFS) { - error = SET_ERROR(ENOTSUP); - } else if ((error = - dsl_prop_get_int_ds(dmu_objset_ds(os), - zfs_prop_to_name(ZFS_PROP_COMPRESSION), - &propval)) == 0 && - !BOOTFS_COMPRESS_VALID(propval)) { - error = SET_ERROR(ENOTSUP); - } else { - objnum = dmu_objset_id(os); - } - dmu_objset_rele(os, FTAG); - } - break; - - case ZPOOL_PROP_FAILUREMODE: - error = nvpair_value_uint64(elem, &intval); - if (!error && (intval < ZIO_FAILURE_MODE_WAIT || - intval > ZIO_FAILURE_MODE_PANIC)) - error = SET_ERROR(EINVAL); - - /* - * This is a special case which only occurs when - * the pool has completely failed. This allows - * the user to change the in-core failmode property - * without syncing it out to disk (I/Os might - * currently be blocked). We do this by returning - * EIO to the caller (spa_prop_set) to trick it - * into thinking we encountered a property validation - * error. 
- */ - if (!error && spa_suspended(spa)) { - spa->spa_failmode = intval; - error = SET_ERROR(EIO); - } - break; - - case ZPOOL_PROP_CACHEFILE: - if ((error = nvpair_value_string(elem, &strval)) != 0) - break; - - if (strval[0] == '\0') - break; - - if (strcmp(strval, "none") == 0) - break; - - if (strval[0] != '/') { - error = SET_ERROR(EINVAL); - break; - } - - slash = strrchr(strval, '/'); - ASSERT(slash != NULL); - - if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || - strcmp(slash, "/..") == 0) - error = SET_ERROR(EINVAL); - break; - - case ZPOOL_PROP_COMMENT: - if ((error = nvpair_value_string(elem, &strval)) != 0) - break; - for (check = strval; *check != '\0'; check++) { - /* - * The kernel doesn't have an easy isprint() - * check. For this kernel check, we merely - * check ASCII apart from DEL. Fix this if - * there is an easy-to-use kernel isprint(). - */ - if (*check >= 0x7f) { - error = SET_ERROR(EINVAL); - break; - } - } - if (strlen(strval) > ZPROP_MAX_COMMENT) - error = E2BIG; - break; - - case ZPOOL_PROP_DEDUPDITTO: - if (spa_version(spa) < SPA_VERSION_DEDUP) - error = SET_ERROR(ENOTSUP); - else - error = nvpair_value_uint64(elem, &intval); - if (error == 0 && - intval != 0 && intval < ZIO_DEDUPDITTO_MIN) - error = SET_ERROR(EINVAL); - break; - } - - if (error) - break; - } - - if (!error && reset_bootfs) { - error = nvlist_remove(props, - zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); - - if (!error) { - error = nvlist_add_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); - } - } - - return (error); -} - -void -spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) -{ - char *cachefile; - spa_config_dirent_t *dp; - - if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), - &cachefile) != 0) - return; - - dp = kmem_alloc(sizeof (spa_config_dirent_t), - KM_SLEEP); - - if (cachefile[0] == '\0') - dp->scd_path = spa_strdup(spa_config_path); - else if (strcmp(cachefile, "none") == 0) - dp->scd_path = NULL; - else - dp->scd_path = spa_strdup(cachefile); - - list_insert_head(&spa->spa_config_list, dp); - if (need_sync) - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); -} - -int -spa_prop_set(spa_t *spa, nvlist_t *nvp) -{ - int error; - nvpair_t *elem = NULL; - boolean_t need_sync = B_FALSE; - - if ((error = spa_prop_validate(spa, nvp)) != 0) - return (error); - - while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { - zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); - - if (prop == ZPOOL_PROP_CACHEFILE || - prop == ZPOOL_PROP_ALTROOT || - prop == ZPOOL_PROP_READONLY) - continue; - - if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { - uint64_t ver; - - if (prop == ZPOOL_PROP_VERSION) { - VERIFY(nvpair_value_uint64(elem, &ver) == 0); - } else { - ASSERT(zpool_prop_feature(nvpair_name(elem))); - ver = SPA_VERSION_FEATURES; - need_sync = B_TRUE; - } - - /* Save time if the version is already set. */ - if (ver == spa_version(spa)) - continue; - - /* - * In addition to the pool directory object, we might - * create the pool properties object, the features for - * read object, the features for write object, or the - * feature descriptions object. 
- */ - error = dsl_sync_task(spa->spa_name, NULL, - spa_sync_version, &ver, - 6, ZFS_SPACE_CHECK_RESERVED); - if (error) - return (error); - continue; - } - - need_sync = B_TRUE; - break; - } - - if (need_sync) { - return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, - nvp, 6, ZFS_SPACE_CHECK_RESERVED)); - } - - return (0); -} - -/* - * If the bootfs property value is dsobj, clear it. - */ -void -spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) -{ - if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { - VERIFY(zap_remove(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); - spa->spa_bootfs = 0; - } -} - -/*ARGSUSED*/ -static int -spa_change_guid_check(void *arg, dmu_tx_t *tx) -{ - uint64_t *newguid = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *rvd = spa->spa_root_vdev; - uint64_t vdev_state; - - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - int error = (spa_has_checkpoint(spa)) ? - ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - return (SET_ERROR(error)); - } - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - vdev_state = rvd->vdev_state; - spa_config_exit(spa, SCL_STATE, FTAG); - - if (vdev_state != VDEV_STATE_HEALTHY) - return (SET_ERROR(ENXIO)); - - ASSERT3U(spa_guid(spa), !=, *newguid); - - return (0); -} - -static void -spa_change_guid_sync(void *arg, dmu_tx_t *tx) -{ - uint64_t *newguid = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - uint64_t oldguid; - vdev_t *rvd = spa->spa_root_vdev; - - oldguid = spa_guid(spa); - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - rvd->vdev_guid = *newguid; - rvd->vdev_guid_sum += (*newguid - oldguid); - vdev_config_dirty(rvd); - spa_config_exit(spa, SCL_STATE, FTAG); - - spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", - oldguid, *newguid); -} - -/* - * Change the GUID for the pool. This is done so that we can later - * re-import a pool built from a clone of our own vdevs. We will modify - * the root vdev's guid, our own pool guid, and then mark all of our - * vdevs dirty. Note that we must make sure that all our vdevs are - * online when we do this, or else any vdevs that weren't present - * would be orphaned from our pool. We are also going to issue a - * sysevent to update any watchers. - */ -int -spa_change_guid(spa_t *spa) -{ - int error; - uint64_t guid; - - mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); - guid = spa_generate_guid(NULL); - - error = dsl_sync_task(spa->spa_name, spa_change_guid_check, - spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); - - if (error == 0) { - spa_write_cachefile(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); - } - - mutex_exit(&spa_namespace_lock); - mutex_exit(&spa->spa_vdev_top_lock); - - return (error); -} - -/* - * ========================================================================== - * SPA state manipulation (open/create/destroy/import/export) - * ========================================================================== - */ - -static int -spa_error_entry_compare(const void *a, const void *b) -{ - const spa_error_entry_t *sa = (const spa_error_entry_t *)a; - const spa_error_entry_t *sb = (const spa_error_entry_t *)b; - int ret; - - ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, - sizeof (zbookmark_phys_t)); - - return (AVL_ISIGN(ret)); -} - -/* - * Utility function which retrieves copies of the current logs and - * re-initializes them in the process. 
- */ -void -spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) -{ - ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); - - bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); - bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); - - avl_create(&spa->spa_errlist_scrub, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); - avl_create(&spa->spa_errlist_last, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); -} - -static void -spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) -{ - const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; - enum zti_modes mode = ztip->zti_mode; - uint_t value = ztip->zti_value; - uint_t count = ztip->zti_count; - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - char name[32]; - uint_t flags = 0; - boolean_t batch = B_FALSE; - - if (mode == ZTI_MODE_NULL) { - tqs->stqs_count = 0; - tqs->stqs_taskq = NULL; - return; - } - - ASSERT3U(count, >, 0); - - tqs->stqs_count = count; - tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); - - switch (mode) { - case ZTI_MODE_FIXED: - ASSERT3U(value, >=, 1); - value = MAX(value, 1); - break; - - case ZTI_MODE_BATCH: - batch = B_TRUE; - flags |= TASKQ_THREADS_CPU_PCT; - value = zio_taskq_batch_pct; - break; - - default: - panic("unrecognized mode for %s_%s taskq (%u:%u) in " - "spa_activate()", - zio_type_name[t], zio_taskq_types[q], mode, value); - break; - } - - for (uint_t i = 0; i < count; i++) { - taskq_t *tq; - - if (count > 1) { - (void) snprintf(name, sizeof (name), "%s_%s_%u", - zio_type_name[t], zio_taskq_types[q], i); - } else { - (void) snprintf(name, sizeof (name), "%s_%s", - zio_type_name[t], zio_taskq_types[q]); - } - -#ifdef SYSDC - if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; - - tq = taskq_create_sysdc(name, value, 50, INT_MAX, - spa->spa_proc, zio_taskq_basedc, flags); - } else { -#endif - pri_t pri = maxclsyspri; - /* - * The write issue taskq can be extremely CPU - * intensive. Run it at slightly lower priority - * than the other taskqs. - * FreeBSD notes: - * - numerically higher priorities are lower priorities; - * - if priorities divided by four (RQ_PPQ) are equal - * then a difference between them is insignificant. - */ - if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) -#ifdef illumos - pri--; -#else - pri += 4; -#endif - - tq = taskq_create_proc(name, value, pri, 50, - INT_MAX, spa->spa_proc, flags); -#ifdef SYSDC - } -#endif - - tqs->stqs_taskq[i] = tq; - } -} - -static void -spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) -{ - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - - if (tqs->stqs_taskq == NULL) { - ASSERT0(tqs->stqs_count); - return; - } - - for (uint_t i = 0; i < tqs->stqs_count; i++) { - ASSERT3P(tqs->stqs_taskq[i], !=, NULL); - taskq_destroy(tqs->stqs_taskq[i]); - } - - kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); - tqs->stqs_taskq = NULL; -} - -/* - * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. - * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). 
- */ -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) -{ - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; - - ASSERT3P(tqs->stqs_taskq, !=, NULL); - ASSERT3U(tqs->stqs_count, !=, 0); - - if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { -#ifdef _KERNEL - tq = tqs->stqs_taskq[(u_int)(sbinuptime() + curcpu) % - tqs->stqs_count]; -#else - tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; -#endif - } - - taskq_dispatch_ent(tq, func, arg, flags, ent); -} - -static void -spa_create_zio_taskqs(spa_t *spa) -{ - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - spa_taskqs_init(spa, t, q); - } - } -} - -#ifdef SPA_PROCESS -static int -newproc(void (*pc)(void *), void *arg, id_t cid, int pri, - void **ct, pid_t pid) -{ - va_list ap; - spa_t *spa = (spa_t *)arg; /* XXX */ - struct proc *newp; - struct thread *td; - int error; - - ASSERT(ct == NULL); - ASSERT(pid == 0); - ASSERT(cid == syscid); - - error = kproc_create(pc, arg, &newp, 0, 0, "zpool-%s", spa->spa_name); - if (error != 0) - return (error); - td = FIRST_THREAD_IN_PROC(newp); - thread_lock(td); - sched_prio(td, pri); - thread_unlock(td); - return (0); -} - -static void -spa_thread(void *arg) -{ - callb_cpr_t cprinfo; - - spa_t *spa = arg; -#ifdef illumos - user_t *pu = PTOU(curproc); -#endif - CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, - spa->spa_name); - - ASSERT(curproc != &p0); -#ifdef illumos - (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), - "zpool-%s", spa->spa_name); - (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); -#endif - -#ifdef PSRSET_BIND - /* bind this thread to the requested psrset */ - if (zio_taskq_psrset_bind != PS_NONE) { - pool_lock(); - mutex_enter(&cpu_lock); - mutex_enter(&pidlock); - mutex_enter(&curproc->p_lock); - - if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, - 0, NULL, NULL) == 0) { - curthread->t_bind_pset = zio_taskq_psrset_bind; - } else { - cmn_err(CE_WARN, - "Couldn't bind process for zfs pool \"%s\" to " - "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); - } - - mutex_exit(&curproc->p_lock); - mutex_exit(&pidlock); - mutex_exit(&cpu_lock); - pool_unlock(); - } -#endif - -#ifdef SYSDC - if (zio_taskq_sysdc) { - sysdc_thread_enter(curthread, 100, 0); - } -#endif - - spa->spa_proc = curproc; - spa->spa_did = curthread->t_did; - - spa_create_zio_taskqs(spa); - - mutex_enter(&spa->spa_proc_lock); - ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); - - spa->spa_proc_state = SPA_PROC_ACTIVE; - cv_broadcast(&spa->spa_proc_cv); - - CALLB_CPR_SAFE_BEGIN(&cprinfo); - while (spa->spa_proc_state == SPA_PROC_ACTIVE) - cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); - CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); - - ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); - spa->spa_proc_state = SPA_PROC_GONE; - spa->spa_proc = &p0; - cv_broadcast(&spa->spa_proc_cv); - CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ - -#ifdef illumos - mutex_enter(&curproc->p_lock); - lwp_exit(); -#else - kthread_exit(); -#endif -} -#endif /* SPA_PROCESS */ - -/* - * Activate an uninitialized pool. 
- */ -static void -spa_activate(spa_t *spa, int mode) -{ - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_mode = mode; - - spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); - - /* Try to create a covering process */ - mutex_enter(&spa->spa_proc_lock); - ASSERT(spa->spa_proc_state == SPA_PROC_NONE); - ASSERT(spa->spa_proc == &p0); - spa->spa_did = 0; - -#ifdef SPA_PROCESS - /* Only create a process if we're going to be around a while. */ - if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { - if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, - NULL, 0) == 0) { - spa->spa_proc_state = SPA_PROC_CREATED; - while (spa->spa_proc_state == SPA_PROC_CREATED) { - cv_wait(&spa->spa_proc_cv, - &spa->spa_proc_lock); - } - ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); - ASSERT(spa->spa_proc != &p0); - ASSERT(spa->spa_did != 0); - } else { -#ifdef _KERNEL - cmn_err(CE_WARN, - "Couldn't create process for zfs pool \"%s\"\n", - spa->spa_name); -#endif - } - } -#endif /* SPA_PROCESS */ - mutex_exit(&spa->spa_proc_lock); - - /* If we didn't create a process, we need to create our taskqs. */ -#ifndef SPA_PROCESS - ASSERT(spa->spa_proc == &p0); -#endif /* SPA_PROCESS */ - if (spa->spa_proc == &p0) { - spa_create_zio_taskqs(spa); - } - - /* - * Start TRIM thread. - */ - trim_thread_create(spa); - - /* - * This taskq is used to perform zvol-minor-related tasks - * asynchronously. This has several advantages, including easy - * resolution of various deadlocks (zfsonlinux bug #3681). - * - * The taskq must be single threaded to ensure tasks are always - * processed in the order in which they were dispatched. - * - * A taskq per pool allows one to keep the pools independent. - * This way if one pool is suspended, it will not impact another. - * - * The preferred location to dispatch a zvol minor task is a sync - * task. In this context, there is easy access to the spa_t and minimal - * error handling is required because the sync task must succeed. - */ - spa->spa_zvol_taskq = taskq_create("z_zvol", 1, minclsyspri, - 1, INT_MAX, 0); - - for (size_t i = 0; i < TXG_SIZE; i++) { - spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL); - } - - list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), - offsetof(vdev_t, vdev_config_dirty_node)); - list_create(&spa->spa_evicting_os_list, sizeof (objset_t), - offsetof(objset_t, os_evicting_node)); - list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), - offsetof(vdev_t, vdev_state_dirty_node)); - - txg_list_create(&spa->spa_vdev_txg_list, spa, - offsetof(struct vdev, vdev_txg_node)); - - avl_create(&spa->spa_errlist_scrub, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); - avl_create(&spa->spa_errlist_last, - spa_error_entry_compare, sizeof (spa_error_entry_t), - offsetof(spa_error_entry_t, se_avl)); -} - -/* - * Opposite of spa_activate(). - */ -static void -spa_deactivate(spa_t *spa) -{ - ASSERT(spa->spa_sync_on == B_FALSE); - ASSERT(spa->spa_dsl_pool == NULL); - ASSERT(spa->spa_root_vdev == NULL); - ASSERT(spa->spa_async_zio_root == NULL); - ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); - - /* - * Stop TRIM thread in case spa_unload() wasn't called directly - * before spa_deactivate(). 
- */ - trim_thread_destroy(spa); - - spa_evicting_os_wait(spa); - - if (spa->spa_zvol_taskq) { - taskq_destroy(spa->spa_zvol_taskq); - spa->spa_zvol_taskq = NULL; - } - - txg_list_destroy(&spa->spa_vdev_txg_list); - - list_destroy(&spa->spa_config_dirty_list); - list_destroy(&spa->spa_evicting_os_list); - list_destroy(&spa->spa_state_dirty_list); - - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - spa_taskqs_fini(spa, t, q); - } - } - - for (size_t i = 0; i < TXG_SIZE; i++) { - ASSERT3P(spa->spa_txg_zio[i], !=, NULL); - VERIFY0(zio_wait(spa->spa_txg_zio[i])); - spa->spa_txg_zio[i] = NULL; - } - - metaslab_class_destroy(spa->spa_normal_class); - spa->spa_normal_class = NULL; - - metaslab_class_destroy(spa->spa_log_class); - spa->spa_log_class = NULL; - - metaslab_class_destroy(spa->spa_special_class); - spa->spa_special_class = NULL; - - metaslab_class_destroy(spa->spa_dedup_class); - spa->spa_dedup_class = NULL; - - /* - * If this was part of an import or the open otherwise failed, we may - * still have errors left in the queues. Empty them just in case. - */ - spa_errlog_drain(spa); - - avl_destroy(&spa->spa_errlist_scrub); - avl_destroy(&spa->spa_errlist_last); - - spa->spa_state = POOL_STATE_UNINITIALIZED; - - mutex_enter(&spa->spa_proc_lock); - if (spa->spa_proc_state != SPA_PROC_NONE) { - ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); - spa->spa_proc_state = SPA_PROC_DEACTIVATE; - cv_broadcast(&spa->spa_proc_cv); - while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { - ASSERT(spa->spa_proc != &p0); - cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); - } - ASSERT(spa->spa_proc_state == SPA_PROC_GONE); - spa->spa_proc_state = SPA_PROC_NONE; - } - ASSERT(spa->spa_proc == &p0); - mutex_exit(&spa->spa_proc_lock); - -#ifdef SPA_PROCESS -#ifdef illumos - /* - * We want to make sure spa_thread() has actually exited the ZFS - * module, so that the module can't be unloaded out from underneath - * it. - */ - if (spa->spa_did != 0) { - thread_join(spa->spa_did); - spa->spa_did = 0; - } -#endif -#endif /* SPA_PROCESS */ -} - -/* - * Verify a pool configuration, and construct the vdev tree appropriately. This - * will create all the necessary vdevs in the appropriate layout, with each vdev - * in the CLOSED state. This will prep the pool before open/creation/import. - * All vdev validation is done by the vdev_alloc() routine. - */ -static int -spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, - uint_t id, int atype) -{ - nvlist_t **child; - uint_t children; - int error; - - if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) - return (error); - - if ((*vdp)->vdev_ops->vdev_op_leaf) - return (0); - - error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children); - - if (error == ENOENT) - return (0); - - if (error) { - vdev_free(*vdp); - *vdp = NULL; - return (SET_ERROR(EINVAL)); - } - - for (int c = 0; c < children; c++) { - vdev_t *vd; - if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, - atype)) != 0) { - vdev_free(*vdp); - *vdp = NULL; - return (error); - } - } - - ASSERT(*vdp != NULL); - - return (0); -} - -/* - * Opposite of spa_load(). - */ -static void -spa_unload(spa_t *spa) -{ - int i; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa_load_note(spa, "UNLOADING"); - - /* - * Stop TRIM thread. - */ - trim_thread_destroy(spa); - - /* - * Stop async tasks. 
- */ - spa_async_suspend(spa); - - if (spa->spa_root_vdev) { - vdev_initialize_stop_all(spa->spa_root_vdev, - VDEV_INITIALIZE_ACTIVE); - } - - /* - * Stop syncing. - */ - if (spa->spa_sync_on) { - txg_sync_stop(spa->spa_dsl_pool); - spa->spa_sync_on = B_FALSE; - } - - /* - * Even though vdev_free() also calls vdev_metaslab_fini, we need - * to call it earlier, before we wait for async i/o to complete. - * This ensures that there is no async metaslab prefetching, by - * calling taskq_wait(mg_taskq). - */ - if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) - vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, spa); - } - - if (spa->spa_mmp.mmp_thread) - mmp_thread_stop(spa); - - /* - * Wait for any outstanding async I/O to complete. - */ - if (spa->spa_async_zio_root != NULL) { - for (int i = 0; i < max_ncpus; i++) - (void) zio_wait(spa->spa_async_zio_root[i]); - kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); - spa->spa_async_zio_root = NULL; - } - - if (spa->spa_vdev_removal != NULL) { - spa_vdev_removal_destroy(spa->spa_vdev_removal); - spa->spa_vdev_removal = NULL; - } - - if (spa->spa_condense_zthr != NULL) { - zthr_destroy(spa->spa_condense_zthr); - spa->spa_condense_zthr = NULL; - } - - if (spa->spa_checkpoint_discard_zthr != NULL) { - zthr_destroy(spa->spa_checkpoint_discard_zthr); - spa->spa_checkpoint_discard_zthr = NULL; - } - - spa_condense_fini(spa); - - bpobj_close(&spa->spa_deferred_bpobj); - - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - - /* - * Close all vdevs. - */ - if (spa->spa_root_vdev) - vdev_free(spa->spa_root_vdev); - ASSERT(spa->spa_root_vdev == NULL); - - /* - * Close the dsl pool. - */ - if (spa->spa_dsl_pool) { - dsl_pool_close(spa->spa_dsl_pool); - spa->spa_dsl_pool = NULL; - spa->spa_meta_objset = NULL; - } - - ddt_unload(spa); - - /* - * Drop and purge level 2 cache - */ - spa_l2cache_drop(spa); - - for (i = 0; i < spa->spa_spares.sav_count; i++) - vdev_free(spa->spa_spares.sav_vdevs[i]); - if (spa->spa_spares.sav_vdevs) { - kmem_free(spa->spa_spares.sav_vdevs, - spa->spa_spares.sav_count * sizeof (void *)); - spa->spa_spares.sav_vdevs = NULL; - } - if (spa->spa_spares.sav_config) { - nvlist_free(spa->spa_spares.sav_config); - spa->spa_spares.sav_config = NULL; - } - spa->spa_spares.sav_count = 0; - - for (i = 0; i < spa->spa_l2cache.sav_count; i++) { - vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); - vdev_free(spa->spa_l2cache.sav_vdevs[i]); - } - if (spa->spa_l2cache.sav_vdevs) { - kmem_free(spa->spa_l2cache.sav_vdevs, - spa->spa_l2cache.sav_count * sizeof (void *)); - spa->spa_l2cache.sav_vdevs = NULL; - } - if (spa->spa_l2cache.sav_config) { - nvlist_free(spa->spa_l2cache.sav_config); - spa->spa_l2cache.sav_config = NULL; - } - spa->spa_l2cache.sav_count = 0; - - spa->spa_async_suspended = 0; - - spa->spa_indirect_vdevs_loaded = B_FALSE; - - if (spa->spa_comment != NULL) { - spa_strfree(spa->spa_comment); - spa->spa_comment = NULL; - } - - spa_config_exit(spa, SCL_ALL, spa); -} - -/* - * Load (or re-load) the current list of vdevs describing the active spares for - * this pool. When this is called, we have some form of basic information in - * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and - * then re-generate a more complete list including status information. 
- */ -void -spa_load_spares(spa_t *spa) -{ - nvlist_t **spares; - uint_t nspares; - int i; - vdev_t *vd, *tvd; - -#ifndef _KERNEL - /* - * zdb opens both the current state of the pool and the - * checkpointed state (if present), with a different spa_t. - * - * As spare vdevs are shared among open pools, we skip loading - * them when we load the checkpointed state of the pool. - */ - if (!spa_writeable(spa)) - return; -#endif - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - /* - * First, close and free any existing spare vdevs. - */ - for (i = 0; i < spa->spa_spares.sav_count; i++) { - vd = spa->spa_spares.sav_vdevs[i]; - - /* Undo the call to spa_activate() below */ - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, - B_FALSE)) != NULL && tvd->vdev_isspare) - spa_spare_remove(tvd); - vdev_close(vd); - vdev_free(vd); - } - - if (spa->spa_spares.sav_vdevs) - kmem_free(spa->spa_spares.sav_vdevs, - spa->spa_spares.sav_count * sizeof (void *)); - - if (spa->spa_spares.sav_config == NULL) - nspares = 0; - else - VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - - spa->spa_spares.sav_count = (int)nspares; - spa->spa_spares.sav_vdevs = NULL; - - if (nspares == 0) - return; - - /* - * Construct the array of vdevs, opening them to get status in the - * process. For each spare, there is potentially two different vdev_t - * structures associated with it: one in the list of spares (used only - * for basic validation purposes) and one in the active vdev - * configuration (if it's spared in). During this phase we open and - * validate each vdev on the spare list. If the vdev also exists in the - * active configuration, then we also mark this vdev as an active spare. - */ - spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), - KM_SLEEP); - for (i = 0; i < spa->spa_spares.sav_count; i++) { - VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, - VDEV_ALLOC_SPARE) == 0); - ASSERT(vd != NULL); - - spa->spa_spares.sav_vdevs[i] = vd; - - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, - B_FALSE)) != NULL) { - if (!tvd->vdev_isspare) - spa_spare_add(tvd); - - /* - * We only mark the spare active if we were successfully - * able to load the vdev. Otherwise, importing a pool - * with a bad active spare would result in strange - * behavior, because multiple pool would think the spare - * is actively in use. - * - * There is a vulnerability here to an equally bizarre - * circumstance, where a dead active spare is later - * brought back to life (onlined or otherwise). Given - * the rarity of this scenario, and the extra complexity - * it adds, we ignore the possibility. - */ - if (!vdev_is_dead(tvd)) - spa_spare_activate(tvd); - } - - vd->vdev_top = vd; - vd->vdev_aux = &spa->spa_spares; - - if (vdev_open(vd) != 0) - continue; - - if (vdev_validate_aux(vd) == 0) - spa_spare_add(vd); - } - - /* - * Recompute the stashed list of spares, with status information - * this time. 
- */ - VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, - DATA_TYPE_NVLIST_ARRAY) == 0); - - spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), - KM_SLEEP); - for (i = 0; i < spa->spa_spares.sav_count; i++) - spares[i] = vdev_config_generate(spa, - spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); - for (i = 0; i < spa->spa_spares.sav_count; i++) - nvlist_free(spares[i]); - kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); -} - -/* - * Load (or re-load) the current list of vdevs describing the active l2cache for - * this pool. When this is called, we have some form of basic information in - * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and - * then re-generate a more complete list including status information. - * Devices which are already active have their details maintained, and are - * not re-opened. - */ -void -spa_load_l2cache(spa_t *spa) -{ - nvlist_t **l2cache; - uint_t nl2cache; - int i, j, oldnvdevs; - uint64_t guid; - vdev_t *vd, **oldvdevs, **newvdevs; - spa_aux_vdev_t *sav = &spa->spa_l2cache; - -#ifndef _KERNEL - /* - * zdb opens both the current state of the pool and the - * checkpointed state (if present), with a different spa_t. - * - * As L2 caches are part of the ARC which is shared among open - * pools, we skip loading them when we load the checkpointed - * state of the pool. - */ - if (!spa_writeable(spa)) - return; -#endif - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - if (sav->sav_config != NULL) { - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); - newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); - } else { - nl2cache = 0; - newvdevs = NULL; - } - - oldvdevs = sav->sav_vdevs; - oldnvdevs = sav->sav_count; - sav->sav_vdevs = NULL; - sav->sav_count = 0; - - /* - * Process new nvlist of vdevs. - */ - for (i = 0; i < nl2cache; i++) { - VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, - &guid) == 0); - - newvdevs[i] = NULL; - for (j = 0; j < oldnvdevs; j++) { - vd = oldvdevs[j]; - if (vd != NULL && guid == vd->vdev_guid) { - /* - * Retain previous vdev for add/remove ops. - */ - newvdevs[i] = vd; - oldvdevs[j] = NULL; - break; - } - } - - if (newvdevs[i] == NULL) { - /* - * Create new vdev - */ - VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, - VDEV_ALLOC_L2CACHE) == 0); - ASSERT(vd != NULL); - newvdevs[i] = vd; - - /* - * Commit this vdev as an l2cache device, - * even if it fails to open. - */ - spa_l2cache_add(vd); - - vd->vdev_top = vd; - vd->vdev_aux = sav; - - spa_l2cache_activate(vd); - - if (vdev_open(vd) != 0) - continue; - - (void) vdev_validate_aux(vd); - - if (!vdev_is_dead(vd)) - l2arc_add_vdev(spa, vd); - } - } - - /* - * Purge vdevs that were dropped - */ - for (i = 0; i < oldnvdevs; i++) { - uint64_t pool; - - vd = oldvdevs[i]; - if (vd != NULL) { - ASSERT(vd->vdev_isl2cache); - - if (spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && l2arc_vdev_present(vd)) - l2arc_remove_vdev(vd); - vdev_clear_stats(vd); - vdev_free(vd); - } - } - - if (oldvdevs) - kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); - - if (sav->sav_config == NULL) - goto out; - - sav->sav_vdevs = newvdevs; - sav->sav_count = (int)nl2cache; - - /* - * Recompute the stashed list of l2cache devices, with status - * information this time. 
- */ - VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, - DATA_TYPE_NVLIST_ARRAY) == 0); - - l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); - for (i = 0; i < sav->sav_count; i++) - l2cache[i] = vdev_config_generate(spa, - sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); - VERIFY(nvlist_add_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); -out: - for (i = 0; i < sav->sav_count; i++) - nvlist_free(l2cache[i]); - if (sav->sav_count) - kmem_free(l2cache, sav->sav_count * sizeof (void *)); -} - -static int -load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) -{ - dmu_buf_t *db; - char *packed = NULL; - size_t nvsize = 0; - int error; - *value = NULL; - - error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); - if (error != 0) - return (error); - - nvsize = *(uint64_t *)db->db_data; - dmu_buf_rele(db, FTAG); - - packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, - DMU_READ_PREFETCH); - if (error == 0) - error = nvlist_unpack(packed, nvsize, value, 0); - kmem_free(packed, nvsize); - - return (error); -} - -/* - * Concrete top-level vdevs that are not missing and are not logs. At every - * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. - */ -static uint64_t -spa_healthy_core_tvds(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t tvds = 0; - - for (uint64_t i = 0; i < rvd->vdev_children; i++) { - vdev_t *vd = rvd->vdev_child[i]; - if (vd->vdev_islog) - continue; - if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) - tvds++; - } - - return (tvds); -} - -/* - * Checks to see if the given vdev could not be opened, in which case we post a - * sysevent to notify the autoreplace code that the device has been removed. - */ -static void -spa_check_removed(vdev_t *vd) -{ - for (uint64_t c = 0; c < vd->vdev_children; c++) - spa_check_removed(vd->vdev_child[c]); - - if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && - vdev_is_concrete(vd)) { - zfs_post_autoreplace(vd->vdev_spa, vd); - spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); - } -} - -static int -spa_check_for_missing_logs(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - /* - * If we're doing a normal import, then build up any additional - * diagnostic information about missing log devices. - * We'll pass this up to the user for further processing. - */ - if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { - nvlist_t **child, *nv; - uint64_t idx = 0; - - child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), - KM_SLEEP); - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - - /* - * We consider a device as missing only if it failed - * to open (i.e. offline or faulted is not considered - * as missing). 
- */ - if (tvd->vdev_islog && - tvd->vdev_state == VDEV_STATE_CANT_OPEN) { - child[idx++] = vdev_config_generate(spa, tvd, - B_FALSE, VDEV_CONFIG_MISSING); - } - } - - if (idx > 0) { - fnvlist_add_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, child, idx); - fnvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_MISSING_DEVICES, nv); - - for (uint64_t i = 0; i < idx; i++) - nvlist_free(child[i]); - } - nvlist_free(nv); - kmem_free(child, rvd->vdev_children * sizeof (char **)); - - if (idx > 0) { - spa_load_failed(spa, "some log devices are missing"); - vdev_dbgmsg_print_tree(rvd, 2); - return (SET_ERROR(ENXIO)); - } - } else { - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - - if (tvd->vdev_islog && - tvd->vdev_state == VDEV_STATE_CANT_OPEN) { - spa_set_log_state(spa, SPA_LOG_CLEAR); - spa_load_note(spa, "some log devices are " - "missing, ZIL is dropped."); - vdev_dbgmsg_print_tree(rvd, 2); - break; - } - } - } - - return (0); -} - -/* - * Check for missing log devices - */ -static boolean_t -spa_check_logs(spa_t *spa) -{ - boolean_t rv = B_FALSE; - dsl_pool_t *dp = spa_get_dsl(spa); - - switch (spa->spa_log_state) { - case SPA_LOG_MISSING: - /* need to recheck in case slog has been restored */ - case SPA_LOG_UNKNOWN: - rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); - if (rv) - spa_set_log_state(spa, SPA_LOG_MISSING); - break; - } - return (rv); -} - -static boolean_t -spa_passivate_log(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - boolean_t slog_found = B_FALSE; - - ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - - if (!spa_has_slogs(spa)) - return (B_FALSE); - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - if (tvd->vdev_islog) { - metaslab_group_passivate(mg); - slog_found = B_TRUE; - } - } - - return (slog_found); -} - -static void -spa_activate_log(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - - if (tvd->vdev_islog) - metaslab_group_activate(mg); - } -} - -int -spa_reset_logs(spa_t *spa) -{ - int error; - - error = dmu_objset_find(spa_name(spa), zil_reset, - NULL, DS_FIND_CHILDREN); - if (error == 0) { - /* - * We successfully offlined the log device, sync out the - * current txg so that the "stubby" block can be removed - * by zil_sync(). 
- */ - txg_wait_synced(spa->spa_dsl_pool, 0); - } - return (error); -} - -static void -spa_aux_check_removed(spa_aux_vdev_t *sav) -{ - int i; - - for (i = 0; i < sav->sav_count; i++) - spa_check_removed(sav->sav_vdevs[i]); -} - -void -spa_claim_notify(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - - if (zio->io_error) - return; - - mutex_enter(&spa->spa_props_lock); /* any mutex will do */ - if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) - spa->spa_claim_max_txg = zio->io_bp->blk_birth; - mutex_exit(&spa->spa_props_lock); -} - -typedef struct spa_load_error { - uint64_t sle_meta_count; - uint64_t sle_data_count; -} spa_load_error_t; - -static void -spa_load_verify_done(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - spa_load_error_t *sle = zio->io_private; - dmu_object_type_t type = BP_GET_TYPE(bp); - int error = zio->io_error; - spa_t *spa = zio->io_spa; - - abd_free(zio->io_abd); - if (error) { - if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && - type != DMU_OT_INTENT_LOG) - atomic_inc_64(&sle->sle_meta_count); - else - atomic_inc_64(&sle->sle_data_count); - } - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_load_verify_ios--; - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); -} - -/* - * Maximum number of concurrent scrub i/os to create while verifying - * a pool while importing it. - */ -int spa_load_verify_maxinflight = 10000; -boolean_t spa_load_verify_metadata = B_TRUE; -boolean_t spa_load_verify_data = B_TRUE; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, - &spa_load_verify_maxinflight, 0, - "Maximum number of concurrent scrub I/Os to create while verifying a " - "pool while importing it"); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, - &spa_load_verify_metadata, 0, - "Check metadata on import?"); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, - &spa_load_verify_data, 0, - "Check user data on import?"); - -/*ARGSUSED*/ -static int -spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return (0); - /* - * Note: normally this routine will not be called if - * spa_load_verify_metadata is not set. However, it may be useful - * to manually set the flag after the traversal has begun. 
- */ - if (!spa_load_verify_metadata) - return (0); - if (!BP_IS_METADATA(bp) && !spa_load_verify_data) - return (0); - - zio_t *rio = arg; - size_t size = BP_GET_PSIZE(bp); - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_load_verify_ios++; - mutex_exit(&spa->spa_scrub_lock); - - zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, - spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); - return (0); -} - -/* ARGSUSED */ -int -verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) - return (SET_ERROR(ENAMETOOLONG)); - - return (0); -} - -static int -spa_load_verify(spa_t *spa) -{ - zio_t *rio; - spa_load_error_t sle = { 0 }; - zpool_load_policy_t policy; - boolean_t verify_ok = B_FALSE; - int error = 0; - - zpool_get_load_policy(spa->spa_config, &policy); - - if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) - return (0); - - dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); - error = dmu_objset_find_dp(spa->spa_dsl_pool, - spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, - DS_FIND_CHILDREN); - dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); - if (error != 0) - return (error); - - rio = zio_root(spa, NULL, &sle, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); - - if (spa_load_verify_metadata) { - if (spa->spa_extreme_rewind) { - spa_load_note(spa, "performing a complete scan of the " - "pool since extreme rewind is on. This may take " - "a very long time.\n (spa_load_verify_data=%u, " - "spa_load_verify_metadata=%u)", - spa_load_verify_data, spa_load_verify_metadata); - } - error = traverse_pool(spa, spa->spa_verify_min_txg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, - spa_load_verify_cb, rio); - } - - (void) zio_wait(rio); - - spa->spa_load_meta_errors = sle.sle_meta_count; - spa->spa_load_data_errors = sle.sle_data_count; - - if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { - spa_load_note(spa, "spa_load_verify found %llu metadata errors " - "and %llu data errors", (u_longlong_t)sle.sle_meta_count, - (u_longlong_t)sle.sle_data_count); - } - - if (spa_load_verify_dryrun || - (!error && sle.sle_meta_count <= policy.zlp_maxmeta && - sle.sle_data_count <= policy.zlp_maxdata)) { - int64_t loss = 0; - - verify_ok = B_TRUE; - spa->spa_load_txg = spa->spa_uberblock.ub_txg; - spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; - - loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; - VERIFY(nvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); - VERIFY(nvlist_add_int64(spa->spa_load_info, - ZPOOL_CONFIG_REWIND_TIME, loss) == 0); - VERIFY(nvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); - } else { - spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; - } - - if (spa_load_verify_dryrun) - return (0); - - if (error) { - if (error != ENXIO && error != EIO) - error = SET_ERROR(EIO); - return (error); - } - - return (verify_ok ? 0 : EIO); -} - -/* - * Find a value in the pool props object. - */ -static void -spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) -{ - (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, - zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); -} - -/* - * Find a value in the pool directory object. 
- */ -static int -spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) -{ - int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - name, sizeof (uint64_t), 1, val); - - if (error != 0 && (error != ENOENT || log_enoent)) { - spa_load_failed(spa, "couldn't get '%s' value in MOS directory " - "[error=%d]", name, error); - } - - return (error); -} - -static int -spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) -{ - vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); - return (SET_ERROR(err)); -} - -static void -spa_spawn_aux_threads(spa_t *spa) -{ - ASSERT(spa_writeable(spa)); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa_start_indirect_condensing_thread(spa); - - ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); - spa->spa_checkpoint_discard_zthr = - zthr_create(spa_checkpoint_discard_thread_check, - spa_checkpoint_discard_thread, spa); -} - -/* - * Fix up config after a partly-completed split. This is done with the - * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off - * pool have that entry in their config, but only the splitting one contains - * a list of all the guids of the vdevs that are being split off. - * - * This function determines what to do with that list: either rejoin - * all the disks to the pool, or complete the splitting process. To attempt - * the rejoin, each disk that is offlined is marked online again, and - * we do a reopen() call. If the vdev label for every disk that was - * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) - * then we call vdev_split() on each disk, and complete the split. - * - * Otherwise we leave the config alone, with all the vdevs in place in - * the original pool. - */ -static void -spa_try_repair(spa_t *spa, nvlist_t *config) -{ - uint_t extracted; - uint64_t *glist; - uint_t i, gcount; - nvlist_t *nvl; - vdev_t **vd; - boolean_t attempt_reopen; - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) - return; - - /* check that the config is complete */ - if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, - &glist, &gcount) != 0) - return; - - vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); - - /* attempt to online all the vdevs & validate */ - attempt_reopen = B_TRUE; - for (i = 0; i < gcount; i++) { - if (glist[i] == 0) /* vdev is hole */ - continue; - - vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); - if (vd[i] == NULL) { - /* - * Don't bother attempting to reopen the disks; - * just do the split. - */ - attempt_reopen = B_FALSE; - } else { - /* attempt to re-online it */ - vd[i]->vdev_offline = B_FALSE; - } - } - - if (attempt_reopen) { - vdev_reopen(spa->spa_root_vdev); - - /* check each device to see what state it's in */ - for (extracted = 0, i = 0; i < gcount; i++) { - if (vd[i] != NULL && - vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) - break; - ++extracted; - } - } - - /* - * If every disk has been moved to the new pool, or if we never - * even attempted to look at them, then we split them off for - * good. 
- */ - if (!attempt_reopen || gcount == extracted) { - for (i = 0; i < gcount; i++) - if (vd[i] != NULL) - vdev_split(vd[i]); - vdev_reopen(spa->spa_root_vdev); - } - - kmem_free(vd, gcount * sizeof (vdev_t *)); -} - -static int -spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) -{ - char *ereport = FM_EREPORT_ZFS_POOL; - int error; - - spa->spa_load_state = state; - - gethrestime(&spa->spa_loaded_ts); - error = spa_load_impl(spa, type, &ereport); - - /* - * Don't count references from objsets that are already closed - * and are making their way through the eviction process. - */ - spa_evicting_os_wait(spa); - spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); - if (error) { - if (error != EEXIST) { - spa->spa_loaded_ts.tv_sec = 0; - spa->spa_loaded_ts.tv_nsec = 0; - } - if (error != EBADF) { - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); - } - } - spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; - spa->spa_ena = 0; - - return (error); -} - -/* - * Count the number of per-vdev ZAPs associated with all of the vdevs in the - * vdev tree rooted in the given vd, and ensure that each ZAP is present in the - * spa's per-vdev ZAP list. - */ -static uint64_t -vdev_count_verify_zaps(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - uint64_t total = 0; - if (vd->vdev_top_zap != 0) { - total++; - ASSERT0(zap_lookup_int(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, vd->vdev_top_zap)); - } - if (vd->vdev_leaf_zap != 0) { - total++; - ASSERT0(zap_lookup_int(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); - } - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - total += vdev_count_verify_zaps(vd->vdev_child[i]); - } - - return (total); -} - -/* - * Determine whether the activity check is required. - */ -static boolean_t -spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, - nvlist_t *config) -{ - uint64_t state = 0; - uint64_t hostid = 0; - uint64_t tryconfig_txg = 0; - uint64_t tryconfig_timestamp = 0; - uint16_t tryconfig_mmp_seq = 0; - nvlist_t *nvinfo; - - if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { - nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); - (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, - &tryconfig_txg); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, - &tryconfig_timestamp); - (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, - &tryconfig_mmp_seq); - } - - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); - - /* - * Disable the MMP activity check - This is used by zdb which - * is intended to be used on potentially active pools. - */ - if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) - return (B_FALSE); - - /* - * Skip the activity check when the MMP feature is disabled. - */ - if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) - return (B_FALSE); - - /* - * If the tryconfig_ values are nonzero, they are the results of an - * earlier tryimport. If they all match the uberblock we just found, - * then the pool has not changed and we return false so we do not test - * a second time. - */ - if (tryconfig_txg && tryconfig_txg == ub->ub_txg && - tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && - tryconfig_mmp_seq && tryconfig_mmp_seq == - (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) - return (B_FALSE); - - /* - * Allow the activity check to be skipped when importing the pool - * on the same host which last imported it. Since the hostid from - * configuration may be stale use the one read from the label. 
- */ - if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) - hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); - - if (hostid == spa_get_hostid()) - return (B_FALSE); - - /* - * Skip the activity test when the pool was cleanly exported. - */ - if (state != POOL_STATE_ACTIVE) - return (B_FALSE); - - return (B_TRUE); -} - -/* - * Nanoseconds the activity check must watch for changes on-disk. - */ -static uint64_t -spa_activity_check_duration(spa_t *spa, uberblock_t *ub) -{ - uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); - uint64_t multihost_interval = MSEC2NSEC( - MMP_INTERVAL_OK(zfs_multihost_interval)); - uint64_t import_delay = MAX(NANOSEC, import_intervals * - multihost_interval); - - /* - * Local tunables determine a minimum duration except for the case - * where we know when the remote host will suspend the pool if MMP - * writes do not land. - * - * See Big Theory comment at the top of mmp.c for the reasoning behind - * these cases and times. - */ - - ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); - - if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && - MMP_FAIL_INT(ub) > 0) { - - /* MMP on remote host will suspend pool after failed writes */ - import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * - MMP_IMPORT_SAFETY_FACTOR / 100; - - zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " - "mmp_fails=%llu ub_mmp mmp_interval=%llu " - "import_intervals=%u", import_delay, MMP_FAIL_INT(ub), - MMP_INTERVAL(ub), import_intervals); - - } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && - MMP_FAIL_INT(ub) == 0) { - - /* MMP on remote host will never suspend pool */ - import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + - ub->ub_mmp_delay) * import_intervals); - - zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " - "mmp_interval=%llu ub_mmp_delay=%llu " - "import_intervals=%u", import_delay, MMP_INTERVAL(ub), - ub->ub_mmp_delay, import_intervals); - - } else if (MMP_VALID(ub)) { - /* - * zfs-0.7 compatability case - */ - - import_delay = MAX(import_delay, (multihost_interval + - ub->ub_mmp_delay) * import_intervals); - - zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " - "import_intervals=%u leaves=%u", import_delay, - ub->ub_mmp_delay, import_intervals, - vdev_count_leaves(spa)); - } else { - /* Using local tunings is the only reasonable option */ - zfs_dbgmsg("pool last imported on non-MMP aware " - "host using import_delay=%llu multihost_interval=%llu " - "import_intervals=%u", import_delay, multihost_interval, - import_intervals); - } - - return (import_delay); -} - -/* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. - */ -static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) -{ - uint64_t txg = ub->ub_txg; - uint64_t timestamp = ub->ub_timestamp; - uint64_t mmp_config = ub->ub_mmp_config; - uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; - uint64_t import_delay; - hrtime_t import_expire; - nvlist_t *mmp_label = NULL; - vdev_t *rvd = spa->spa_root_vdev; - kcondvar_t cv; - kmutex_t mtx; - int error = 0; - - cv_init(&cv, NULL, CV_DEFAULT, NULL); - mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_enter(&mtx); - - /* - * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed - * during the earlier tryimport. If the txg recorded there is 0 then - * the pool is known to be active on another host. - * - * Otherwise, the pool might be in use on another host. Check for - * changes in the uberblocks on disk if necessary. 
- */ - if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { - nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, - ZPOOL_CONFIG_LOAD_INFO); - - if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && - fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { - vdev_uberblock_load(rvd, ub, &mmp_label); - error = SET_ERROR(EREMOTEIO); - goto out; - } - } - - import_delay = spa_activity_check_duration(spa, ub); - - /* Add a small random factor in case of simultaneous imports (0-25%) */ - import_delay += import_delay * spa_get_random(250) / 1000; - - import_expire = gethrtime() + import_delay; - - while (gethrtime() < import_expire) { - vdev_uberblock_load(rvd, ub, &mmp_label); - - if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || - mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { - zfs_dbgmsg("multihost activity detected " - "txg %llu ub_txg %llu " - "timestamp %llu ub_timestamp %llu " - "mmp_config %#llx ub_mmp_config %#llx", - txg, ub->ub_txg, timestamp, ub->ub_timestamp, - mmp_config, ub->ub_mmp_config); - - error = SET_ERROR(EREMOTEIO); - break; - } - - if (mmp_label) { - nvlist_free(mmp_label); - mmp_label = NULL; - } - error = cv_timedwait_sig(&cv, &mtx, hz); -#if defined(illumos) || !defined(_KERNEL) - if (error != -1) { -#else - if (error != EWOULDBLOCK) { -#endif - error = SET_ERROR(EINTR); - break; - } - error = 0; - } - -out: - mutex_exit(&mtx); - mutex_destroy(&mtx); - cv_destroy(&cv); - - /* - * If the pool is determined to be active store the status in the - * spa->spa_load_info nvlist. If the remote hostname or hostid are - * available from configuration read from disk store them as well. - * This allows 'zpool import' to generate a more useful message. - * - * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) - * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool - * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool - */ - if (error == EREMOTEIO) { - char *hostname = ""; - uint64_t hostid = 0; - - if (mmp_label) { - if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { - hostname = fnvlist_lookup_string(mmp_label, - ZPOOL_CONFIG_HOSTNAME); - fnvlist_add_string(spa->spa_load_info, - ZPOOL_CONFIG_MMP_HOSTNAME, hostname); - } - - if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { - hostid = fnvlist_lookup_uint64(mmp_label, - ZPOOL_CONFIG_HOSTID); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_HOSTID, hostid); - } - } - - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_TXG, 0); - - error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); - } - - if (mmp_label) - nvlist_free(mmp_label); - - return (error); -} - -static int -spa_verify_host(spa_t *spa, nvlist_t *mos_config) -{ - uint64_t hostid; - char *hostname; - uint64_t myhostid = 0; - - if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, - ZPOOL_CONFIG_HOSTID, &hostid) == 0) { - hostname = fnvlist_lookup_string(mos_config, - ZPOOL_CONFIG_HOSTNAME); - - myhostid = zone_get_hostid(NULL); - - if (hostid != 0 && myhostid != 0 && hostid != myhostid) { - cmn_err(CE_WARN, "pool '%s' could not be " - "loaded as it was last accessed by " - "another system (host: %s hostid: 0x%llx). 
" - "See: http://illumos.org/msg/ZFS-8000-EY", - spa_name(spa), hostname, (u_longlong_t)hostid); - spa_load_failed(spa, "hostid verification failed: pool " - "last accessed by host: %s (hostid: 0x%llx)", - hostname, (u_longlong_t)hostid); - return (SET_ERROR(EBADF)); - } - } - - return (0); -} - -static int -spa_ld_parse_config(spa_t *spa, spa_import_type_t type) -{ - int error = 0; - nvlist_t *nvtree, *nvl, *config = spa->spa_config; - int parse; - vdev_t *rvd; - uint64_t pool_guid; - char *comment; - - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &spa->spa_ubsync.ub_version) != 0) - spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { - spa_load_failed(spa, "invalid config provided: '%s' missing", - ZPOOL_CONFIG_POOL_GUID); - return (SET_ERROR(EINVAL)); - } - - /* - * If we are doing an import, ensure that the pool is not already - * imported by checking if its pool guid already exists in the - * spa namespace. - * - * The only case that we allow an already imported pool to be - * imported again, is when the pool is checkpointed and we want to - * look at its checkpointed state from userland tools like zdb. - */ -#ifdef _KERNEL - if ((spa->spa_load_state == SPA_LOAD_IMPORT || - spa->spa_load_state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { -#else - if ((spa->spa_load_state == SPA_LOAD_IMPORT || - spa->spa_load_state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0) && - !spa_importing_readonly_checkpoint(spa)) { -#endif - spa_load_failed(spa, "a pool with guid %llu is already open", - (u_longlong_t)pool_guid); - return (SET_ERROR(EEXIST)); - } - - spa->spa_config_guid = pool_guid; - - nvlist_free(spa->spa_load_info); - spa->spa_load_info = fnvlist_alloc(); - - ASSERT(spa->spa_comment == NULL); - if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) - spa->spa_comment = spa_strdup(comment); - - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) - spa->spa_config_splitting = fnvlist_dup(nvl); - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { - spa_load_failed(spa, "invalid config provided: '%s' missing", - ZPOOL_CONFIG_VDEV_TREE); - return (SET_ERROR(EINVAL)); - } - - /* - * Create "The Godfather" zio to hold all async IOs - */ - spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), - KM_SLEEP); - for (int i = 0; i < max_ncpus; i++) { - spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_GODFATHER); - } - - /* - * Parse the configuration into a vdev tree. We explicitly set the - * value that will be returned by spa_version() since parsing the - * configuration requires knowing the version number. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - parse = (type == SPA_IMPORT_EXISTING ? 
- VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); - error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) { - spa_load_failed(spa, "unable to parse config [error=%d]", - error); - return (error); - } - - ASSERT(spa->spa_root_vdev == rvd); - ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); - ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); - - if (type != SPA_IMPORT_ASSEMBLE) { - ASSERT(spa_guid(spa) == pool_guid); - } - - return (0); -} - -/* - * Recursively open all vdevs in the vdev tree. This function is called twice: - * first with the untrusted config, then with the trusted config. - */ -static int -spa_ld_open_vdevs(spa_t *spa) -{ - int error = 0; - - /* - * spa_missing_tvds_allowed defines how many top-level vdevs can be - * missing/unopenable for the root vdev to be still considered openable. - */ - if (spa->spa_trust_config) { - spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; - } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { - spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; - } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { - spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; - } else { - spa->spa_missing_tvds_allowed = 0; - } - - spa->spa_missing_tvds_allowed = - MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_open(spa->spa_root_vdev); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (spa->spa_missing_tvds != 0) { - spa_load_note(spa, "vdev tree has %lld missing top-level " - "vdevs.", (u_longlong_t)spa->spa_missing_tvds); - if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { - /* - * Although theoretically we could allow users to open - * incomplete pools in RW mode, we'd need to add a lot - * of extra logic (e.g. adjust pool space to account - * for missing vdevs). - * This limitation also prevents users from accidentally - * opening the pool in RW mode during data recovery and - * damaging it further. - */ - spa_load_note(spa, "pools with missing top-level " - "vdevs can only be opened in read-only mode."); - error = SET_ERROR(ENXIO); - } else { - spa_load_note(spa, "current settings allow for maximum " - "%lld missing top-level vdevs at this stage.", - (u_longlong_t)spa->spa_missing_tvds_allowed); - } - } - if (error != 0) { - spa_load_failed(spa, "unable to open vdev tree [error=%d]", - error); - } - if (spa->spa_missing_tvds != 0 || error != 0) - vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); - - return (error); -} - -/* - * We need to validate the vdev labels against the configuration that - * we have in hand. This function is called twice: first with an untrusted - * config, then with a trusted config. The validation is more strict when the - * config is trusted. 
- */ -static int -spa_ld_validate_vdevs(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) { - spa_load_failed(spa, "vdev_validate failed [error=%d]", error); - return (error); - } - - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - spa_load_failed(spa, "cannot open vdev tree after invalidating " - "some vdevs"); - vdev_dbgmsg_print_tree(rvd, 2); - return (SET_ERROR(ENXIO)); - } - - return (0); -} - -static void -spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) -{ - spa->spa_state = POOL_STATE_ACTIVE; - spa->spa_ubsync = spa->spa_uberblock; - spa->spa_verify_min_txg = spa->spa_extreme_rewind ? - TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; - spa->spa_first_txg = spa->spa_last_ubsync_txg ? - spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; - spa->spa_claim_max_txg = spa->spa_first_txg; - spa->spa_prev_software_version = ub->ub_software_version; -} - -static int -spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) -{ - vdev_t *rvd = spa->spa_root_vdev; - nvlist_t *label; - uberblock_t *ub = &spa->spa_uberblock; - boolean_t activity_check = B_FALSE; - - /* - * If we are opening the checkpointed state of the pool by - * rewinding to it, at this point we will have written the - * checkpointed uberblock to the vdev labels, so searching - * the labels will find the right uberblock. However, if - * we are opening the checkpointed state read-only, we have - * not modified the labels. Therefore, we must ignore the - * labels and continue using the spa_uberblock that was set - * by spa_ld_checkpoint_rewind. - * - * Note that it would be fine to ignore the labels when - * rewinding (opening writeable) as well. However, if we - * crash just after writing the labels, we will end up - * searching the labels. Doing so in the common case means - * that this code path gets exercised normally, rather than - * just in the edge case. - */ - if (ub->ub_checkpoint_txg != 0 && - spa_importing_readonly_checkpoint(spa)) { - spa_ld_select_uberblock_done(spa, ub); - return (0); - } - - /* - * Find the best uberblock. - */ - vdev_uberblock_load(rvd, ub, &label); - - /* - * If we weren't able to find a single valid uberblock, return failure. - */ - if (ub->ub_txg == 0) { - nvlist_free(label); - spa_load_failed(spa, "no valid uberblock found"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); - } - - spa_load_note(spa, "using uberblock with txg=%llu", - (u_longlong_t)ub->ub_txg); - - /* - * For pools which have the multihost property on determine if the - * pool is truly inactive and can be safely imported. Prevent - * hosts which don't have a hostid set from importing the pool. 
- */ - activity_check = spa_activity_check_required(spa, ub, label, - spa->spa_config); - if (activity_check) { - if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && - spa_get_hostid() == 0) { - nvlist_free(label); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); - return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); - } - - int error = spa_activity_check(spa, ub, spa->spa_config); - if (error) { - nvlist_free(label); - return (error); - } - - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); - fnvlist_add_uint16(spa->spa_load_info, - ZPOOL_CONFIG_MMP_SEQ, - (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); - } - - /* - * If the pool has an unsupported version we can't open it. - */ - if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { - nvlist_free(label); - spa_load_failed(spa, "version %llu is not supported", - (u_longlong_t)ub->ub_version); - return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); - } - - if (ub->ub_version >= SPA_VERSION_FEATURES) { - nvlist_t *features; - - /* - * If we weren't able to find what's necessary for reading the - * MOS in the label, return failure. - */ - if (label == NULL) { - spa_load_failed(spa, "label config unavailable"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - ENXIO)); - } - - if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, - &features) != 0) { - nvlist_free(label); - spa_load_failed(spa, "invalid label: '%s' missing", - ZPOOL_CONFIG_FEATURES_FOR_READ); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - ENXIO)); - } - - /* - * Update our in-core representation with the definitive values - * from the label. - */ - nvlist_free(spa->spa_label_features); - VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); - } - - nvlist_free(label); - - /* - * Look through entries in the label nvlist's features_for_read. If - * there is a feature listed there which we don't understand then we - * cannot open a pool. - */ - if (ub->ub_version >= SPA_VERSION_FEATURES) { - nvlist_t *unsup_feat; - - VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == - 0); - - for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, - NULL); nvp != NULL; - nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { - if (!zfeature_is_supported(nvpair_name(nvp))) { - VERIFY(nvlist_add_string(unsup_feat, - nvpair_name(nvp), "") == 0); - } - } - - if (!nvlist_empty(unsup_feat)) { - VERIFY(nvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); - nvlist_free(unsup_feat); - spa_load_failed(spa, "some features are unsupported"); - return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); - } - - nvlist_free(unsup_feat); - } - - if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_try_repair(spa, spa->spa_config); - spa_config_exit(spa, SCL_ALL, FTAG); - nvlist_free(spa->spa_config_splitting); - spa->spa_config_splitting = NULL; - } - - /* - * Initialize internal SPA structures. 
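/*
 * A minimal userland sketch of the multihost (MMP) activity check invoked
 * above: the base wait from spa_activity_check_duration() is stretched by a
 * random 0-25% so that two hosts importing at the same time do not probe in
 * lockstep, and the uberblock is then re-read until the deadline expires.
 * Everything below (now_ns(), random_0_to(), the callback type) is a
 * simplified stand-in for illustration, not the kernel interfaces.
 */
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

static uint64_t
now_ns(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
}

/* Stand-in for spa_get_random(n): a value in [0, n). */
static uint64_t
random_0_to(uint64_t n)
{
	return ((uint64_t)rand() % n);
}

static int
activity_check_sketch(uint64_t base_delay_ns, int (*uberblock_changed)(void))
{
	/* Mirror "import_delay += import_delay * spa_get_random(250) / 1000". */
	uint64_t import_delay = base_delay_ns +
	    base_delay_ns * random_0_to(250) / 1000;
	uint64_t import_expire = now_ns() + import_delay;
	struct timespec one_sec = { 1, 0 };

	while (now_ns() < import_expire) {
		if (uberblock_changed())
			return (-1);	/* another host wrote: treat as EREMOTEIO */
		/* The kernel waits with cv_timedwait_sig(..., hz); sleep ~1s here. */
		(void) nanosleep(&one_sec, NULL);
	}
	return (0);	/* no activity observed; pool appears safely importable */
}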
- */ - spa_ld_select_uberblock_done(spa, ub); - - return (0); -} - -static int -spa_ld_open_rootbp(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); - if (error != 0) { - spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " - "[error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - - return (0); -} - -static int -spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, - boolean_t reloading) -{ - vdev_t *mrvd, *rvd = spa->spa_root_vdev; - nvlist_t *nv, *mos_config, *policy; - int error = 0, copy_error; - uint64_t healthy_tvds, healthy_tvds_mos; - uint64_t mos_config_txg; - - if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) - != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - /* - * If we're assembling a pool from a split, the config provided is - * already trusted so there is nothing to do. - */ - if (type == SPA_IMPORT_ASSEMBLE) - return (0); - - healthy_tvds = spa_healthy_core_tvds(spa); - - if (load_nvlist(spa, spa->spa_config_object, &mos_config) - != 0) { - spa_load_failed(spa, "unable to retrieve MOS config"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - /* - * If we are doing an open, pool owner wasn't verified yet, thus do - * the verification here. - */ - if (spa->spa_load_state == SPA_LOAD_OPEN) { - error = spa_verify_host(spa, mos_config); - if (error != 0) { - nvlist_free(mos_config); - return (error); - } - } - - nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - /* - * Build a new vdev tree from the trusted config - */ - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); - - /* - * Vdev paths in the MOS may be obsolete. If the untrusted config was - * obtained by scanning /dev/dsk, then it will have the right vdev - * paths. We update the trusted MOS config with this information. - * We first try to copy the paths with vdev_copy_path_strict, which - * succeeds only when both configs have exactly the same vdev tree. - * If that fails, we fall back to a more flexible method that has a - * best effort policy. - */ - copy_error = vdev_copy_path_strict(rvd, mrvd); - if (copy_error != 0 || spa_load_print_vdev_tree) { - spa_load_note(spa, "provided vdev tree:"); - vdev_dbgmsg_print_tree(rvd, 2); - spa_load_note(spa, "MOS vdev tree:"); - vdev_dbgmsg_print_tree(mrvd, 2); - } - if (copy_error != 0) { - spa_load_note(spa, "vdev_copy_path_strict failed, falling " - "back to vdev_copy_path_relaxed"); - vdev_copy_path_relaxed(rvd, mrvd); - } - - vdev_close(rvd); - vdev_free(rvd); - spa->spa_root_vdev = mrvd; - rvd = mrvd; - spa_config_exit(spa, SCL_ALL, FTAG); - - /* - * We will use spa_config if we decide to reload the spa or if spa_load - * fails and we rewind. We must thus regenerate the config using the - * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to - * pass settings on how to load the pool and is not stored in the MOS. - * We copy it over to our new, trusted config. 
- */ - mos_config_txg = fnvlist_lookup_uint64(mos_config, - ZPOOL_CONFIG_POOL_TXG); - nvlist_free(mos_config); - mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); - if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, - &policy) == 0) - fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); - spa_config_set(spa, mos_config); - spa->spa_config_source = SPA_CONFIG_SRC_MOS; - - /* - * Now that we got the config from the MOS, we should be more strict - * in checking blkptrs and can make assumptions about the consistency - * of the vdev tree. spa_trust_config must be set to true before opening - * vdevs in order for them to be writeable. - */ - spa->spa_trust_config = B_TRUE; - - /* - * Open and validate the new vdev tree - */ - error = spa_ld_open_vdevs(spa); - if (error != 0) - return (error); - - error = spa_ld_validate_vdevs(spa); - if (error != 0) - return (error); - - if (copy_error != 0 || spa_load_print_vdev_tree) { - spa_load_note(spa, "final vdev tree:"); - vdev_dbgmsg_print_tree(rvd, 2); - } - - if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && - !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { - /* - * Sanity check to make sure that we are indeed loading the - * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds - * in the config provided and they happened to be the only ones - * to have the latest uberblock, we could involuntarily perform - * an extreme rewind. - */ - healthy_tvds_mos = spa_healthy_core_tvds(spa); - if (healthy_tvds_mos - healthy_tvds >= - SPA_SYNC_MIN_VDEVS) { - spa_load_note(spa, "config provided misses too many " - "top-level vdevs compared to MOS (%lld vs %lld). ", - (u_longlong_t)healthy_tvds, - (u_longlong_t)healthy_tvds_mos); - spa_load_note(spa, "vdev tree:"); - vdev_dbgmsg_print_tree(rvd, 2); - if (reloading) { - spa_load_failed(spa, "config was already " - "provided from MOS. Aborting."); - return (spa_vdev_err(rvd, - VDEV_AUX_CORRUPT_DATA, EIO)); - } - spa_load_note(spa, "spa must be reloaded using MOS " - "config"); - return (SET_ERROR(EAGAIN)); - } - } - - error = spa_check_for_missing_logs(spa); - if (error != 0) - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); - - if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { - spa_load_failed(spa, "uberblock guid sum doesn't match MOS " - "guid sum (%llu != %llu)", - (u_longlong_t)spa->spa_uberblock.ub_guid_sum, - (u_longlong_t)rvd->vdev_guid_sum); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, - ENXIO)); - } - - return (0); -} - -static int -spa_ld_open_indirect_vdev_metadata(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - /* - * Everything that we read before spa_remove_init() must be stored - * on concreted vdevs. Therefore we do this as early as possible. - */ - error = spa_remove_init(spa); - if (error != 0) { - spa_load_failed(spa, "spa_remove_init failed [error=%d]", - error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - /* - * Retrieve information needed to condense indirect vdev mappings. 
- */ - error = spa_condense_init(spa); - if (error != 0) { - spa_load_failed(spa, "spa_condense_init failed [error=%d]", - error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); - } - - return (0); -} - -static int -spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - if (spa_version(spa) >= SPA_VERSION_FEATURES) { - boolean_t missing_feat_read = B_FALSE; - nvlist_t *unsup_feat, *enabled_feat; - - if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, - &spa->spa_feat_for_read_obj, B_TRUE) != 0) { - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, - &spa->spa_feat_for_write_obj, B_TRUE) != 0) { - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, - &spa->spa_feat_desc_obj, B_TRUE) != 0) { - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - enabled_feat = fnvlist_alloc(); - unsup_feat = fnvlist_alloc(); - - if (!spa_features_check(spa, B_FALSE, - unsup_feat, enabled_feat)) - missing_feat_read = B_TRUE; - - if (spa_writeable(spa) || - spa->spa_load_state == SPA_LOAD_TRYIMPORT) { - if (!spa_features_check(spa, B_TRUE, - unsup_feat, enabled_feat)) { - *missing_feat_writep = B_TRUE; - } - } - - fnvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); - - if (!nvlist_empty(unsup_feat)) { - fnvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); - } - - fnvlist_free(enabled_feat); - fnvlist_free(unsup_feat); - - if (!missing_feat_read) { - fnvlist_add_boolean(spa->spa_load_info, - ZPOOL_CONFIG_CAN_RDONLY); - } - - /* - * If the state is SPA_LOAD_TRYIMPORT, our objective is - * twofold: to determine whether the pool is available for - * import in read-write mode and (if it is not) whether the - * pool is available for import in read-only mode. If the pool - * is available for import in read-write mode, it is displayed - * as available in userland; if it is not available for import - * in read-only mode, it is displayed as unavailable in - * userland. If the pool is available for import in read-only - * mode but not read-write mode, it is displayed as unavailable - * in userland with a special note that the pool is actually - * available for open in read-only mode. - * - * As a result, if the state is SPA_LOAD_TRYIMPORT and we are - * missing a feature for write, we must first determine whether - * the pool can be opened read-only before returning to - * userland in order to know whether to display the - * abovementioned note. - */ - if (missing_feat_read || (*missing_feat_writep && - spa_writeable(spa))) { - spa_load_failed(spa, "pool uses unsupported features"); - return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); - } - - /* - * Load refcounts for ZFS features from disk into an in-memory - * cache during SPA initialization. 
- */ - for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { - uint64_t refcount; - - error = feature_get_refcount_from_disk(spa, - &spa_feature_table[i], &refcount); - if (error == 0) { - spa->spa_feat_refcount_cache[i] = refcount; - } else if (error == ENOTSUP) { - spa->spa_feat_refcount_cache[i] = - SPA_FEATURE_DISABLED; - } else { - spa_load_failed(spa, "error getting refcount " - "for feature %s [error=%d]", - spa_feature_table[i].fi_guid, error); - return (spa_vdev_err(rvd, - VDEV_AUX_CORRUPT_DATA, EIO)); - } - } - } - - if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { - if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, - &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - return (0); -} - -static int -spa_ld_load_special_directories(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - spa->spa_is_initializing = B_TRUE; - error = dsl_pool_open(spa->spa_dsl_pool); - spa->spa_is_initializing = B_FALSE; - if (error != 0) { - spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - return (0); -} - -static int -spa_ld_get_props(spa_t *spa) -{ - int error = 0; - uint64_t obj; - vdev_t *rvd = spa->spa_root_vdev; - - /* Grab the secret checksum salt from the MOS. */ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CHECKSUM_SALT, 1, - sizeof (spa->spa_cksum_salt.zcs_bytes), - spa->spa_cksum_salt.zcs_bytes); - if (error == ENOENT) { - /* Generate a new salt for subsequent use */ - (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, - sizeof (spa->spa_cksum_salt.zcs_bytes)); - } else if (error != 0) { - spa_load_failed(spa, "unable to retrieve checksum salt from " - "MOS [error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); - if (error != 0) { - spa_load_failed(spa, "error opening deferred-frees bpobj " - "[error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - /* - * Load the bit that tells us to use the new accounting function - * (raid-z deflation). If we have an older pool, this will not - * be present. - */ - error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, - &spa->spa_creation_version, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - /* - * Load the persistent error log. If we have an older pool, this will - * not be present. - */ - error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, - B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, - &spa->spa_errlog_scrub, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - /* - * Load the history object. If we have an older pool, this - * will not be present. - */ - error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - /* - * Load the per-vdev ZAP map. 
If we have an older pool, this will not - * be present; in this case, defer its creation to a later time to - * avoid dirtying the MOS this early / out of sync context. See - * spa_sync_config_object. - */ - - /* The sentinel is only available in the MOS config. */ - nvlist_t *mos_config; - if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { - spa_load_failed(spa, "unable to retrieve MOS config"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, - &spa->spa_all_vdev_zaps, B_FALSE); - - if (error == ENOENT) { - VERIFY(!nvlist_exists(mos_config, - ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); - spa->spa_avz_action = AVZ_ACTION_INITIALIZE; - ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); - } else if (error != 0) { - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { - /* - * An older version of ZFS overwrote the sentinel value, so - * we have orphaned per-vdev ZAPs in the MOS. Defer their - * destruction to later; see spa_sync_config_object. - */ - spa->spa_avz_action = AVZ_ACTION_DESTROY; - /* - * We're assuming that no vdevs have had their ZAPs created - * before this. Better be sure of it. - */ - ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); - } - nvlist_free(mos_config); - - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - - error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, - B_FALSE); - if (error && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - - if (error == 0) { - uint64_t autoreplace; - - spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); - spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); - spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); - spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); - spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); - spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); - spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, - &spa->spa_dedup_ditto); - - spa->spa_autoreplace = (autoreplace != 0); - } - - /* - * If we are importing a pool with missing top-level vdevs, - * we enforce that the pool doesn't panic or get suspended on - * error since the likelihood of missing data is extremely high. - */ - if (spa->spa_missing_tvds > 0 && - spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && - spa->spa_load_state != SPA_LOAD_TRYIMPORT) { - spa_load_note(spa, "forcing failmode to 'continue' " - "as some top level vdevs are missing"); - spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; - } - - return (0); -} - -static int -spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - /* - * If we're assembling the pool from the split-off vdevs of - * an existing pool, we don't want to attach the spares & cache - * devices. - */ - - /* - * Load any hot spares for this pool. 
- */ - error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, - B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { - ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); - if (load_nvlist(spa, spa->spa_spares.sav_object, - &spa->spa_spares.sav_config) != 0) { - spa_load_failed(spa, "error loading spares nvlist"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_spares(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - } else if (error == 0) { - spa->spa_spares.sav_sync = B_TRUE; - } - - /* - * Load any level 2 ARC devices for this pool. - */ - error = spa_dir_prop(spa, DMU_POOL_L2CACHE, - &spa->spa_l2cache.sav_object, B_FALSE); - if (error != 0 && error != ENOENT) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { - ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); - if (load_nvlist(spa, spa->spa_l2cache.sav_object, - &spa->spa_l2cache.sav_config) != 0) { - spa_load_failed(spa, "error loading l2cache nvlist"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_l2cache(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - } else if (error == 0) { - spa->spa_l2cache.sav_sync = B_TRUE; - } - - return (0); -} - -static int -spa_ld_load_vdev_metadata(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - /* - * If the 'multihost' property is set, then never allow a pool to - * be imported when the system hostid is zero. The exception to - * this rule is zdb which is always allowed to access pools. - */ - if (spa_multihost(spa) && spa_get_hostid() == 0 && - (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); - return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); - } - - /* - * If the 'autoreplace' property is set, then post a resource notifying - * the ZFS DE that it should not issue any faults for unopenable - * devices. We also iterate over the vdevs, and post a sysevent for any - * unopenable vdevs so that the normal autoreplace handler can take - * over. - */ - if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { - spa_check_removed(spa->spa_root_vdev); - /* - * For the import case, this is done in spa_import(), because - * at this point we're using the spare definitions from - * the MOS config, not necessarily from the userland config. - */ - if (spa->spa_load_state != SPA_LOAD_IMPORT) { - spa_aux_check_removed(&spa->spa_spares); - spa_aux_check_removed(&spa->spa_l2cache); - } - } - - /* - * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. - */ - error = vdev_load(rvd); - if (error != 0) { - spa_load_failed(spa, "vdev_load failed [error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); - } - - /* - * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
- */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_dtl_reassess(rvd, 0, 0, B_FALSE); - spa_config_exit(spa, SCL_ALL, FTAG); - - return (0); -} - -static int -spa_ld_load_dedup_tables(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - error = ddt_load(spa); - if (error != 0) { - spa_load_failed(spa, "ddt_load failed [error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - return (0); -} - -static int -spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) -{ - vdev_t *rvd = spa->spa_root_vdev; - - if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { - boolean_t missing = spa_check_logs(spa); - if (missing) { - if (spa->spa_missing_tvds != 0) { - spa_load_note(spa, "spa_check_logs failed " - "so dropping the logs"); - } else { - *ereport = FM_EREPORT_ZFS_LOG_REPLAY; - spa_load_failed(spa, "spa_check_logs failed"); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, - ENXIO)); - } - } - } - - return (0); -} - -static int -spa_ld_verify_pool_data(spa_t *spa) -{ - int error = 0; - vdev_t *rvd = spa->spa_root_vdev; - - /* - * We've successfully opened the pool, verify that we're ready - * to start pushing transactions. - */ - if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { - error = spa_load_verify(spa); - if (error != 0) { - spa_load_failed(spa, "spa_load_verify failed " - "[error=%d]", error); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - error)); - } - } - - return (0); -} - -static void -spa_ld_claim_log_blocks(spa_t *spa) -{ - dmu_tx_t *tx; - dsl_pool_t *dp = spa_get_dsl(spa); - - /* - * Claim log blocks that haven't been committed yet. - * This must all happen in a single txg. - * Note: spa_claim_max_txg is updated by spa_claim_notify(), - * invoked from zil_claim_log_block()'s i/o done callback. - * Price of rollback is that we abandon the log. - */ - spa->spa_claiming = B_TRUE; - - tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); - (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - zil_claim, tx, DS_FIND_CHILDREN); - dmu_tx_commit(tx); - - spa->spa_claiming = B_FALSE; - - spa_set_log_state(spa, SPA_LOG_GOOD); -} - -static void -spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, - boolean_t update_config_cache) -{ - vdev_t *rvd = spa->spa_root_vdev; - int need_update = B_FALSE; - - /* - * If the config cache is stale, or we have uninitialized - * metaslabs (see spa_vdev_add()), then update the config. - * - * If this is a verbatim import, trust the current - * in-core spa_config and update the disk labels. - */ - if (update_config_cache || config_cache_txg != spa->spa_config_txg || - spa->spa_load_state == SPA_LOAD_IMPORT || - spa->spa_load_state == SPA_LOAD_RECOVER || - (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) - need_update = B_TRUE; - - for (int c = 0; c < rvd->vdev_children; c++) - if (rvd->vdev_child[c]->vdev_ms_array == 0) - need_update = B_TRUE; - - /* - * Update the config cache asychronously in case we're the - * root pool, in which case the config cache isn't writable yet. - */ - if (need_update) - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); -} - -static void -spa_ld_prepare_for_reload(spa_t *spa) -{ - int mode = spa->spa_mode; - int async_suspended = spa->spa_async_suspended; - - spa_unload(spa); - spa_deactivate(spa); - spa_activate(spa, mode); - - /* - * We save the value of spa_async_suspended as it gets reset to 0 by - * spa_unload(). We want to restore it back to the original value before - * returning as we might be calling spa_async_resume() later. 
- */ - spa->spa_async_suspended = async_suspended; -} - -static int -spa_ld_read_checkpoint_txg(spa_t *spa) -{ - uberblock_t checkpoint; - int error = 0; - - ASSERT0(spa->spa_checkpoint_txg); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), - sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); - - if (error == ENOENT) - return (0); - - if (error != 0) - return (error); - - ASSERT3U(checkpoint.ub_txg, !=, 0); - ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); - ASSERT3U(checkpoint.ub_timestamp, !=, 0); - spa->spa_checkpoint_txg = checkpoint.ub_txg; - spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; - - return (0); -} - -static int -spa_ld_mos_init(spa_t *spa, spa_import_type_t type) -{ - int error = 0; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); - - /* - * Never trust the config that is provided unless we are assembling - * a pool following a split. - * This means don't trust blkptrs and the vdev tree in general. This - * also effectively puts the spa in read-only mode since - * spa_writeable() checks for spa_trust_config to be true. - * We will later load a trusted config from the MOS. - */ - if (type != SPA_IMPORT_ASSEMBLE) - spa->spa_trust_config = B_FALSE; - - /* - * Parse the config provided to create a vdev tree. - */ - error = spa_ld_parse_config(spa, type); - if (error != 0) - return (error); - - /* - * Now that we have the vdev tree, try to open each vdev. This involves - * opening the underlying physical device, retrieving its geometry and - * probing the vdev with a dummy I/O. The state of each vdev will be set - * based on the success of those operations. After this we'll be ready - * to read from the vdevs. - */ - error = spa_ld_open_vdevs(spa); - if (error != 0) - return (error); - - /* - * Read the label of each vdev and make sure that the GUIDs stored - * there match the GUIDs in the config provided. - * If we're assembling a new pool that's been split off from an - * existing pool, the labels haven't yet been updated so we skip - * validation for now. - */ - if (type != SPA_IMPORT_ASSEMBLE) { - error = spa_ld_validate_vdevs(spa); - if (error != 0) - return (error); - } - - /* - * Read all vdev labels to find the best uberblock (i.e. latest, - * unless spa_load_max_txg is set) and store it in spa_uberblock. We - * get the list of features required to read blkptrs in the MOS from - * the vdev label with the best uberblock and verify that our version - * of zfs supports them all. - */ - error = spa_ld_select_uberblock(spa, type); - if (error != 0) - return (error); - - /* - * Pass that uberblock to the dsl_pool layer which will open the root - * blkptr. This blkptr points to the latest version of the MOS and will - * allow us to read its contents. 
- */ - error = spa_ld_open_rootbp(spa); - if (error != 0) - return (error); - - return (0); -} - -static int -spa_ld_checkpoint_rewind(spa_t *spa) -{ - uberblock_t checkpoint; - int error = 0; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); - - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), - sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); - - if (error != 0) { - spa_load_failed(spa, "unable to retrieve checkpointed " - "uberblock from the MOS config [error=%d]", error); - - if (error == ENOENT) - error = ZFS_ERR_NO_CHECKPOINT; - - return (error); - } - - ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); - ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); - - /* - * We need to update the txg and timestamp of the checkpointed - * uberblock to be higher than the latest one. This ensures that - * the checkpointed uberblock is selected if we were to close and - * reopen the pool right after we've written it in the vdev labels. - * (also see block comment in vdev_uberblock_compare) - */ - checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; - checkpoint.ub_timestamp = gethrestime_sec(); - - /* - * Set current uberblock to be the checkpointed uberblock. - */ - spa->spa_uberblock = checkpoint; - - /* - * If we are doing a normal rewind, then the pool is open for - * writing and we sync the "updated" checkpointed uberblock to - * disk. Once this is done, we've basically rewound the whole - * pool and there is no way back. - * - * There are cases when we don't want to attempt and sync the - * checkpointed uberblock to disk because we are opening a - * pool as read-only. Specifically, verifying the checkpointed - * state with zdb, and importing the checkpointed state to get - * a "preview" of its content. - */ - if (spa_writeable(spa)) { - vdev_t *rvd = spa->spa_root_vdev; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; - int svdcount = 0; - int children = rvd->vdev_children; - int c0 = spa_get_random(children); - - for (int c = 0; c < children; c++) { - vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; - - /* Stop when revisiting the first vdev */ - if (c > 0 && svd[0] == vd) - break; - - if (vd->vdev_ms_array == 0 || vd->vdev_islog || - !vdev_is_concrete(vd)) - continue; - - svd[svdcount++] = vd; - if (svdcount == SPA_SYNC_MIN_VDEVS) - break; - } - error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); - if (error == 0) - spa->spa_last_synced_guid = rvd->vdev_guid; - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) { - spa_load_failed(spa, "failed to write checkpointed " - "uberblock to the vdev labels [error=%d]", error); - return (error); - } - } - - return (0); -} - -static int -spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, - boolean_t *update_config_cache) -{ - int error; - - /* - * Parse the config for pool, open and validate vdevs, - * select an uberblock, and use that uberblock to open - * the MOS. - */ - error = spa_ld_mos_init(spa, type); - if (error != 0) - return (error); - - /* - * Retrieve the trusted config stored in the MOS and use it to create - * a new, exact version of the vdev tree, then reopen all vdevs. 
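/*
 * A small sketch of why spa_ld_checkpoint_rewind() above sets the
 * checkpointed uberblock's ub_txg to the current txg + 1 (and refreshes its
 * timestamp): uberblock selection prefers the highest txg, then the newest
 * timestamp, so once the bumped checkpoint is written to the labels it is
 * guaranteed to win the next search.  The struct and comparator below are
 * simplified assumptions; the real vdev_uberblock_compare() also considers
 * additional fields such as the MMP sequence number.
 */
#include <stdint.h>

struct ub_sketch {
	uint64_t ub_txg;
	uint64_t ub_timestamp;
};

/* Returns 1 if a is preferred, -1 if b is preferred, 0 if equivalent. */
static int
ub_compare_sketch(const struct ub_sketch *a, const struct ub_sketch *b)
{
	if (a->ub_txg != b->ub_txg)
		return (a->ub_txg > b->ub_txg ? 1 : -1);
	if (a->ub_timestamp != b->ub_timestamp)
		return (a->ub_timestamp > b->ub_timestamp ? 1 : -1);
	return (0);
}
/*
 * With checkpoint.ub_txg == latest.ub_txg + 1, ub_compare_sketch() always
 * prefers the checkpointed block, so the rewind survives an immediate
 * close-and-reopen of the pool.
 */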
- */ - error = spa_ld_trusted_config(spa, type, B_FALSE); - if (error == EAGAIN) { - if (update_config_cache != NULL) - *update_config_cache = B_TRUE; - - /* - * Redo the loading process with the trusted config if it is - * too different from the untrusted config. - */ - spa_ld_prepare_for_reload(spa); - spa_load_note(spa, "RELOADING"); - error = spa_ld_mos_init(spa, type); - if (error != 0) - return (error); - - error = spa_ld_trusted_config(spa, type, B_TRUE); - if (error != 0) - return (error); - - } else if (error != 0) { - return (error); - } - - return (0); -} - -/* - * Load an existing storage pool, using the config provided. This config - * describes which vdevs are part of the pool and is later validated against - * partial configs present in each vdev's label and an entire copy of the - * config stored in the MOS. - */ -static int -spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) -{ - int error = 0; - boolean_t missing_feat_write = B_FALSE; - boolean_t checkpoint_rewind = - (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); - boolean_t update_config_cache = B_FALSE; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); - - spa_load_note(spa, "LOADING"); - - error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); - if (error != 0) - return (error); - - /* - * If we are rewinding to the checkpoint then we need to repeat - * everything we've done so far in this function but this time - * selecting the checkpointed uberblock and using that to open - * the MOS. - */ - if (checkpoint_rewind) { - /* - * If we are rewinding to the checkpoint update config cache - * anyway. - */ - update_config_cache = B_TRUE; - - /* - * Extract the checkpointed uberblock from the current MOS - * and use this as the pool's uberblock from now on. If the - * pool is imported as writeable we also write the checkpoint - * uberblock to the labels, making the rewind permanent. - */ - error = spa_ld_checkpoint_rewind(spa); - if (error != 0) - return (error); - - /* - * Redo the loading process process again with the - * checkpointed uberblock. - */ - spa_ld_prepare_for_reload(spa); - spa_load_note(spa, "LOADING checkpointed uberblock"); - error = spa_ld_mos_with_trusted_config(spa, type, NULL); - if (error != 0) - return (error); - } - - /* - * Retrieve the checkpoint txg if the pool has a checkpoint. - */ - error = spa_ld_read_checkpoint_txg(spa); - if (error != 0) - return (error); - - /* - * Retrieve the mapping of indirect vdevs. Those vdevs were removed - * from the pool and their contents were re-mapped to other vdevs. Note - * that everything that we read before this step must have been - * rewritten on concrete vdevs after the last device removal was - * initiated. Otherwise we could be reading from indirect vdevs before - * we have loaded their mappings. - */ - error = spa_ld_open_indirect_vdev_metadata(spa); - if (error != 0) - return (error); - - /* - * Retrieve the full list of active features from the MOS and check if - * they are all supported. - */ - error = spa_ld_check_features(spa, &missing_feat_write); - if (error != 0) - return (error); - - /* - * Load several special directories from the MOS needed by the dsl_pool - * layer. - */ - error = spa_ld_load_special_directories(spa); - if (error != 0) - return (error); - - /* - * Retrieve pool properties from the MOS. 
- */ - error = spa_ld_get_props(spa); - if (error != 0) - return (error); - - /* - * Retrieve the list of auxiliary devices - cache devices and spares - - * and open them. - */ - error = spa_ld_open_aux_vdevs(spa, type); - if (error != 0) - return (error); - - /* - * Load the metadata for all vdevs. Also check if unopenable devices - * should be autoreplaced. - */ - error = spa_ld_load_vdev_metadata(spa); - if (error != 0) - return (error); - - error = spa_ld_load_dedup_tables(spa); - if (error != 0) - return (error); - - /* - * Verify the logs now to make sure we don't have any unexpected errors - * when we claim log blocks later. - */ - error = spa_ld_verify_logs(spa, type, ereport); - if (error != 0) - return (error); - - if (missing_feat_write) { - ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); - - /* - * At this point, we know that we can open the pool in - * read-only mode but not read-write mode. We now have enough - * information and can return to userland. - */ - return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); - } - - /* - * Traverse the last txgs to make sure the pool was left off in a safe - * state. When performing an extreme rewind, we verify the whole pool, - * which can take a very long time. - */ - error = spa_ld_verify_pool_data(spa); - if (error != 0) - return (error); - - /* - * Calculate the deflated space for the pool. This must be done before - * we write anything to the pool because we'd need to update the space - * accounting using the deflated sizes. - */ - spa_update_dspace(spa); - - /* - * We have now retrieved all the information we needed to open the - * pool. If we are importing the pool in read-write mode, a few - * additional steps must be performed to finish the import. - */ - if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || - spa->spa_load_max_txg == UINT64_MAX)) { - uint64_t config_cache_txg = spa->spa_config_txg; - - ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); - - /* - * In case of a checkpoint rewind, log the original txg - * of the checkpointed uberblock. - */ - if (checkpoint_rewind) { - spa_history_log_internal(spa, "checkpoint rewind", - NULL, "rewound state to txg=%llu", - (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); - } - - /* - * Traverse the ZIL and claim all blocks. - */ - spa_ld_claim_log_blocks(spa); - - /* - * Kick-off the syncing thread. - */ - spa->spa_sync_on = B_TRUE; - txg_sync_start(spa->spa_dsl_pool); - mmp_thread_start(spa); - - /* - * Wait for all claims to sync. We sync up to the highest - * claimed log block birth time so that claimed log blocks - * don't appear to be from the future. spa_claim_max_txg - * will have been set for us by ZIL traversal operations - * performed above. - */ - txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); - - /* - * Check if we need to request an update of the config. On the - * next sync, we would update the config stored in vdev labels - * and the cachefile (by default /etc/zfs/zpool.cache). - */ - spa_ld_check_for_config_update(spa, config_cache_txg, - update_config_cache); - - /* - * Check all DTLs to see if anything needs resilvering. - */ - if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - spa_async_request(spa, SPA_ASYNC_RESILVER); - - /* - * Log the fact that we booted up (so that we can detect if - * we rebooted in the middle of an operation). 
- */ - spa_history_log_version(spa, "open"); - - spa_restart_removal(spa); - spa_spawn_aux_threads(spa); - - /* - * Delete any inconsistent datasets. - * - * Note: - * Since we may be issuing deletes for clones here, - * we make sure to do so after we've spawned all the - * auxiliary threads above (from which the livelist - * deletion zthr is part of). - */ - (void) dmu_objset_find(spa_name(spa), - dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); - - /* - * Clean up any stale temporary dataset userrefs. - */ - dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vdev_initialize_restart(spa->spa_root_vdev); - spa_config_exit(spa, SCL_CONFIG, FTAG); - } - - spa_load_note(spa, "LOADED"); - - return (0); -} - -static int -spa_load_retry(spa_t *spa, spa_load_state_t state) -{ - int mode = spa->spa_mode; - - spa_unload(spa); - spa_deactivate(spa); - - spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; - - spa_activate(spa, mode); - spa_async_suspend(spa); - - spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", - (u_longlong_t)spa->spa_load_max_txg); - - return (spa_load(spa, state, SPA_IMPORT_EXISTING)); -} - -/* - * If spa_load() fails this function will try loading prior txg's. If - * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool - * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this - * function will not rewind the pool and will return the same error as - * spa_load(). - */ -static int -spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, - int rewind_flags) -{ - nvlist_t *loadinfo = NULL; - nvlist_t *config = NULL; - int load_error, rewind_error; - uint64_t safe_rewind_txg; - uint64_t min_txg; - - if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { - spa->spa_load_max_txg = spa->spa_load_txg; - spa_set_log_state(spa, SPA_LOG_CLEAR); - } else { - spa->spa_load_max_txg = max_request; - if (max_request != UINT64_MAX) - spa->spa_extreme_rewind = B_TRUE; - } - - load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); - if (load_error == 0) - return (0); - if (load_error == ZFS_ERR_NO_CHECKPOINT) { - /* - * When attempting checkpoint-rewind on a pool with no - * checkpoint, we should not attempt to load uberblocks - * from previous txgs when spa_load fails. - */ - ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); - return (load_error); - } - - if (spa->spa_root_vdev != NULL) - config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - - spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; - spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; - - if (rewind_flags & ZPOOL_NEVER_REWIND) { - nvlist_free(config); - return (load_error); - } - - if (state == SPA_LOAD_RECOVER) { - /* Price of rolling back is discarding txgs, including log */ - spa_set_log_state(spa, SPA_LOG_CLEAR); - } else { - /* - * If we aren't rolling back save the load info from our first - * import attempt so that we can restore it after attempting - * to rewind. - */ - loadinfo = spa->spa_load_info; - spa->spa_load_info = fnvlist_alloc(); - } - - spa->spa_load_max_txg = spa->spa_last_ubsync_txg; - safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; - min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
- TXG_INITIAL : safe_rewind_txg; - - /* - * Continue as long as we're finding errors, we're still within - * the acceptable rewind range, and we're still finding uberblocks - */ - while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && - spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { - if (spa->spa_load_max_txg < safe_rewind_txg) - spa->spa_extreme_rewind = B_TRUE; - rewind_error = spa_load_retry(spa, state); - } - - spa->spa_extreme_rewind = B_FALSE; - spa->spa_load_max_txg = UINT64_MAX; - - if (config && (rewind_error || state != SPA_LOAD_RECOVER)) - spa_config_set(spa, config); - else - nvlist_free(config); - - if (state == SPA_LOAD_RECOVER) { - ASSERT3P(loadinfo, ==, NULL); - return (rewind_error); - } else { - /* Store the rewind info as part of the initial load info */ - fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, - spa->spa_load_info); - - /* Restore the initial load info */ - fnvlist_free(spa->spa_load_info); - spa->spa_load_info = loadinfo; - - return (load_error); - } -} - -/* - * Pool Open/Import - * - * The import case is identical to an open except that the configuration is sent - * down from userland, instead of grabbed from the configuration cache. For the - * case of an open, the pool configuration will exist in the - * POOL_STATE_UNINITIALIZED state. - * - * The stats information (gen/count/ustats) is used to gather vdev statistics at - * the same time open the pool, without having to keep around the spa_t in some - * ambiguous state. - */ -static int -spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, - nvlist_t **config) -{ - spa_t *spa; - spa_load_state_t state = SPA_LOAD_OPEN; - int error; - int locked = B_FALSE; - int firstopen = B_FALSE; - - *spapp = NULL; - - /* - * As disgusting as this is, we need to support recursive calls to this - * function because dsl_dir_open() is called during spa_load(), and ends - * up calling spa_open() again. The real fix is to figure out how to - * avoid dsl_dir_open() calling this in the first place. - */ - if (mutex_owner(&spa_namespace_lock) != curthread) { - mutex_enter(&spa_namespace_lock); - locked = B_TRUE; - } - - if ((spa = spa_lookup(pool)) == NULL) { - if (locked) - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ENOENT)); - } - - if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - zpool_load_policy_t policy; - - firstopen = B_TRUE; - - zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, - &policy); - if (policy.zlp_rewind & ZPOOL_DO_REWIND) - state = SPA_LOAD_RECOVER; - - spa_activate(spa, spa_mode_global); - - if (state != SPA_LOAD_RECOVER) - spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; - spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; - - zfs_dbgmsg("spa_open_common: opening %s", pool); - error = spa_load_best(spa, state, policy.zlp_txg, - policy.zlp_rewind); - - if (error == EBADF) { - /* - * If vdev_validate() returns failure (indicated by - * EBADF), it indicates that one of the vdevs indicates - * that the pool has been exported or destroyed. If - * this is the case, the config cache is out of sync and - * we should remove the pool from the namespace. - */ - spa_unload(spa); - spa_deactivate(spa); - spa_write_cachefile(spa, B_TRUE, B_TRUE); - spa_remove(spa); - if (locked) - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ENOENT)); - } - - if (error) { - /* - * We can't open the pool, but we still have useful - * information: the state of each vdev after the - * attempted vdev_open(). Return this to the user. 
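/*
 * A worked sketch of the rewind window used by spa_load_best() and
 * spa_load_retry() above, assuming the stock values TXG_DEFER_SIZE == 2 and
 * TXG_INITIAL == 4 (renamed with an SK_ prefix here to make the assumption
 * explicit).  Each retry lowers spa_load_max_txg to one below the txg of the
 * uberblock that just failed, and retries stop once the selected txg drops
 * below the floor computed here.
 */
#include <stdint.h>
#include <stdbool.h>

#define	SK_TXG_DEFER_SIZE	2
#define	SK_TXG_INITIAL		4

static uint64_t
rewind_floor(uint64_t last_ubsync_txg, bool extreme_rewind)
{
	uint64_t safe_rewind_txg = last_ubsync_txg - SK_TXG_DEFER_SIZE;

	return (extreme_rewind ? SK_TXG_INITIAL : safe_rewind_txg);
}
/*
 * Example: with last_ubsync_txg == 1000 and no extreme rewind, the floor is
 * 998; retries walk spa_load_max_txg down through 999, 998, ... and stop as
 * soon as a retry selects an uberblock with txg < 998.  With
 * ZPOOL_EXTREME_REWIND the floor drops all the way to the initial txg.
 */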
- */ - if (config != NULL && spa->spa_config) { - VERIFY(nvlist_dup(spa->spa_config, config, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist(*config, - ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); - } - spa_unload(spa); - spa_deactivate(spa); - spa->spa_last_open_failed = error; - if (locked) - mutex_exit(&spa_namespace_lock); - *spapp = NULL; - return (error); - } - } - - spa_open_ref(spa, tag); - - if (config != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - - /* - * If we've recovered the pool, pass back any information we - * gathered while doing the load. - */ - if (state == SPA_LOAD_RECOVER) { - VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); - } - - if (locked) { - spa->spa_last_open_failed = 0; - spa->spa_last_ubsync_txg = 0; - spa->spa_load_txg = 0; - mutex_exit(&spa_namespace_lock); -#ifdef __FreeBSD__ -#ifdef _KERNEL - if (firstopen) - zvol_create_minors(spa, spa->spa_name); -#endif -#endif - } - - *spapp = spa; - - return (0); -} - -int -spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, - nvlist_t **config) -{ - return (spa_open_common(name, spapp, tag, policy, config)); -} - -int -spa_open(const char *name, spa_t **spapp, void *tag) -{ - return (spa_open_common(name, spapp, tag, NULL, NULL)); -} - -/* - * Lookup the given spa_t, incrementing the inject count in the process, - * preventing it from being exported or destroyed. - */ -spa_t * -spa_inject_addref(char *name) -{ - spa_t *spa; - - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(name)) == NULL) { - mutex_exit(&spa_namespace_lock); - return (NULL); - } - spa->spa_inject_ref++; - mutex_exit(&spa_namespace_lock); - - return (spa); -} - -void -spa_inject_delref(spa_t *spa) -{ - mutex_enter(&spa_namespace_lock); - spa->spa_inject_ref--; - mutex_exit(&spa_namespace_lock); -} - -/* - * Add spares device information to the nvlist. - */ -static void -spa_add_spares(spa_t *spa, nvlist_t *config) -{ - nvlist_t **spares; - uint_t i, nspares; - nvlist_t *nvroot; - uint64_t guid; - vdev_stat_t *vs; - uint_t vsc; - uint64_t pool; - - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - - if (spa->spa_spares.sav_count == 0) - return; - - VERIFY(nvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - if (nspares != 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - - /* - * Go through and find any spares which have since been - * repurposed as an active spare. If this is the case, update - * their status appropriately. - */ - for (i = 0; i < nspares; i++) { - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &guid) == 0); - if (spa_spare_exists(guid, &pool, NULL) && - pool != 0ULL) { - VERIFY(nvlist_lookup_uint64_array( - spares[i], ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &vsc) == 0); - vs->vs_state = VDEV_STATE_CANT_OPEN; - vs->vs_aux = VDEV_AUX_SPARED; - } - } - } -} - -/* - * Add l2cache device information to the nvlist, including vdev stats. 
- */ -static void -spa_add_l2cache(spa_t *spa, nvlist_t *config) -{ - nvlist_t **l2cache; - uint_t i, j, nl2cache; - nvlist_t *nvroot; - uint64_t guid; - vdev_t *vd; - vdev_stat_t *vs; - uint_t vsc; - - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - - if (spa->spa_l2cache.sav_count == 0) - return; - - VERIFY(nvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); - if (nl2cache != 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvroot, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); - - /* - * Update level 2 cache device stats. - */ - - for (i = 0; i < nl2cache; i++) { - VERIFY(nvlist_lookup_uint64(l2cache[i], - ZPOOL_CONFIG_GUID, &guid) == 0); - - vd = NULL; - for (j = 0; j < spa->spa_l2cache.sav_count; j++) { - if (guid == - spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { - vd = spa->spa_l2cache.sav_vdevs[j]; - break; - } - } - ASSERT(vd != NULL); - - VERIFY(nvlist_lookup_uint64_array(l2cache[i], - ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) - == 0); - vdev_get_stats(vd, vs); - } - } -} - -static void -spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) -{ - zap_cursor_t zc; - zap_attribute_t za; - - /* We may be unable to read features if pool is suspended. */ - if (spa_suspended(spa)) - return; - - if (spa->spa_feat_for_read_obj != 0) { - for (zap_cursor_init(&zc, spa->spa_meta_objset, - spa->spa_feat_for_read_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - ASSERT(za.za_integer_length == sizeof (uint64_t) && - za.za_num_integers == 1); - VERIFY0(nvlist_add_uint64(features, za.za_name, - za.za_first_integer)); - } - zap_cursor_fini(&zc); - } - - if (spa->spa_feat_for_write_obj != 0) { - for (zap_cursor_init(&zc, spa->spa_meta_objset, - spa->spa_feat_for_write_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - ASSERT(za.za_integer_length == sizeof (uint64_t) && - za.za_num_integers == 1); - VERIFY0(nvlist_add_uint64(features, za.za_name, - za.za_first_integer)); - } - zap_cursor_fini(&zc); - } -} - -static void -spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) -{ - int i; - - for (i = 0; i < SPA_FEATURES; i++) { - zfeature_info_t feature = spa_feature_table[i]; - uint64_t refcount; - - if (feature_get_refcount(spa, &feature, &refcount) != 0) - continue; - - VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); - } -} - -/* - * Store a list of pool features and their reference counts in the - * config. - * - * The first time this is called on a spa, allocate a new nvlist, fetch - * the pool features and reference counts from disk, then save the list - * in the spa. In subsequent calls on the same spa use the saved nvlist - * and refresh its values from the cached reference counts. This - * ensures we don't block here on I/O on a suspended pool so 'zpool - * clear' can resume the pool. 
- */ -static void -spa_add_feature_stats(spa_t *spa, nvlist_t *config) -{ - nvlist_t *features; - - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - - mutex_enter(&spa->spa_feat_stats_lock); - features = spa->spa_feat_stats; - - if (features != NULL) { - spa_feature_stats_from_cache(spa, features); - } else { - VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); - spa->spa_feat_stats = features; - spa_feature_stats_from_disk(spa, features); - } - - VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, - features)); - - mutex_exit(&spa->spa_feat_stats_lock); -} - -int -spa_get_stats(const char *name, nvlist_t **config, - char *altroot, size_t buflen) -{ - int error; - spa_t *spa; - - *config = NULL; - error = spa_open_common(name, &spa, FTAG, NULL, config); - - if (spa != NULL) { - /* - * This still leaves a window of inconsistency where the spares - * or l2cache devices could change and the config would be - * self-inconsistent. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - if (*config != NULL) { - uint64_t loadtimes[2]; - - loadtimes[0] = spa->spa_loaded_ts.tv_sec; - loadtimes[1] = spa->spa_loaded_ts.tv_nsec; - VERIFY(nvlist_add_uint64_array(*config, - ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); - - VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)) == 0); - - if (spa_suspended(spa)) { - VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_SUSPENDED, - spa->spa_failmode) == 0); - VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_SUSPENDED_REASON, - spa->spa_suspended) == 0); - } - - spa_add_spares(spa, *config); - spa_add_l2cache(spa, *config); - spa_add_feature_stats(spa, *config); - } - } - - /* - * We want to get the alternate root even for faulted pools, so we cheat - * and call spa_lookup() directly. - */ - if (altroot) { - if (spa == NULL) { - mutex_enter(&spa_namespace_lock); - spa = spa_lookup(name); - if (spa) - spa_altroot(spa, altroot, buflen); - else - altroot[0] = '\0'; - spa = NULL; - mutex_exit(&spa_namespace_lock); - } else { - spa_altroot(spa, altroot, buflen); - } - } - - if (spa != NULL) { - spa_config_exit(spa, SCL_CONFIG, FTAG); - spa_close(spa, FTAG); - } - - return (error); -} - -/* - * Validate that the auxiliary device array is well formed. We must have an - * array of nvlists, each which describes a valid leaf vdev. If this is an - * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be - * specified, as long as they are well-formed. - */ -static int -spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - spa_aux_vdev_t *sav, const char *config, uint64_t version, - vdev_labeltype_t label) -{ - nvlist_t **dev; - uint_t i, ndev; - vdev_t *vd; - int error; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - /* - * It's acceptable to have no devs specified. - */ - if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) - return (0); - - if (ndev == 0) - return (SET_ERROR(EINVAL)); - - /* - * Make sure the pool is formatted with a version that supports this - * device type. - */ - if (spa_version(spa) < version) - return (SET_ERROR(ENOTSUP)); - - /* - * Set the pending device list so we correctly handle device in-use - * checking. 
- */ - sav->sav_pending = dev; - sav->sav_npending = ndev; - - for (i = 0; i < ndev; i++) { - if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, - mode)) != 0) - goto out; - - if (!vd->vdev_ops->vdev_op_leaf) { - vdev_free(vd); - error = SET_ERROR(EINVAL); - goto out; - } - - vd->vdev_top = vd; - - if ((error = vdev_open(vd)) == 0 && - (error = vdev_label_init(vd, crtxg, label)) == 0) { - VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } - - vdev_free(vd); - - if (error && - (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) - goto out; - else - error = 0; - } - -out: - sav->sav_pending = NULL; - sav->sav_npending = 0; - return (error); -} - -static int -spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) -{ - int error; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, - &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, - VDEV_LABEL_SPARE)) != 0) { - return (error); - } - - return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, - &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, - VDEV_LABEL_L2CACHE)); -} - -static void -spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, - const char *config) -{ - int i; - - if (sav->sav_config != NULL) { - nvlist_t **olddevs; - uint_t oldndevs; - nvlist_t **newdevs; - - /* - * Generate new dev list by concatentating with the - * current dev list. - */ - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, - &olddevs, &oldndevs) == 0); - - newdevs = kmem_alloc(sizeof (void *) * - (ndevs + oldndevs), KM_SLEEP); - for (i = 0; i < oldndevs; i++) - VERIFY(nvlist_dup(olddevs[i], &newdevs[i], - KM_SLEEP) == 0); - for (i = 0; i < ndevs; i++) - VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], - KM_SLEEP) == 0); - - VERIFY(nvlist_remove(sav->sav_config, config, - DATA_TYPE_NVLIST_ARRAY) == 0); - - VERIFY(nvlist_add_nvlist_array(sav->sav_config, - config, newdevs, ndevs + oldndevs) == 0); - for (i = 0; i < oldndevs + ndevs; i++) - nvlist_free(newdevs[i]); - kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); - } else { - /* - * Generate a new dev list. - */ - VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, - devs, ndevs) == 0); - } -} - -/* - * Stop and drop level 2 ARC devices - */ -void -spa_l2cache_drop(spa_t *spa) -{ - vdev_t *vd; - int i; - spa_aux_vdev_t *sav = &spa->spa_l2cache; - - for (i = 0; i < sav->sav_count; i++) { - uint64_t pool; - - vd = sav->sav_vdevs[i]; - ASSERT(vd != NULL); - - if (spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && l2arc_vdev_present(vd)) - l2arc_remove_vdev(vd); - } -} - -/* - * Pool Creation - */ -int -spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - nvlist_t *zplprops) -{ - spa_t *spa; - char *altroot = NULL; - vdev_t *rvd; - dsl_pool_t *dp; - dmu_tx_t *tx; - int error = 0; - uint64_t txg = TXG_INITIAL; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; - uint64_t version, obj; - boolean_t has_features; - char *poolname; - nvlist_t *nvl; - - if (props == NULL || - nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) - poolname = (char *)pool; - - /* - * If this pool already exists, return failure. - */ - mutex_enter(&spa_namespace_lock); - if (spa_lookup(poolname) != NULL) { - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EEXIST)); - } - - /* - * Allocate a new spa_t structure. 
- */ - nvl = fnvlist_alloc(); - fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(poolname, nvl, altroot); - fnvlist_free(nvl); - spa_activate(spa, spa_mode_global); - - if (props && (error = spa_prop_validate(spa, props))) { - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - return (error); - } - - /* - * Temporary pool names should never be written to disk. - */ - if (poolname != pool) - spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; - - has_features = B_FALSE; - for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); - elem != NULL; elem = nvlist_next_nvpair(props, elem)) { - if (zpool_prop_feature(nvpair_name(elem))) - has_features = B_TRUE; - } - - if (has_features || nvlist_lookup_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { - version = SPA_VERSION; - } - ASSERT(SPA_VERSION_IS_SUPPORTED(version)); - - spa->spa_first_txg = txg; - spa->spa_uberblock.ub_txg = txg - 1; - spa->spa_uberblock.ub_version = version; - spa->spa_ubsync = spa->spa_uberblock; - spa->spa_load_state = SPA_LOAD_CREATE; - spa->spa_removing_phys.sr_state = DSS_NONE; - spa->spa_removing_phys.sr_removing_vdev = -1; - spa->spa_removing_phys.sr_prev_indirect_vdev = -1; - spa->spa_indirect_vdevs_loaded = B_TRUE; - - /* - * Create "The Godfather" zio to hold all async IOs - */ - spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), - KM_SLEEP); - for (int i = 0; i < max_ncpus; i++) { - spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_GODFATHER); - } - - /* - * Create the root vdev. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); - - ASSERT(error != 0 || rvd != NULL); - ASSERT(error != 0 || spa->spa_root_vdev == rvd); - - if (error == 0 && !zfs_allocatable_devs(nvroot)) - error = SET_ERROR(EINVAL); - - if (error == 0 && - (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_aux(spa, nvroot, txg, - VDEV_ALLOC_ADD)) == 0) { - /* - * instantiate the metaslab groups (this will dirty the vdevs) - * we can no longer error exit past this point - */ - for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - - vdev_ashift_optimize(vd); - vdev_metaslab_set_size(vd); - vdev_expand(vd, txg); - } - } - - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0) { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - return (error); - } - - /* - * Get the list of spares, if specified. - */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_spares(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_spares.sav_sync = B_TRUE; - } - - /* - * Get the list of level 2 cache devices, if specified. 
- */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_l2cache(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_l2cache.sav_sync = B_TRUE; - } - - spa->spa_is_initializing = B_TRUE; - spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); - spa->spa_meta_objset = dp->dp_meta_objset; - spa->spa_is_initializing = B_FALSE; - - /* - * Create DDTs (dedup tables). - */ - ddt_create(spa); - - spa_update_dspace(spa); - - tx = dmu_tx_create_assigned(dp, txg); - - /* - * Create the pool config object. - */ - spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, - DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); - - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { - cmn_err(CE_PANIC, "failed to add pool config"); - } - - if (spa_version(spa) >= SPA_VERSION_FEATURES) - spa_feature_create_zap_objects(spa, tx); - - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, - sizeof (uint64_t), 1, &version, tx) != 0) { - cmn_err(CE_PANIC, "failed to add pool version"); - } - - /* Newly created pools with the right version are always deflated. */ - if (version >= SPA_VERSION_RAIDZ_DEFLATE) { - spa->spa_deflate = TRUE; - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { - cmn_err(CE_PANIC, "failed to add deflate"); - } - } - - /* - * Create the deferred-free bpobj. Turn off compression - * because sync-to-convergence takes longer if the blocksize - * keeps changing. - */ - obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); - dmu_object_set_compress(spa->spa_meta_objset, obj, - ZIO_COMPRESS_OFF, tx); - if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, - sizeof (uint64_t), 1, &obj, tx) != 0) { - cmn_err(CE_PANIC, "failed to add bpobj"); - } - VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, - spa->spa_meta_objset, obj)); - - /* - * Create the pool's history object. - */ - if (version >= SPA_VERSION_ZPOOL_HISTORY) - spa_history_create_obj(spa, tx); - - /* - * Generate some random noise for salted checksums to operate on. - */ - (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, - sizeof (spa->spa_cksum_salt.zcs_bytes)); - - /* - * Set pool properties. - */ - spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); - spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); - spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); - - if (props != NULL) { - spa_configfile_set(spa, props, B_FALSE); - spa_sync_props(props, tx); - } - - dmu_tx_commit(tx); - - spa->spa_sync_on = B_TRUE; - txg_sync_start(spa->spa_dsl_pool); - mmp_thread_start(spa); - - /* - * We explicitly wait for the first transaction to complete so that our - * bean counters are appropriately updated. 
- */ - txg_wait_synced(spa->spa_dsl_pool, txg); - - spa_spawn_aux_threads(spa); - - spa_write_cachefile(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); - - spa_history_log_version(spa, "create"); - - /* - * Don't count references from objsets that are already closed - * and are making their way through the eviction process. - */ - spa_evicting_os_wait(spa); - spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); - spa->spa_load_state = SPA_LOAD_NONE; - - mutex_exit(&spa_namespace_lock); - - return (0); -} - -#ifdef _KERNEL -#ifdef illumos -/* - * Get the root pool information from the root disk, then import the root pool - * during the system boot up time. - */ -extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); - -static nvlist_t * -spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) -{ - nvlist_t *config; - nvlist_t *nvtop, *nvroot; - uint64_t pgid; - - if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) - return (NULL); - - /* - * Add this top-level vdev to the child array. - */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &pgid) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); - - /* - * Put this pool's top-level vdevs into a root vdev. - */ - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &nvtop, 1) == 0); - - /* - * Replace the existing vdev_tree with the new root vdev in - * this pool's configuration (remove the old, add the new). - */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); - nvlist_free(nvroot); - return (config); -} - -/* - * Walk the vdev tree and see if we can find a device with "better" - * configuration. A configuration is "better" if the label on that - * device has a more recent txg. - */ -static void -spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) -{ - for (int c = 0; c < vd->vdev_children; c++) - spa_alt_rootvdev(vd->vdev_child[c], avd, txg); - - if (vd->vdev_ops->vdev_op_leaf) { - nvlist_t *label; - uint64_t label_txg; - - if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, - &label) != 0) - return; - - VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, - &label_txg) == 0); - - /* - * Do we have a better boot device? - */ - if (label_txg > *txg) { - *txg = label_txg; - *avd = vd; - } - nvlist_free(label); - } -} - -/* - * Import a root pool. - * - * For x86. devpath_list will consist of devid and/or physpath name of - * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). - * The GRUB "findroot" command will return the vdev we should boot. - * - * For Sparc, devpath_list consists the physpath name of the booting device - * no matter the rootpool is a single device pool or a mirrored pool. - * e.g. - * "/pci@1f,0/ide@d/disk@0,0:a" - */ -int -spa_import_rootpool(char *devpath, char *devid) -{ - spa_t *spa; - vdev_t *rvd, *bvd, *avd = NULL; - nvlist_t *config, *nvtop; - uint64_t guid, txg; - char *pname; - int error; - - /* - * Read the label from the boot device and generate a configuration. 
- */ - config = spa_generate_rootconf(devpath, devid, &guid); -#if defined(_OBP) && defined(_KERNEL) - if (config == NULL) { - if (strstr(devpath, "/iscsi/ssd") != NULL) { - /* iscsi boot */ - get_iscsi_bootpath_phy(devpath); - config = spa_generate_rootconf(devpath, devid, &guid); - } - } -#endif - if (config == NULL) { - cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", - devpath); - return (SET_ERROR(EIO)); - } - - VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &pname) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); - - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pname)) != NULL) { - /* - * Remove the existing root pool from the namespace so that we - * can replace it with the correct config we just read in. - */ - spa_remove(spa); - } - - spa = spa_add(pname, config, NULL); - spa->spa_is_root = B_TRUE; - spa->spa_import_flags = ZFS_IMPORT_VERBATIM; - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &spa->spa_ubsync.ub_version) != 0) - spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; - - /* - * Build up a vdev tree based on the boot device's label config. - */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, - VDEV_ALLOC_ROOTPOOL); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error) { - mutex_exit(&spa_namespace_lock); - nvlist_free(config); - cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", - pname); - return (error); - } - - /* - * Get the boot vdev. - */ - if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { - cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", - (u_longlong_t)guid); - error = SET_ERROR(ENOENT); - goto out; - } - - /* - * Determine if there is a better boot device. - */ - avd = bvd; - spa_alt_rootvdev(rvd, &avd, &txg); - if (avd != bvd) { - cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " - "try booting from '%s'", avd->vdev_path); - error = SET_ERROR(EINVAL); - goto out; - } - - /* - * If the boot device is part of a spare vdev then ensure that - * we're booting off the active spare. - */ - if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && - !bvd->vdev_isspare) { - cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " - "try booting from '%s'", - bvd->vdev_parent-> - vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); - error = SET_ERROR(EINVAL); - goto out; - } - - error = 0; -out: - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_free(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - mutex_exit(&spa_namespace_lock); - - nvlist_free(config); - return (error); -} - -#else /* !illumos */ - -extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, - uint64_t *count); - -static nvlist_t * -spa_generate_rootconf(const char *name) -{ - nvlist_t **configs, **tops; - nvlist_t *config; - nvlist_t *best_cfg, *nvtop, *nvroot; - uint64_t *holes; - uint64_t best_txg; - uint64_t nchildren; - uint64_t pgid; - uint64_t count; - uint64_t i; - uint_t nholes; - - if (vdev_geom_read_pool_label(name, &configs, &count) != 0) - return (NULL); - - ASSERT3U(count, !=, 0); - best_txg = 0; - for (i = 0; i < count; i++) { - uint64_t txg; - - VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, - &txg) == 0); - if (txg > best_txg) { - best_txg = txg; - best_cfg = configs[i]; - } - } - - nchildren = 1; - nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); - holes = NULL; - nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, - &holes, &nholes); - - tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); - for (i = 0; i < nchildren; i++) { - if (i >= count) - break; - if (configs[i] == NULL) - continue; - VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - nvlist_dup(nvtop, &tops[i], KM_SLEEP); - } - for (i = 0; holes != NULL && i < nholes; i++) { - if (i >= nchildren) - continue; - if (tops[holes[i]] != NULL) - continue; - nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); - VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, - VDEV_TYPE_HOLE) == 0); - VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, - holes[i]) == 0); - VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, - 0) == 0); - } - for (i = 0; i < nchildren; i++) { - if (tops[i] != NULL) - continue; - nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); - VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, - VDEV_TYPE_MISSING) == 0); - VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, - i) == 0); - VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, - 0) == 0); - } - - /* - * Create pool config based on the best vdev config. - */ - nvlist_dup(best_cfg, &config, KM_SLEEP); - - /* - * Put this pool's top-level vdevs into a root vdev. - */ - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &pgid) == 0); - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_ROOT) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - tops, nchildren) == 0); - - /* - * Replace the existing vdev_tree with the new root vdev in - * this pool's configuration (remove the old, add the new). - */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); - - /* - * Drop vdev config elements that should not be present at pool level. 
- */ - nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); - nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); - - for (i = 0; i < count; i++) - nvlist_free(configs[i]); - kmem_free(configs, count * sizeof(void *)); - for (i = 0; i < nchildren; i++) - nvlist_free(tops[i]); - kmem_free(tops, nchildren * sizeof(void *)); - nvlist_free(nvroot); - return (config); -} - -int -spa_import_rootpool(const char *name) -{ - spa_t *spa; - vdev_t *rvd, *bvd, *avd = NULL; - nvlist_t *config, *nvtop; - uint64_t txg; - char *pname; - int error; - - /* - * Read the label from the boot device and generate a configuration. - */ - config = spa_generate_rootconf(name); - - mutex_enter(&spa_namespace_lock); - if (config != NULL) { - VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &pname) == 0 && strcmp(name, pname) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) - == 0); - - if ((spa = spa_lookup(pname)) != NULL) { - /* - * The pool could already be imported, - * e.g., after reboot -r. - */ - if (spa->spa_state == POOL_STATE_ACTIVE) { - mutex_exit(&spa_namespace_lock); - nvlist_free(config); - return (0); - } - - /* - * Remove the existing root pool from the namespace so - * that we can replace it with the correct config - * we just read in. - */ - spa_remove(spa); - } - spa = spa_add(pname, config, NULL); - - /* - * Set spa_ubsync.ub_version as it can be used in vdev_alloc() - * via spa_version(). - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &spa->spa_ubsync.ub_version) != 0) - spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; - } else if ((spa = spa_lookup(name)) == NULL) { - mutex_exit(&spa_namespace_lock); - nvlist_free(config); - cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", - name); - return (EIO); - } else { - VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); - } - spa->spa_is_root = B_TRUE; - spa->spa_import_flags = ZFS_IMPORT_VERBATIM; - - /* - * Build up a vdev tree based on the boot device's label config. - */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, - VDEV_ALLOC_ROOTPOOL); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error) { - mutex_exit(&spa_namespace_lock); - nvlist_free(config); - cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", - pname); - return (error); - } - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - vdev_free(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - mutex_exit(&spa_namespace_lock); - - nvlist_free(config); - return (0); -} - -#endif /* illumos */ -#endif /* _KERNEL */ - -/* - * Import a non-root pool into the system. - */ -int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) -{ - spa_t *spa; - char *altroot = NULL; - spa_load_state_t state = SPA_LOAD_IMPORT; - zpool_load_policy_t policy; - uint64_t mode = spa_mode_global; - uint64_t readonly = B_FALSE; - int error; - nvlist_t *nvroot; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; - - /* - * If a pool with this name exists, return failure. - */ - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EEXIST)); - } - - /* - * Create and initialize the spa structure. 
- */ - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - (void) nvlist_lookup_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); - if (readonly) - mode = FREAD; - spa = spa_add(pool, config, altroot); - spa->spa_import_flags = flags; - - /* - * Verbatim import - Take a pool and insert it into the namespace - * as if it had been loaded at boot. - */ - if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { - if (props != NULL) - spa_configfile_set(spa, props, B_FALSE); - - spa_write_cachefile(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); - zfs_dbgmsg("spa_import: verbatim import of %s", pool); - mutex_exit(&spa_namespace_lock); - return (0); - } - - spa_activate(spa, mode); - - /* - * Don't start async tasks until we know everything is healthy. - */ - spa_async_suspend(spa); - - zpool_get_load_policy(config, &policy); - if (policy.zlp_rewind & ZPOOL_DO_REWIND) - state = SPA_LOAD_RECOVER; - - spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; - - if (state != SPA_LOAD_RECOVER) { - spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; - zfs_dbgmsg("spa_import: importing %s", pool); - } else { - zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " - "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); - } - error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); - - /* - * Propagate anything learned while loading the pool and pass it - * back to caller (i.e. rewind info, missing devices, etc). - */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - /* - * Toss any existing sparelist, as it doesn't have any validity - * anymore, and conflicts with spa_has_spare(). - */ - if (spa->spa_spares.sav_config) { - nvlist_free(spa->spa_spares.sav_config); - spa->spa_spares.sav_config = NULL; - spa_load_spares(spa); - } - if (spa->spa_l2cache.sav_config) { - nvlist_free(spa->spa_l2cache.sav_config); - spa->spa_l2cache.sav_config = NULL; - spa_load_l2cache(spa); - } - - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, - VDEV_ALLOC_SPARE); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, - VDEV_ALLOC_L2CACHE); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (props != NULL) - spa_configfile_set(spa, props, B_FALSE); - - if (error != 0 || (props && spa_writeable(spa) && - (error = spa_prop_set(spa, props)))) { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - return (error); - } - - spa_async_resume(spa); - - /* - * Override any spares and level 2 cache devices as specified by - * the user, as these may have correct device names/devids, etc. 
- */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - if (spa->spa_spares.sav_config) - VERIFY(nvlist_remove(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_spares(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_spares.sav_sync = B_TRUE; - } - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - if (spa->spa_l2cache.sav_config) - VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_l2cache(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_l2cache.sav_sync = B_TRUE; - } - - /* - * Check for any removed devices. - */ - if (spa->spa_autoreplace) { - spa_aux_check_removed(&spa->spa_spares); - spa_aux_check_removed(&spa->spa_l2cache); - } - - if (spa_writeable(spa)) { - /* - * Update the config cache to include the newly-imported pool. - */ - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - } - - /* - * It's possible that the pool was expanded while it was exported. - * We kick off an async task to handle this for us. - */ - spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); - - spa_history_log_version(spa, "import"); - - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); - - mutex_exit(&spa_namespace_lock); - -#ifdef __FreeBSD__ -#ifdef _KERNEL - zvol_create_minors(spa, pool); -#endif -#endif - return (0); -} - -nvlist_t * -spa_tryimport(nvlist_t *tryconfig) -{ - nvlist_t *config = NULL; - char *poolname, *cachefile; - spa_t *spa; - uint64_t state; - int error; - zpool_load_policy_t policy; - - if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) - return (NULL); - - if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) - return (NULL); - - /* - * Create and initialize the spa structure. - */ - mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); - spa_activate(spa, FREAD); - - /* - * Rewind pool if a max txg was provided. - */ - zpool_get_load_policy(spa->spa_config, &policy); - if (policy.zlp_txg != UINT64_MAX) { - spa->spa_load_max_txg = policy.zlp_txg; - spa->spa_extreme_rewind = B_TRUE; - zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", - poolname, (longlong_t)policy.zlp_txg); - } else { - zfs_dbgmsg("spa_tryimport: importing %s", poolname); - } - - if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) - == 0) { - zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); - spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; - } else { - spa->spa_config_source = SPA_CONFIG_SRC_SCAN; - } - - error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); - - /* - * If 'tryconfig' was at least parsable, return the current config. 
- */ - if (spa->spa_root_vdev != NULL) { - config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, - poolname) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, - state) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, - spa->spa_uberblock.ub_timestamp) == 0); - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, - spa->spa_load_info) == 0); - - /* - * If the bootfs property exists on this pool then we - * copy it out so that external consumers can tell which - * pools are bootable. - */ - if ((!error || error == EEXIST) && spa->spa_bootfs) { - char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - /* - * We have to play games with the name since the - * pool was opened as TRYIMPORT_NAME. - */ - if (dsl_dsobj_to_dsname(spa_name(spa), - spa->spa_bootfs, tmpname) == 0) { - char *cp; - char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - cp = strchr(tmpname, '/'); - if (cp == NULL) { - (void) strlcpy(dsname, tmpname, - MAXPATHLEN); - } else { - (void) snprintf(dsname, MAXPATHLEN, - "%s/%s", poolname, ++cp); - } - VERIFY(nvlist_add_string(config, - ZPOOL_CONFIG_BOOTFS, dsname) == 0); - kmem_free(dsname, MAXPATHLEN); - } - kmem_free(tmpname, MAXPATHLEN); - } - - /* - * Add the list of hot spares and level 2 cache devices. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - spa_add_spares(spa, config); - spa_add_l2cache(spa, config); - spa_config_exit(spa, SCL_CONFIG, FTAG); - } - - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - mutex_exit(&spa_namespace_lock); - - return (config); -} - -/* - * Pool export/destroy - * - * The act of destroying or exporting a pool is very simple. We make sure there - * is no more pending I/O and any references to the pool are gone. Then, we - * update the pool state and sync all the labels to disk, removing the - * configuration from the cache afterwards. If the 'hardforce' flag is set, then - * we don't sync the labels or remove the configuration cache. - */ -static int -spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - boolean_t force, boolean_t hardforce) -{ - spa_t *spa; - - if (oldconfig) - *oldconfig = NULL; - - if (!(spa_mode_global & FWRITE)) - return (SET_ERROR(EROFS)); - - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) == NULL) { - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ENOENT)); - } - - /* - * Put a hold on the pool, drop the namespace lock, stop async tasks, - * reacquire the namespace lock, and see if we can export. - */ - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - spa_async_suspend(spa); - if (spa->spa_zvol_taskq) { -#ifdef _KERNEL - zvol_remove_minors(spa, spa_name(spa)); -#endif - taskq_wait(spa->spa_zvol_taskq); - } - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - - /* - * The pool will be in core if it's openable, - * in which case we can modify its state. - */ - if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { - - /* - * Objsets may be open only because they're dirty, so we - * have to force it to sync before checking spa_refcnt. - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - spa_evicting_os_wait(spa); - - /* - * A pool cannot be exported or destroyed if there are active - * references. If we are resetting a pool, allow references by - * fault injection handlers. 
- */ - if (!spa_refcount_zero(spa) || - (spa->spa_inject_ref != 0 && - new_state != POOL_STATE_UNINITIALIZED)) { - spa_async_resume(spa); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EBUSY)); - } - - /* - * A pool cannot be exported if it has an active shared spare. - * This is to prevent other pools stealing the active spare - * from an exported pool. At user's own will, such pool can - * be forcedly exported. - */ - if (!force && new_state == POOL_STATE_EXPORTED && - spa_has_active_shared_spare(spa)) { - spa_async_resume(spa); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EXDEV)); - } - - /* - * We're about to export or destroy this pool. Make sure - * we stop all initializtion activity here before we - * set the spa_final_txg. This will ensure that all - * dirty data resulting from the initialization is - * committed to disk before we unload the pool. - */ - if (spa->spa_root_vdev != NULL) { - vdev_initialize_stop_all(spa->spa_root_vdev, - VDEV_INITIALIZE_ACTIVE); - } - - /* - * We want this to be reflected on every label, - * so mark them all dirty. spa_unload() will do the - * final sync that pushes these changes out. - */ - if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa->spa_state = new_state; - spa->spa_final_txg = spa_last_synced_txg(spa) + - TXG_DEFER_SIZE + 1; - vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa, SCL_ALL, FTAG); - } - } - - spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); - - if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); - spa_deactivate(spa); - } - - if (oldconfig && spa->spa_config) - VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); - - if (new_state != POOL_STATE_UNINITIALIZED) { - if (!hardforce) - spa_write_cachefile(spa, B_TRUE, B_TRUE); - spa_remove(spa); - } - mutex_exit(&spa_namespace_lock); - - return (0); -} - -/* - * Destroy a storage pool. - */ -int -spa_destroy(char *pool) -{ - return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, - B_FALSE, B_FALSE)); -} - -/* - * Export a storage pool. - */ -int -spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, - boolean_t hardforce) -{ - return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, - force, hardforce)); -} - -/* - * Similar to spa_export(), this unloads the spa_t without actually removing it - * from the namespace in any way. - */ -int -spa_reset(char *pool) -{ - return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, - B_FALSE, B_FALSE)); -} - -/* - * ========================================================================== - * Device manipulation - * ========================================================================== - */ - -/* - * Add a device to a storage pool. 
- */ -int -spa_vdev_add(spa_t *spa, nvlist_t *nvroot) -{ - uint64_t txg, id; - int error; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd, *tvd; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; - - ASSERT(spa_writeable(spa)); - - txg = spa_vdev_enter(spa); - - if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0) - return (spa_vdev_exit(spa, NULL, txg, error)); - - spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, - &nspares) != 0) - nspares = 0; - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, - &nl2cache) != 0) - nl2cache = 0; - - if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) - return (spa_vdev_exit(spa, vd, txg, EINVAL)); - - if (vd->vdev_children != 0 && - (error = vdev_create(vd, txg, B_FALSE)) != 0) - return (spa_vdev_exit(spa, vd, txg, error)); - - /* - * We must validate the spares and l2cache devices after checking the - * children. Otherwise, vdev_inuse() will blindly overwrite the spare. - */ - if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) - return (spa_vdev_exit(spa, vd, txg, error)); - - /* - * If we are in the middle of a device removal, we can only add - * devices which match the existing devices in the pool. - * If we are in the middle of a removal, or have some indirect - * vdevs, we can not add raidz toplevels. - */ - if (spa->spa_vdev_removal != NULL || - spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { - for (int c = 0; c < vd->vdev_children; c++) { - tvd = vd->vdev_child[c]; - if (spa->spa_vdev_removal != NULL && - tvd->vdev_ashift != spa->spa_max_ashift) { - return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } - /* Fail if top level vdev is raidz */ - if (tvd->vdev_ops == &vdev_raidz_ops) { - return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } - /* - * Need the top level mirror to be - * a mirror of leaf vdevs only - */ - if (tvd->vdev_ops == &vdev_mirror_ops) { - for (uint64_t cid = 0; - cid < tvd->vdev_children; cid++) { - vdev_t *cvd = tvd->vdev_child[cid]; - if (!cvd->vdev_ops->vdev_op_leaf) { - return (spa_vdev_exit(spa, vd, - txg, EINVAL)); - } - } - } - } - } - - for (int c = 0; c < vd->vdev_children; c++) { - - /* - * Set the vdev id to the first hole, if one exists. - */ - for (id = 0; id < rvd->vdev_children; id++) { - if (rvd->vdev_child[id]->vdev_ishole) { - vdev_free(rvd->vdev_child[id]); - break; - } - } - tvd = vd->vdev_child[c]; - vdev_remove_child(vd, tvd); - tvd->vdev_id = id; - vdev_add_child(rvd, tvd); - vdev_config_dirty(tvd); - } - - if (nspares != 0) { - spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, - ZPOOL_CONFIG_SPARES); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - } - - if (nl2cache != 0) { - spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, - ZPOOL_CONFIG_L2CACHE); - spa_load_l2cache(spa); - spa->spa_l2cache.sav_sync = B_TRUE; - } - - /* - * We have to be careful when adding new vdevs to an existing pool. - * If other threads start allocating from these vdevs before we - * sync the config cache, and we lose power, then upon reboot we may - * fail to open the pool because there are DVAs that the config cache - * can't translate. Therefore, we first add the vdevs without - * initializing metaslabs; sync the config cache (via spa_vdev_exit()); - * and then let spa_config_update() initialize the new metaslabs. 
- * - * spa_load() checks for added-but-not-initialized vdevs, so that - * if we lose power at any point in this sequence, the remaining - * steps will be completed the next time we load the pool. - */ - (void) spa_vdev_exit(spa, vd, txg, 0); - - mutex_enter(&spa_namespace_lock); - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); - mutex_exit(&spa_namespace_lock); - - return (0); -} - -/* - * Attach a device to a mirror. The arguments are the path to any device - * in the mirror, and the nvroot for the new device. If the path specifies - * a device that is not mirrored, we automatically insert the mirror vdev. - * - * If 'replacing' is specified, the new device is intended to replace the - * existing device; in this case the two devices are made into their own - * mirror using the 'replacing' vdev, which is functionally identical to - * the mirror vdev (it actually reuses all the same ops) but has a few - * extra rules: you can't attach to it after it's been created, and upon - * completion of resilvering, the first disk (the one being replaced) - * is automatically detached. - */ -int -spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) -{ - uint64_t txg, dtl_max_txg; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; - vdev_ops_t *pvops; - char *oldvdpath, *newvdpath; - int newvd_isspare; - int error; - - ASSERT(spa_writeable(spa)); - - txg = spa_vdev_enter(spa); - - oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - error = (spa_has_checkpoint(spa)) ? - ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - if (spa->spa_vdev_removal != NULL) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - - if (oldvd == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - - if (!oldvd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - pvd = oldvd->vdev_parent; - - if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ATTACH)) != 0) - return (spa_vdev_exit(spa, NULL, txg, EINVAL)); - - if (newrootvd->vdev_children != 1) - return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); - - newvd = newrootvd->vdev_child[0]; - - if (!newvd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); - - if ((error = vdev_create(newrootvd, txg, replacing)) != 0) - return (spa_vdev_exit(spa, newrootvd, txg, error)); - - /* - * Spares can't replace logs - */ - if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - - if (!replacing) { - /* - * For attach, the only allowable parent is a mirror or the root - * vdev. - */ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - - pvops = &vdev_mirror_ops; - } else { - /* - * Active hot spares can only be replaced by inactive hot - * spares. - */ - if (pvd->vdev_ops == &vdev_spare_ops && - oldvd->vdev_isspare && - !spa_has_spare(spa, newvd->vdev_guid)) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - - /* - * If the source is a hot spare, and the parent isn't already a - * spare, then we want to create a new hot spare. Otherwise, we - * want to create a replacing vdev. 
The user is not allowed to - * attach to a spared vdev child unless the 'isspare' state is - * the same (spare replaces spare, non-spare replaces - * non-spare). - */ - if (pvd->vdev_ops == &vdev_replacing_ops && - spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - } else if (pvd->vdev_ops == &vdev_spare_ops && - newvd->vdev_isspare != oldvd->vdev_isspare) { - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - } - - if (newvd->vdev_isspare) - pvops = &vdev_spare_ops; - else - pvops = &vdev_replacing_ops; - } - - /* - * Make sure the new device is big enough. - */ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) - return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); - - /* - * The new device cannot have a higher alignment requirement - * than the top-level vdev. - */ - if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) - return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); - - /* - * If this is an in-place replacement, update oldvd's path and devid - * to make it distinguishable from newvd, and unopenable from now on. - */ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { - spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, - KM_SLEEP); - (void) sprintf(oldvd->vdev_path, "%s/%s", - newvd->vdev_path, "old"); - if (oldvd->vdev_devid != NULL) { - spa_strfree(oldvd->vdev_devid); - oldvd->vdev_devid = NULL; - } - } - - /* mark the device being resilvered */ - newvd->vdev_resilver_txg = txg; - - /* - * If the parent is not a mirror, or if we're replacing, insert the new - * mirror/replacing/spare vdev above oldvd. - */ - if (pvd->vdev_ops != pvops) - pvd = vdev_add_parent(oldvd, pvops); - - ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); - - /* - * Extract the new device from its root and add it to pvd. - */ - vdev_remove_child(newrootvd, newvd); - newvd->vdev_id = pvd->vdev_children; - newvd->vdev_crtxg = oldvd->vdev_crtxg; - vdev_add_child(pvd, newvd); - - tvd = newvd->vdev_top; - ASSERT(pvd->vdev_top == tvd); - ASSERT(tvd->vdev_parent == rvd); - - vdev_config_dirty(tvd); - - /* - * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account - * for any dmu_sync-ed blocks. It will propagate upward when - * spa_vdev_exit() calls vdev_dtl_reassess(). - */ - dtl_max_txg = txg + TXG_CONCURRENT_STATES; - - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); - - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } - - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; - - /* - * Mark newvd's DTL dirty in this txg. - */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); - - /* - * Schedule the resilver to restart in the future. We do this to - * ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. - */ - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); - - if (spa->spa_bootfs) - spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); - - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); - - /* - * Commit the config - */ - (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); - - spa_history_log_internal(spa, "vdev attach", NULL, - "%s vdev=%s %s vdev=%s", - replacing && newvd_isspare ? "spare in" : - replacing ? "replace" : "attach", newvdpath, - replacing ? 
"for" : "to", oldvdpath); - - spa_strfree(oldvdpath); - spa_strfree(newvdpath); - - return (0); -} - -/* - * Detach a device from a mirror or replacing vdev. - * - * If 'replace_done' is specified, only detach if the parent - * is a replacing vdev. - */ -int -spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) -{ - uint64_t txg; - int error; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd, *pvd, *cvd, *tvd; - boolean_t unspare = B_FALSE; - uint64_t unspare_guid = 0; - char *vdpath; - - ASSERT(spa_writeable(spa)); - - txg = spa_vdev_enter(spa); - - vd = spa_lookup_by_guid(spa, guid, B_FALSE); - - /* - * Besides being called directly from the userland through the - * ioctl interface, spa_vdev_detach() can be potentially called - * at the end of spa_vdev_resilver_done(). - * - * In the regular case, when we have a checkpoint this shouldn't - * happen as we never empty the DTLs of a vdev during the scrub - * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() - * should never get here when we have a checkpoint. - * - * That said, even in a case when we checkpoint the pool exactly - * as spa_vdev_resilver_done() calls this function everything - * should be fine as the resilver will return right away. - */ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - error = (spa_has_checkpoint(spa)) ? - ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - if (vd == NULL) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - pvd = vd->vdev_parent; - - /* - * If the parent/child relationship is not as expected, don't do it. - * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing - * vdev that's replacing B with C. The user's intent in replacing - * is to go from M(A,B) to M(A,C). If the user decides to cancel - * the replace by detaching C, the expected behavior is to end up - * M(A,B). But suppose that right after deciding to detach C, - * the replacement of B completes. We would have M(A,C), and then - * ask to detach C, which would leave us with just A -- not what - * the user wanted. To prevent this, we make sure that the - * parent/child relationship hasn't changed -- in this example, - * that C's parent is still the replacing vdev R. - */ - if (pvd->vdev_guid != pguid && pguid != 0) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - - /* - * Only 'replacing' or 'spare' vdevs can be replaced. - */ - if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && - pvd->vdev_ops != &vdev_spare_ops) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - ASSERT(pvd->vdev_ops != &vdev_spare_ops || - spa_version(spa) >= SPA_VERSION_SPARES); - - /* - * Only mirror, replacing, and spare vdevs support detach. - */ - if (pvd->vdev_ops != &vdev_replacing_ops && - pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_spare_ops) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - - /* - * If this device has the only valid copy of some data, - * we cannot safely detach it. - */ - if (vdev_dtl_required(vd)) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - - ASSERT(pvd->vdev_children >= 2); - - /* - * If we are detaching the second disk from a replacing vdev, then - * check to see if we changed the original vdev's path to have "/old" - * at the end in spa_vdev_attach(). If so, undo that change now. 
- */ - if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && - vd->vdev_path != NULL) { - size_t len = strlen(vd->vdev_path); - - for (int c = 0; c < pvd->vdev_children; c++) { - cvd = pvd->vdev_child[c]; - - if (cvd == vd || cvd->vdev_path == NULL) - continue; - - if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && - strcmp(cvd->vdev_path + len, "/old") == 0) { - spa_strfree(cvd->vdev_path); - cvd->vdev_path = spa_strdup(vd->vdev_path); - break; - } - } - } - - /* - * If we are detaching the original disk from a spare, then it implies - * that the spare should become a real disk, and be removed from the - * active spare list for the pool. - */ - if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && - pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) - unspare = B_TRUE; - - /* - * Erase the disk labels so the disk can be used for other things. - * This must be done after all other error cases are handled, - * but before we disembowel vd (so we can still do I/O to it). - * But if we can't do it, don't treat the error as fatal -- - * it may be that the unwritability of the disk is the reason - * it's being detached! - */ - error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - - /* - * Remove vd from its parent and compact the parent's children. - */ - vdev_remove_child(pvd, vd); - vdev_compact_children(pvd); - - /* - * Remember one of the remaining children so we can get tvd below. - */ - cvd = pvd->vdev_child[pvd->vdev_children - 1]; - - /* - * If we need to remove the remaining child from the list of hot spares, - * do it now, marking the vdev as no longer a spare in the process. - * We must do this before vdev_remove_parent(), because that can - * change the GUID if it creates a new toplevel GUID. For a similar - * reason, we must remove the spare now, in the same txg as the detach; - * otherwise someone could attach a new sibling, change the GUID, and - * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. - */ - if (unspare) { - ASSERT(cvd->vdev_isspare); - spa_spare_remove(cvd); - unspare_guid = cvd->vdev_guid; - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); - cvd->vdev_unspare = B_TRUE; - } - - /* - * If the parent mirror/replacing vdev only has one child, - * the parent is no longer needed. Remove it from the tree. - */ - if (pvd->vdev_children == 1) { - if (pvd->vdev_ops == &vdev_spare_ops) - cvd->vdev_unspare = B_FALSE; - vdev_remove_parent(cvd); - } - - - /* - * We don't set tvd until now because the parent we just removed - * may have been the previous top-level vdev. - */ - tvd = cvd->vdev_top; - ASSERT(tvd->vdev_parent == rvd); - - /* - * Reevaluate the parent vdev state. - */ - vdev_propagate_state(cvd); - - /* - * If the 'autoexpand' property is set on the pool then automatically - * try to expand the size of the pool. For example if the device we - * just detached was smaller than the others, it may be possible to - * add metaslabs (i.e. grow the pool). We need to reopen the vdev - * first so that we can obtain the updated sizes of the leaf vdevs. - */ - if (spa->spa_autoexpand) { - vdev_reopen(tvd); - vdev_expand(tvd, txg); - } - - vdev_config_dirty(tvd); - - /* - * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that - * vd->vdev_detached is set and free vd's DTL object in syncing context. - * But first make sure we're not on any *other* txg's DTL list, to - * prevent vd from being accessed after it's freed. 
- */ - vdpath = spa_strdup(vd->vdev_path); - for (int t = 0; t < TXG_SIZE; t++) - (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); - vd->vdev_detached = B_TRUE; - vdev_dirty(tvd, VDD_DTL, vd, txg); - - spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); - - /* hang on to the spa before we release the lock */ - spa_open_ref(spa, FTAG); - - error = spa_vdev_exit(spa, vd, txg, 0); - - spa_history_log_internal(spa, "detach", NULL, - "vdev=%s", vdpath); - spa_strfree(vdpath); - - /* - * If this was the removal of the original device in a hot spare vdev, - * then we want to go through and remove the device from the hot spare - * list of every other pool. - */ - if (unspare) { - spa_t *altspa = NULL; - - mutex_enter(&spa_namespace_lock); - while ((altspa = spa_next(altspa)) != NULL) { - if (altspa->spa_state != POOL_STATE_ACTIVE || - altspa == spa) - continue; - - spa_open_ref(altspa, FTAG); - mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); - mutex_enter(&spa_namespace_lock); - spa_close(altspa, FTAG); - } - mutex_exit(&spa_namespace_lock); - - /* search the rest of the vdevs for spares to remove */ - spa_vdev_resilver_done(spa); - } - - /* all done with the spa; OK to release */ - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - mutex_exit(&spa_namespace_lock); - - return (error); -} - -int -spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type) -{ - /* - * We hold the namespace lock through the whole function - * to prevent any changes to the pool while we're starting or - * stopping initialization. The config and state locks are held so that - * we can properly assess the vdev state before we commit to - * the initializing operation. - */ - mutex_enter(&spa_namespace_lock); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); - - /* Look up vdev and ensure it's a leaf. */ - vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_detached) { - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ENODEV)); - } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EINVAL)); - } else if (!vdev_writeable(vd)) { - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EROFS)); - } - mutex_enter(&vd->vdev_initialize_lock); - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - - /* - * When we activate an initialize action we check to see - * if the vdev_initialize_thread is NULL. We do this instead - * of using the vdev_initialize_state since there might be - * a previous initialization process which has completed but - * the thread is not exited. 
- */ - if (cmd_type == POOL_INITIALIZE_DO && - (vd->vdev_initialize_thread != NULL || - vd->vdev_top->vdev_removing)) { - mutex_exit(&vd->vdev_initialize_lock); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EBUSY)); - } else if (cmd_type == POOL_INITIALIZE_CANCEL && - (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && - vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { - mutex_exit(&vd->vdev_initialize_lock); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ESRCH)); - } else if (cmd_type == POOL_INITIALIZE_SUSPEND && - vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { - mutex_exit(&vd->vdev_initialize_lock); - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(ESRCH)); - } - - switch (cmd_type) { - case POOL_INITIALIZE_DO: - vdev_initialize(vd); - break; - case POOL_INITIALIZE_CANCEL: - vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); - break; - case POOL_INITIALIZE_SUSPEND: - vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); - break; - default: - panic("invalid cmd_type %llu", (unsigned long long)cmd_type); - } - mutex_exit(&vd->vdev_initialize_lock); - - /* Sync out the initializing state */ - txg_wait_synced(spa->spa_dsl_pool, 0); - mutex_exit(&spa_namespace_lock); - - return (0); -} - - -/* - * Split a set of devices from their mirrors, and create a new pool from them. - */ -int -spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - nvlist_t *props, boolean_t exp) -{ - int error = 0; - uint64_t txg, *glist; - spa_t *newspa; - uint_t c, children, lastlog; - nvlist_t **child, *nvl, *tmp; - dmu_tx_t *tx; - char *altroot = NULL; - vdev_t *rvd, **vml = NULL; /* vdev modify list */ - boolean_t activate_slog; - - ASSERT(spa_writeable(spa)); - - txg = spa_vdev_enter(spa); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - error = (spa_has_checkpoint(spa)) ? - ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - /* clear the log and flush everything up to now */ - activate_slog = spa_passivate_log(spa); - (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - error = spa_reset_logs(spa); - txg = spa_vdev_config_enter(spa); - - if (activate_slog) - spa_activate_log(spa); - - if (error != 0) - return (spa_vdev_exit(spa, NULL, txg, error)); - - /* check new spa name before going any further */ - if (spa_lookup(newname) != NULL) - return (spa_vdev_exit(spa, NULL, txg, EEXIST)); - - /* - * scan through all the children to ensure they're all mirrors - */ - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || - nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, - &children) != 0) - return (spa_vdev_exit(spa, NULL, txg, EINVAL)); - - /* first, check to ensure we've got the right child count */ - rvd = spa->spa_root_vdev; - lastlog = 0; - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - - /* don't count the holes & logs as children */ - if (vd->vdev_islog || !vdev_is_concrete(vd)) { - if (lastlog == 0) - lastlog = c; - continue; - } - - lastlog = 0; - } - if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) - return (spa_vdev_exit(spa, NULL, txg, EINVAL)); - - /* next, ensure no spare or cache devices are part of the split */ - if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || - nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) - return (spa_vdev_exit(spa, NULL, txg, EINVAL)); - - vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); - glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); - - /* then, loop over each vdev and validate it */ - for (c = 0; c < children; c++) { - uint64_t is_hole = 0; - - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, - &is_hole); - - if (is_hole != 0) { - if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || - spa->spa_root_vdev->vdev_child[c]->vdev_islog) { - continue; - } else { - error = SET_ERROR(EINVAL); - break; - } - } - - /* which disk is going to be split? */ - if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, - &glist[c]) != 0) { - error = SET_ERROR(EINVAL); - break; - } - - /* look it up in the spa */ - vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); - if (vml[c] == NULL) { - error = SET_ERROR(ENODEV); - break; - } - - /* make sure there's nothing stopping the split */ - if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || - vml[c]->vdev_islog || - !vdev_is_concrete(vml[c]) || - vml[c]->vdev_isspare || - vml[c]->vdev_isl2cache || - !vdev_writeable(vml[c]) || - vml[c]->vdev_children != 0 || - vml[c]->vdev_state != VDEV_STATE_HEALTHY || - c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { - error = SET_ERROR(EINVAL); - break; - } - - if (vdev_dtl_required(vml[c])) { - error = SET_ERROR(EBUSY); - break; - } - - /* we need certain info from the top level */ - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, - vml[c]->vdev_top->vdev_ms_array) == 0); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, - vml[c]->vdev_top->vdev_ms_shift) == 0); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, - vml[c]->vdev_top->vdev_asize) == 0); - VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, - vml[c]->vdev_top->vdev_ashift) == 0); - - /* transfer per-vdev ZAPs */ - ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); - VERIFY0(nvlist_add_uint64(child[c], - ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); - - ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); - VERIFY0(nvlist_add_uint64(child[c], - ZPOOL_CONFIG_VDEV_TOP_ZAP, - vml[c]->vdev_parent->vdev_top_zap)); - } - - if (error != 0) { - kmem_free(vml, children * sizeof (vdev_t *)); - kmem_free(glist, children * sizeof (uint64_t)); - return (spa_vdev_exit(spa, NULL, txg, error)); - } - - /* stop writers from using the disks */ - for (c = 0; c < children; c++) { - if (vml[c] != NULL) - vml[c]->vdev_offline = B_TRUE; - } - vdev_reopen(spa->spa_root_vdev); - - /* - * Temporarily record the splitting vdevs in the spa config. This - * will disappear once the config is regenerated. - */ - VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, - glist, children) == 0); - kmem_free(glist, children * sizeof (uint64_t)); - - mutex_enter(&spa->spa_props_lock); - VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, - nvl) == 0); - mutex_exit(&spa->spa_props_lock); - spa->spa_config_splitting = nvl; - vdev_config_dirty(spa->spa_root_vdev); - - /* configure and create the new pool */ - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, - exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, - spa->spa_config_txg) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, - spa_generate_guid(NULL)) == 0); - VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - - /* add the new pool to the namespace */ - newspa = spa_add(newname, config, altroot); - newspa->spa_avz_action = AVZ_ACTION_REBUILD; - newspa->spa_config_txg = spa->spa_config_txg; - spa_set_log_state(newspa, SPA_LOG_CLEAR); - - /* release the spa config lock, retaining the namespace lock */ - spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - - if (zio_injection_enabled) - zio_handle_panic_injection(spa, FTAG, 1); - - spa_activate(newspa, spa_mode_global); - spa_async_suspend(newspa); - - for (c = 0; c < children; c++) { - if (vml[c] != NULL) { - /* - * Temporarily stop the initializing activity. We set - * the state to ACTIVE so that we know to resume - * the initializing once the split has completed. - */ - mutex_enter(&vml[c]->vdev_initialize_lock); - vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); - mutex_exit(&vml[c]->vdev_initialize_lock); - } - } - -#ifndef illumos - /* mark that we are creating new spa by splitting */ - newspa->spa_splitting_newspa = B_TRUE; -#endif - newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; - - /* create the new pool from the disks of the original pool */ - error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); -#ifndef illumos - newspa->spa_splitting_newspa = B_FALSE; -#endif - if (error) - goto out; - - /* if that worked, generate a real config for the new pool */ - if (newspa->spa_root_vdev != NULL) { - VERIFY(nvlist_alloc(&newspa->spa_config_splitting, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, - ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); - spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, - B_TRUE)); - } - - /* set the props */ - if (props != NULL) { - spa_configfile_set(newspa, props, B_FALSE); - error = spa_prop_set(newspa, props); - if (error) - goto out; - } - - /* flush everything */ - txg = spa_vdev_config_enter(newspa); - vdev_config_dirty(newspa->spa_root_vdev); - (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); - - if (zio_injection_enabled) - zio_handle_panic_injection(spa, FTAG, 2); - - spa_async_resume(newspa); - - /* finally, update the original pool's config */ - txg = spa_vdev_config_enter(spa); - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) - dmu_tx_abort(tx); - for (c = 0; c < children; c++) { - if (vml[c] != NULL) { - vdev_split(vml[c]); - if (error == 0) - spa_history_log_internal(spa, "detach", tx, - "vdev=%s", vml[c]->vdev_path); - - vdev_free(vml[c]); - } - } - spa->spa_avz_action = AVZ_ACTION_REBUILD; - vdev_config_dirty(spa->spa_root_vdev); - spa->spa_config_splitting = NULL; - nvlist_free(nvl); - if (error == 0) - dmu_tx_commit(tx); - (void) spa_vdev_exit(spa, NULL, txg, 0); - - if (zio_injection_enabled) - zio_handle_panic_injection(spa, FTAG, 3); - - /* split is complete; log a history record */ - spa_history_log_internal(newspa, "split", NULL, - "from pool %s", spa_name(spa)); - - kmem_free(vml, children * sizeof (vdev_t *)); - - /* if we're not going to mount the filesystems in userland, export */ - if 
(exp) - error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, - B_FALSE, B_FALSE); - - return (error); - -out: - spa_unload(newspa); - spa_deactivate(newspa); - spa_remove(newspa); - - txg = spa_vdev_config_enter(spa); - - /* re-online all offlined disks */ - for (c = 0; c < children; c++) { - if (vml[c] != NULL) - vml[c]->vdev_offline = B_FALSE; - } - - /* restart initializing disks as necessary */ - spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); - - vdev_reopen(spa->spa_root_vdev); - - nvlist_free(spa->spa_config_splitting); - spa->spa_config_splitting = NULL; - (void) spa_vdev_exit(spa, NULL, txg, error); - - kmem_free(vml, children * sizeof (vdev_t *)); - return (error); -} - -/* - * Find any device that's done replacing, or a vdev marked 'unspare' that's - * currently spared, so we can detach it. - */ -static vdev_t * -spa_vdev_resilver_done_hunt(vdev_t *vd) -{ - vdev_t *newvd, *oldvd; - - for (int c = 0; c < vd->vdev_children; c++) { - oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); - if (oldvd != NULL) - return (oldvd); - } - - /* - * Check for a completed replacement. We always consider the first - * vdev in the list to be the oldest vdev, and the last one to be - * the newest (see spa_vdev_attach() for how that works). In - * the case where the newest vdev is faulted, we will not automatically - * remove it after a resilver completes. This is OK as it will require - * user intervention to determine which disk the admin wishes to keep. - */ - if (vd->vdev_ops == &vdev_replacing_ops) { - ASSERT(vd->vdev_children > 1); - - newvd = vd->vdev_child[vd->vdev_children - 1]; - oldvd = vd->vdev_child[0]; - - if (vdev_dtl_empty(newvd, DTL_MISSING) && - vdev_dtl_empty(newvd, DTL_OUTAGE) && - !vdev_dtl_required(oldvd)) - return (oldvd); - } - - /* - * Check for a completed resilver with the 'unspare' flag set. - * Also potentially update faulted state. - */ - if (vd->vdev_ops == &vdev_spare_ops) { - vdev_t *first = vd->vdev_child[0]; - vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; - - if (last->vdev_unspare) { - oldvd = first; - newvd = last; - } else if (first->vdev_unspare) { - oldvd = last; - newvd = first; - } else { - oldvd = NULL; - } - - if (oldvd != NULL && - vdev_dtl_empty(newvd, DTL_MISSING) && - vdev_dtl_empty(newvd, DTL_OUTAGE) && - !vdev_dtl_required(oldvd)) - return (oldvd); - - vdev_propagate_state(vd); - - /* - * If there are more than two spares attached to a disk, - * and those spares are not required, then we want to - * attempt to free them up now so that they can be used - * by other pools. Once we're back down to a single - * disk+spare, we stop removing them. - */ - if (vd->vdev_children > 2) { - newvd = vd->vdev_child[1]; - - if (newvd->vdev_isspare && last->vdev_isspare && - vdev_dtl_empty(last, DTL_MISSING) && - vdev_dtl_empty(last, DTL_OUTAGE) && - !vdev_dtl_required(newvd)) - return (newvd); - } - } - - return (NULL); -} - -static void -spa_vdev_resilver_done(spa_t *spa) -{ - vdev_t *vd, *pvd, *ppvd; - uint64_t guid, sguid, pguid, ppguid; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { - pvd = vd->vdev_parent; - ppvd = pvd->vdev_parent; - guid = vd->vdev_guid; - pguid = pvd->vdev_guid; - ppguid = ppvd->vdev_guid; - sguid = 0; - /* - * If we have just finished replacing a hot spared device, then - * we need to detach the parent's first child (the original hot - * spare) as well. 
- */ - if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && - ppvd->vdev_children == 2) { - ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - sguid = ppvd->vdev_child[1]->vdev_guid; - } - ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); - - spa_config_exit(spa, SCL_ALL, FTAG); - if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) - return; - if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) - return; - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - } - - spa_config_exit(spa, SCL_ALL, FTAG); -} - -/* - * Update the stored path or FRU for this vdev. - */ -int -spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, - boolean_t ispath) -{ - vdev_t *vd; - boolean_t sync = B_FALSE; - - ASSERT(spa_writeable(spa)); - - spa_vdev_state_enter(spa, SCL_ALL); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENOENT)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - if (ispath) { - if (strcmp(value, vd->vdev_path) != 0) { - spa_strfree(vd->vdev_path); - vd->vdev_path = spa_strdup(value); - sync = B_TRUE; - } - } else { - if (vd->vdev_fru == NULL) { - vd->vdev_fru = spa_strdup(value); - sync = B_TRUE; - } else if (strcmp(value, vd->vdev_fru) != 0) { - spa_strfree(vd->vdev_fru); - vd->vdev_fru = spa_strdup(value); - sync = B_TRUE; - } - } - - return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); -} - -int -spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) -{ - return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); -} - -int -spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) -{ - return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); -} - -/* - * ========================================================================== - * SPA Scanning - * ========================================================================== - */ -int -spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) -{ - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - - if (dsl_scan_resilvering(spa->spa_dsl_pool)) - return (SET_ERROR(EBUSY)); - - return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); -} - -int -spa_scan_stop(spa_t *spa) -{ - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - if (dsl_scan_resilvering(spa->spa_dsl_pool)) - return (SET_ERROR(EBUSY)); - return (dsl_scan_cancel(spa->spa_dsl_pool)); -} - -int -spa_scan(spa_t *spa, pool_scan_func_t func) -{ - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - - if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) - return (SET_ERROR(ENOTSUP)); - - /* - * If a resilver was requested, but there is no DTL on a - * writeable leaf device, we have nothing to do. 
- */ - if (func == POOL_SCAN_RESILVER && - !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { - spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); - return (0); - } - - return (dsl_scan(spa->spa_dsl_pool, func)); -} - -/* - * ========================================================================== - * SPA async task processing - * ========================================================================== - */ - -static void -spa_async_remove(spa_t *spa, vdev_t *vd) -{ - if (vd->vdev_remove_wanted) { - vd->vdev_remove_wanted = B_FALSE; - vd->vdev_delayed_close = B_FALSE; - vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); - - /* - * We want to clear the stats, but we don't want to do a full - * vdev_clear() as that will cause us to throw away - * degraded/faulted state as well as attempt to reopen the - * device, all of which is a waste. - */ - vd->vdev_stat.vs_read_errors = 0; - vd->vdev_stat.vs_write_errors = 0; - vd->vdev_stat.vs_checksum_errors = 0; - - vdev_state_dirty(vd->vdev_top); - /* Tell userspace that the vdev is gone. */ - zfs_post_remove(spa, vd); - } - - for (int c = 0; c < vd->vdev_children; c++) - spa_async_remove(spa, vd->vdev_child[c]); -} - -static void -spa_async_probe(spa_t *spa, vdev_t *vd) -{ - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ - } - - for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); -} - -static void -spa_async_autoexpand(spa_t *spa, vdev_t *vd) -{ - sysevent_id_t eid; - nvlist_t *attr; - char *physpath; - - if (!spa->spa_autoexpand) - return; - - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - spa_async_autoexpand(spa, cvd); - } - - if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) - return; - - physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); - - VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); - - (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, - ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); - - nvlist_free(attr); - kmem_free(physpath, MAXPATHLEN); -} - -static void -spa_async_thread(void *arg) -{ - spa_t *spa = (spa_t *)arg; - int tasks; - - ASSERT(spa->spa_sync_on); - - mutex_enter(&spa->spa_async_lock); - tasks = spa->spa_async_tasks; - spa->spa_async_tasks &= SPA_ASYNC_REMOVE; - mutex_exit(&spa->spa_async_lock); - - /* - * See if the config needs to be updated. - */ - if (tasks & SPA_ASYNC_CONFIG_UPDATE) { - uint64_t old_space, new_space; - - mutex_enter(&spa_namespace_lock); - old_space = metaslab_class_get_space(spa_normal_class(spa)); - old_space += metaslab_class_get_space(spa_special_class(spa)); - old_space += metaslab_class_get_space(spa_dedup_class(spa)); - - spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - - new_space = metaslab_class_get_space(spa_normal_class(spa)); - new_space += metaslab_class_get_space(spa_special_class(spa)); - new_space += metaslab_class_get_space(spa_dedup_class(spa)); - mutex_exit(&spa_namespace_lock); - - /* - * If the pool grew as a result of the config update, - * then log an internal history event. 
- */ - if (new_space != old_space) { - spa_history_log_internal(spa, "vdev online", NULL, - "pool '%s' size: %llu(+%llu)", - spa_name(spa), new_space, new_space - old_space); - } - } - - if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - spa_async_autoexpand(spa, spa->spa_root_vdev); - spa_config_exit(spa, SCL_CONFIG, FTAG); - } - - /* - * See if any devices need to be probed. - */ - if (tasks & SPA_ASYNC_PROBE) { - spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); - (void) spa_vdev_state_exit(spa, NULL, 0); - } - - /* - * If any devices are done replacing, detach them. - */ - if (tasks & SPA_ASYNC_RESILVER_DONE) - spa_vdev_resilver_done(spa); - - /* - * Kick off a resilver. - */ - if (tasks & SPA_ASYNC_RESILVER) - dsl_resilver_restart(spa->spa_dsl_pool, 0); - - if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { - mutex_enter(&spa_namespace_lock); - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vdev_initialize_restart(spa->spa_root_vdev); - spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_exit(&spa_namespace_lock); - } - - /* - * Let the world know that we're done. - */ - mutex_enter(&spa->spa_async_lock); - spa->spa_async_thread = NULL; - cv_broadcast(&spa->spa_async_cv); - mutex_exit(&spa->spa_async_lock); - thread_exit(); -} - -static void -spa_async_thread_vd(void *arg) -{ - spa_t *spa = arg; - int tasks; - - mutex_enter(&spa->spa_async_lock); - tasks = spa->spa_async_tasks; -retry: - spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; - mutex_exit(&spa->spa_async_lock); - - /* - * See if any devices need to be marked REMOVED. - */ - if (tasks & SPA_ASYNC_REMOVE) { - spa_vdev_state_enter(spa, SCL_NONE); - spa_async_remove(spa, spa->spa_root_vdev); - for (int i = 0; i < spa->spa_l2cache.sav_count; i++) - spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); - for (int i = 0; i < spa->spa_spares.sav_count; i++) - spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); - (void) spa_vdev_state_exit(spa, NULL, 0); - } - - /* - * Let the world know that we're done. 
- */ - mutex_enter(&spa->spa_async_lock); - tasks = spa->spa_async_tasks; - if ((tasks & SPA_ASYNC_REMOVE) != 0) - goto retry; - spa->spa_async_thread_vd = NULL; - cv_broadcast(&spa->spa_async_cv); - mutex_exit(&spa->spa_async_lock); - thread_exit(); -} - -void -spa_async_suspend(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - spa->spa_async_suspended++; - while (spa->spa_async_thread != NULL || - spa->spa_async_thread_vd != NULL) - cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); - mutex_exit(&spa->spa_async_lock); - - spa_vdev_remove_suspend(spa); - - zthr_t *condense_thread = spa->spa_condense_zthr; - if (condense_thread != NULL) - zthr_cancel(condense_thread); - - zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; - if (discard_thread != NULL) - zthr_cancel(discard_thread); -} - -void -spa_async_resume(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - ASSERT(spa->spa_async_suspended != 0); - spa->spa_async_suspended--; - mutex_exit(&spa->spa_async_lock); - spa_restart_removal(spa); - - zthr_t *condense_thread = spa->spa_condense_zthr; - if (condense_thread != NULL) - zthr_resume(condense_thread); - - zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; - if (discard_thread != NULL) - zthr_resume(discard_thread); -} - -static boolean_t -spa_async_tasks_pending(spa_t *spa) -{ - uint_t non_config_tasks; - uint_t config_task; - boolean_t config_task_suspended; - - non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | - SPA_ASYNC_REMOVE); - config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; - if (spa->spa_ccw_fail_time == 0) { - config_task_suspended = B_FALSE; - } else { - config_task_suspended = - (gethrtime() - spa->spa_ccw_fail_time) < - (zfs_ccw_retry_interval * NANOSEC); - } - - return (non_config_tasks || (config_task && !config_task_suspended)); -} - -static void -spa_async_dispatch(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - if (spa_async_tasks_pending(spa) && - !spa->spa_async_suspended && - spa->spa_async_thread == NULL && - rootdir != NULL) - spa->spa_async_thread = thread_create(NULL, 0, - spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); - mutex_exit(&spa->spa_async_lock); -} - -static void -spa_async_dispatch_vd(spa_t *spa) -{ - mutex_enter(&spa->spa_async_lock); - if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && - !spa->spa_async_suspended && - spa->spa_async_thread_vd == NULL && - rootdir != NULL) - spa->spa_async_thread_vd = thread_create(NULL, 0, - spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); - mutex_exit(&spa->spa_async_lock); -} - -void -spa_async_request(spa_t *spa, int task) -{ - zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); - mutex_enter(&spa->spa_async_lock); - spa->spa_async_tasks |= task; - mutex_exit(&spa->spa_async_lock); - spa_async_dispatch_vd(spa); -} - -/* - * ========================================================================== - * SPA syncing routines - * ========================================================================== - */ - -static int -bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - bpobj_t *bpo = arg; - bpobj_enqueue(bpo, bp, tx); - return (0); -} - -static int -spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - zio_t *zio = arg; - - zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, - BP_GET_PSIZE(bp), zio->io_flags)); - return (0); -} - -/* - * Note: this simple function is not inlined to make it easier to dtrace the - * amount of time spent syncing frees. 
- */ -static void -spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) -{ - zio_t *zio = zio_root(spa, NULL, NULL, 0); - bplist_iterate(bpl, spa_free_sync_cb, zio, tx); - VERIFY(zio_wait(zio) == 0); -} - -/* - * Note: this simple function is not inlined to make it easier to dtrace the - * amount of time spent syncing deferred frees. - */ -static void -spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) -{ - zio_t *zio = zio_root(spa, NULL, NULL, 0); - VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, - spa_free_sync_cb, zio, tx), ==, 0); - VERIFY0(zio_wait(zio)); -} - - -static void -spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) -{ - char *packed = NULL; - size_t bufsize; - size_t nvsize = 0; - dmu_buf_t *db; - - VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); - - /* - * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration - * information. This avoids the dmu_buf_will_dirty() path and - * saves us a pre-read to get data we don't actually care about. - */ - bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); - packed = kmem_alloc(bufsize, KM_SLEEP); - - VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, - KM_SLEEP) == 0); - bzero(packed + nvsize, bufsize - nvsize); - - dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); - - kmem_free(packed, bufsize); - - VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = nvsize; - dmu_buf_rele(db, FTAG); -} - -static void -spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, - const char *config, const char *entry) -{ - nvlist_t *nvroot; - nvlist_t **list; - int i; - - if (!sav->sav_sync) - return; - - /* - * Update the MOS nvlist describing the list of available devices. - * spa_validate_aux() will have already made sure this nvlist is - * valid and the vdevs are labeled appropriately. - */ - if (sav->sav_object == 0) { - sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, - sizeof (uint64_t), tx); - VERIFY(zap_update(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, - &sav->sav_object, tx) == 0); - } - - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (sav->sav_count == 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); - } else { - list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); - for (i = 0; i < sav->sav_count; i++) - list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], - B_FALSE, VDEV_CONFIG_L2CACHE); - VERIFY(nvlist_add_nvlist_array(nvroot, config, list, - sav->sav_count) == 0); - for (i = 0; i < sav->sav_count; i++) - nvlist_free(list[i]); - kmem_free(list, sav->sav_count * sizeof (void *)); - } - - spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); - nvlist_free(nvroot); - - sav->sav_sync = B_FALSE; -} - -/* - * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. - * The all-vdev ZAP must be empty. 
- */ -static void -spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - if (vd->vdev_top_zap != 0) { - VERIFY0(zap_add_int(spa->spa_meta_objset, avz, - vd->vdev_top_zap, tx)); - } - if (vd->vdev_leaf_zap != 0) { - VERIFY0(zap_add_int(spa->spa_meta_objset, avz, - vd->vdev_leaf_zap, tx)); - } - for (uint64_t i = 0; i < vd->vdev_children; i++) { - spa_avz_build(vd->vdev_child[i], avz, tx); - } -} - -static void -spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) -{ - nvlist_t *config; - - /* - * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, - * its config may not be dirty but we still need to build per-vdev ZAPs. - * Similarly, if the pool is being assembled (e.g. after a split), we - * need to rebuild the AVZ although the config may not be dirty. - */ - if (list_is_empty(&spa->spa_config_dirty_list) && - spa->spa_avz_action == AVZ_ACTION_NONE) - return; - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || - spa->spa_avz_action == AVZ_ACTION_INITIALIZE || - spa->spa_all_vdev_zaps != 0); - - if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { - /* Make and build the new AVZ */ - uint64_t new_avz = zap_create(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); - spa_avz_build(spa->spa_root_vdev, new_avz, tx); - - /* Diff old AVZ with new one */ - zap_cursor_t zc; - zap_attribute_t za; - - for (zap_cursor_init(&zc, spa->spa_meta_objset, - spa->spa_all_vdev_zaps); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t vdzap = za.za_first_integer; - if (zap_lookup_int(spa->spa_meta_objset, new_avz, - vdzap) == ENOENT) { - /* - * ZAP is listed in old AVZ but not in new one; - * destroy it - */ - VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, - tx)); - } - } - - zap_cursor_fini(&zc); - - /* Destroy the old AVZ */ - VERIFY0(zap_destroy(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, tx)); - - /* Replace the old AVZ in the dir obj with the new one */ - VERIFY0(zap_update(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, - sizeof (new_avz), 1, &new_avz, tx)); - - spa->spa_all_vdev_zaps = new_avz; - } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { - zap_cursor_t zc; - zap_attribute_t za; - - /* Walk through the AVZ and destroy all listed ZAPs */ - for (zap_cursor_init(&zc, spa->spa_meta_objset, - spa->spa_all_vdev_zaps); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - uint64_t zap = za.za_first_integer; - VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); - } - - zap_cursor_fini(&zc); - - /* Destroy and unlink the AVZ itself */ - VERIFY0(zap_destroy(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, tx)); - VERIFY0(zap_remove(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); - spa->spa_all_vdev_zaps = 0; - } - - if (spa->spa_all_vdev_zaps == 0) { - spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_VDEV_ZAP_MAP, tx); - } - spa->spa_avz_action = AVZ_ACTION_NONE; - - /* Create ZAPs for vdevs that don't have them. */ - vdev_construct_zaps(spa->spa_root_vdev, tx); - - config = spa_config_generate(spa, spa->spa_root_vdev, - dmu_tx_get_txg(tx), B_FALSE); - - /* - * If we're upgrading the spa version then make sure that - * the config object gets updated with the correct version. 
- */ - if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) - fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, - spa->spa_uberblock.ub_version); - - spa_config_exit(spa, SCL_STATE, FTAG); - - nvlist_free(spa->spa_config_syncing); - spa->spa_config_syncing = config; - - spa_sync_nvlist(spa, spa->spa_config_object, config, tx); -} - -static void -spa_sync_version(void *arg, dmu_tx_t *tx) -{ - uint64_t *versionp = arg; - uint64_t version = *versionp; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - /* - * Setting the version is special cased when first creating the pool. - */ - ASSERT(tx->tx_txg != TXG_INITIAL); - - ASSERT(SPA_VERSION_IS_SUPPORTED(version)); - ASSERT(version >= spa_version(spa)); - - spa->spa_uberblock.ub_version = version; - vdev_config_dirty(spa->spa_root_vdev); - spa_history_log_internal(spa, "set", tx, "version=%lld", version); -} - -/* - * Set zpool properties. - */ -static void -spa_sync_props(void *arg, dmu_tx_t *tx) -{ - nvlist_t *nvp = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = spa->spa_meta_objset; - nvpair_t *elem = NULL; - - mutex_enter(&spa->spa_props_lock); - - while ((elem = nvlist_next_nvpair(nvp, elem))) { - uint64_t intval; - char *strval, *fname; - zpool_prop_t prop; - const char *propname; - zprop_type_t proptype; - spa_feature_t fid; - - switch (prop = zpool_name_to_prop(nvpair_name(elem))) { - case ZPOOL_PROP_INVAL: - /* - * We checked this earlier in spa_prop_validate(). - */ - ASSERT(zpool_prop_feature(nvpair_name(elem))); - - fname = strchr(nvpair_name(elem), '@') + 1; - VERIFY0(zfeature_lookup_name(fname, &fid)); - - spa_feature_enable(spa, fid, tx); - spa_history_log_internal(spa, "set", tx, - "%s=enabled", nvpair_name(elem)); - break; - - case ZPOOL_PROP_VERSION: - intval = fnvpair_value_uint64(elem); - /* - * The version is synced seperatly before other - * properties and should be correct by now. - */ - ASSERT3U(spa_version(spa), >=, intval); - break; - - case ZPOOL_PROP_ALTROOT: - /* - * 'altroot' is a non-persistent property. It should - * have been set temporarily at creation or import time. - */ - ASSERT(spa->spa_root != NULL); - break; - - case ZPOOL_PROP_READONLY: - case ZPOOL_PROP_CACHEFILE: - /* - * 'readonly' and 'cachefile' are also non-persisitent - * properties. - */ - break; - case ZPOOL_PROP_COMMENT: - strval = fnvpair_value_string(elem); - if (spa->spa_comment != NULL) - spa_strfree(spa->spa_comment); - spa->spa_comment = spa_strdup(strval); - /* - * We need to dirty the configuration on all the vdevs - * so that their labels get updated. It's unnecessary - * to do this for pool creation since the vdev's - * configuratoin has already been dirtied. - */ - if (tx->tx_txg != TXG_INITIAL) - vdev_config_dirty(spa->spa_root_vdev); - spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); - break; - default: - /* - * Set pool property values in the poolprops mos object. 
- */ - if (spa->spa_pool_props_object == 0) { - spa->spa_pool_props_object = - zap_create_link(mos, DMU_OT_POOL_PROPS, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, - tx); - } - - /* normalize the property name */ - propname = zpool_prop_to_name(prop); - proptype = zpool_prop_get_type(prop); - - if (nvpair_type(elem) == DATA_TYPE_STRING) { - ASSERT(proptype == PROP_TYPE_STRING); - strval = fnvpair_value_string(elem); - VERIFY0(zap_update(mos, - spa->spa_pool_props_object, propname, - 1, strlen(strval) + 1, strval, tx)); - spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); - } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { - intval = fnvpair_value_uint64(elem); - - if (proptype == PROP_TYPE_INDEX) { - const char *unused; - VERIFY0(zpool_prop_index_to_string( - prop, intval, &unused)); - } - VERIFY0(zap_update(mos, - spa->spa_pool_props_object, propname, - 8, 1, &intval, tx)); - spa_history_log_internal(spa, "set", tx, - "%s=%lld", nvpair_name(elem), intval); - } else { - ASSERT(0); /* not allowed */ - } - - switch (prop) { - case ZPOOL_PROP_DELEGATION: - spa->spa_delegation = intval; - break; - case ZPOOL_PROP_BOOTFS: - spa->spa_bootfs = intval; - break; - case ZPOOL_PROP_FAILUREMODE: - spa->spa_failmode = intval; - break; - case ZPOOL_PROP_AUTOEXPAND: - spa->spa_autoexpand = intval; - if (tx->tx_txg != TXG_INITIAL) - spa_async_request(spa, - SPA_ASYNC_AUTOEXPAND); - break; - case ZPOOL_PROP_MULTIHOST: - spa->spa_multihost = intval; - break; - case ZPOOL_PROP_DEDUPDITTO: - spa->spa_dedup_ditto = intval; - break; - default: - break; - } - } - - } - - mutex_exit(&spa->spa_props_lock); -} - -/* - * Perform one-time upgrade on-disk changes. spa_version() does not - * reflect the new version this txg, so there must be no changes this - * txg to anything that the upgrade code depends on after it executes. - * Therefore this must be called after dsl_pool_sync() does the sync - * tasks. - */ -static void -spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) -{ - dsl_pool_t *dp = spa->spa_dsl_pool; - - ASSERT(spa->spa_sync_pass == 1); - - rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - - if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && - spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { - dsl_pool_create_origin(dp, tx); - - /* Keeping the origin open increases spa_minref */ - spa->spa_minref += 3; - } - - if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && - spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { - dsl_pool_upgrade_clones(dp, tx); - } - - if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && - spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { - dsl_pool_upgrade_dir_clones(dp, tx); - - /* Keeping the freedir open increases spa_minref */ - spa->spa_minref += 3; - } - - if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && - spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { - spa_feature_create_zap_objects(spa, tx); - } - - /* - * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable - * when possibility to use lz4 compression for metadata was added - * Old pools that have this feature enabled must be upgraded to have - * this feature active - */ - if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { - boolean_t lz4_en = spa_feature_is_enabled(spa, - SPA_FEATURE_LZ4_COMPRESS); - boolean_t lz4_ac = spa_feature_is_active(spa, - SPA_FEATURE_LZ4_COMPRESS); - - if (lz4_en && !lz4_ac) - spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); - } - - /* - * If we haven't written the salt, do so now. 
Note that the - * feature may not be activated yet, but that's fine since - * the presence of this ZAP entry is backwards compatible. - */ - if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CHECKSUM_SALT) == ENOENT) { - VERIFY0(zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, - sizeof (spa->spa_cksum_salt.zcs_bytes), - spa->spa_cksum_salt.zcs_bytes, tx)); - } - - rrw_exit(&dp->dp_config_rwlock, FTAG); -} - -static void -vdev_indirect_state_sync_verify(vdev_t *vd) -{ - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - vdev_indirect_births_t *vib = vd->vdev_indirect_births; - - if (vd->vdev_ops == &vdev_indirect_ops) { - ASSERT(vim != NULL); - ASSERT(vib != NULL); - } - - if (vdev_obsolete_sm_object(vd) != 0) { - ASSERT(vd->vdev_obsolete_sm != NULL); - ASSERT(vd->vdev_removing || - vd->vdev_ops == &vdev_indirect_ops); - ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); - ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); - - ASSERT3U(vdev_obsolete_sm_object(vd), ==, - space_map_object(vd->vdev_obsolete_sm)); - ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, - space_map_allocated(vd->vdev_obsolete_sm)); - } - ASSERT(vd->vdev_obsolete_segments != NULL); - - /* - * Since frees / remaps to an indirect vdev can only - * happen in syncing context, the obsolete segments - * tree must be empty when we start syncing. - */ - ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); -} - -/* - * Sync the specified transaction group. New blocks may be dirtied as - * part of the process, so we iterate until it converges. - */ -void -spa_sync(spa_t *spa, uint64_t txg) -{ - dsl_pool_t *dp = spa->spa_dsl_pool; - objset_t *mos = spa->spa_meta_objset; - bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; - metaslab_class_t *normal = spa_normal_class(spa); - metaslab_class_t *special = spa_special_class(spa); - metaslab_class_t *dedup = spa_dedup_class(spa); - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd; - dmu_tx_t *tx; - int error; - uint32_t max_queue_depth = zfs_vdev_async_write_max_active * - zfs_vdev_queue_depth_pct / 100; - - VERIFY(spa_writeable(spa)); - - /* - * Wait for i/os issued in open context that need to complete - * before this txg syncs. - */ - (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); - spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL); - - /* - * Lock out configuration changes. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - spa->spa_syncing_txg = txg; - spa->spa_sync_pass = 0; - - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); - } - - /* - * If there are any pending vdev state changes, convert them - * into config changes that go out with this transaction group. - */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while (list_head(&spa->spa_state_dirty_list) != NULL) { - /* - * We need the write lock here because, for aux vdevs, - * calling vdev_config_dirty() modifies sav_config. - * This is ugly and will become unnecessary when we - * eliminate the aux vdev wart by integrating all vdevs - * into the root vdev tree. 
- */ - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); - while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { - vdev_state_clean(vd); - vdev_config_dirty(vd); - } - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); - } - spa_config_exit(spa, SCL_STATE, FTAG); - - tx = dmu_tx_create_assigned(dp, txg); - - spa->spa_sync_starttime = gethrtime(); -#ifdef illumos - VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, - spa->spa_sync_starttime + spa->spa_deadman_synctime)); -#else /* !illumos */ -#ifdef _KERNEL - callout_schedule(&spa->spa_deadman_cycid, - hz * spa->spa_deadman_synctime / NANOSEC); -#endif -#endif /* illumos */ - - /* - * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, - * set spa_deflate if we have no raid-z vdevs. - */ - if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && - spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { - int i; - - for (i = 0; i < rvd->vdev_children; i++) { - vd = rvd->vdev_child[i]; - if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) - break; - } - if (i == rvd->vdev_children) { - spa->spa_deflate = TRUE; - VERIFY(0 == zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate, tx)); - } - } - - /* - * Set the top-level vdev's max queue depth. Evaluate each - * top-level's async write queue depth in case it changed. - * The max queue depth will not change in the middle of syncing - * out this txg. - */ - uint64_t slots_per_allocator = 0; - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - metaslab_class_t *mc; - - if (mg == NULL || !metaslab_group_initialized(mg)) - continue; - - mc = mg->mg_class; - if (mc != normal && mc != special && mc != dedup) - continue; - - /* - * It is safe to do a lock-free check here because only async - * allocations look at mg_max_alloc_queue_depth, and async - * allocations all happen from spa_sync(). - */ - for (int i = 0; i < spa->spa_alloc_count; i++) - ASSERT0(zfs_refcount_count( - &(mg->mg_alloc_queue_depth[i]))); - mg->mg_max_alloc_queue_depth = max_queue_depth; - - for (int i = 0; i < spa->spa_alloc_count; i++) { - mg->mg_cur_max_alloc_queue_depth[i] = - zfs_vdev_def_queue_depth; - } - slots_per_allocator += zfs_vdev_def_queue_depth; - } - - for (int i = 0; i < spa->spa_alloc_count; i++) { - ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); - normal->mc_alloc_max_slots[i] = slots_per_allocator; - special->mc_alloc_max_slots[i] = slots_per_allocator; - dedup->mc_alloc_max_slots[i] = slots_per_allocator; - } - normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - vdev_indirect_state_sync_verify(vd); - - if (vdev_indirect_should_condense(vd)) { - spa_condense_indirect_start_sync(vd, tx); - break; - } - } - - /* - * Iterate to convergence. 
- */ - do { - int pass = ++spa->spa_sync_pass; - - spa_sync_config_object(spa, tx); - spa_sync_aux_dev(spa, &spa->spa_spares, tx, - ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); - spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, - ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); - spa_errlog_sync(spa, txg); - dsl_pool_sync(dp, txg); - - if (pass < zfs_sync_pass_deferred_free) { - spa_sync_frees(spa, free_bpl, tx); - } else { - /* - * We can not defer frees in pass 1, because - * we sync the deferred frees later in pass 1. - */ - ASSERT3U(pass, >, 1); - bplist_iterate(free_bpl, bpobj_enqueue_cb, - &spa->spa_deferred_bpobj, tx); - } - - ddt_sync(spa, txg); - dsl_scan_sync(dp, tx); - - if (spa->spa_vdev_removal != NULL) - svr_sync(spa, tx); - - while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) - != NULL) - vdev_sync(vd, txg); - - if (pass == 1) { - spa_sync_upgrades(spa, tx); - ASSERT3U(txg, >=, - spa->spa_uberblock.ub_rootbp.blk_birth); - /* - * Note: We need to check if the MOS is dirty - * because we could have marked the MOS dirty - * without updating the uberblock (e.g. if we - * have sync tasks but no dirty user data). We - * need to check the uberblock's rootbp because - * it is updated if we have synced out dirty - * data (though in this case the MOS will most - * likely also be dirty due to second order - * effects, we don't want to rely on that here). - */ - if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && - !dmu_objset_is_dirty(mos, txg)) { - /* - * Nothing changed on the first pass, - * therefore this TXG is a no-op. Avoid - * syncing deferred frees, so that we - * can keep this TXG as a no-op. - */ - ASSERT(txg_list_empty(&dp->dp_dirty_datasets, - txg)); - ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); - ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); - ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, - txg)); - break; - } - spa_sync_deferred_frees(spa, tx); - } - - } while (dmu_objset_is_dirty(mos, txg)); - - if (!list_is_empty(&spa->spa_config_dirty_list)) { - /* - * Make sure that the number of ZAPs for all the vdevs matches - * the number of ZAPs in the per-vdev ZAP list. This only gets - * called if the config is dirty; otherwise there may be - * outstanding AVZ operations that weren't completed in - * spa_sync_config_object. - */ - uint64_t all_vdev_zap_entry_count; - ASSERT0(zap_count(spa->spa_meta_objset, - spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); - ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, - all_vdev_zap_entry_count); - } - - if (spa->spa_vdev_removal != NULL) { - ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); - } - - /* - * Rewrite the vdev configuration (which includes the uberblock) - * to commit the transaction group. - * - * If there are no dirty vdevs, we sync the uberblock to a few - * random top-level vdevs that are known to be visible in the - * config cache (see spa_vdev_add() for a complete description). - * If there *are* dirty vdevs, sync the uberblock to all vdevs. - */ - for (;;) { - /* - * We hold SCL_STATE to prevent vdev open/close/etc. - * while we're attempting to write the vdev labels. 
- */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - if (list_is_empty(&spa->spa_config_dirty_list)) { - vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; - int svdcount = 0; - int children = rvd->vdev_children; - int c0 = spa_get_random(children); - - for (int c = 0; c < children; c++) { - vd = rvd->vdev_child[(c0 + c) % children]; - - /* Stop when revisiting the first vdev */ - if (c > 0 && svd[0] == vd) - break; - - if (vd->vdev_ms_array == 0 || vd->vdev_islog || - !vdev_is_concrete(vd)) - continue; - - svd[svdcount++] = vd; - if (svdcount == SPA_SYNC_MIN_VDEVS) - break; - } - error = vdev_config_sync(svd, svdcount, txg); - } else { - error = vdev_config_sync(rvd->vdev_child, - rvd->vdev_children, txg); - } - - if (error == 0) - spa->spa_last_synced_guid = rvd->vdev_guid; - - spa_config_exit(spa, SCL_STATE, FTAG); - - if (error == 0) - break; - zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); - zio_resume_wait(spa); - } - dmu_tx_commit(tx); - -#ifdef illumos - VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); -#else /* !illumos */ -#ifdef _KERNEL - callout_drain(&spa->spa_deadman_cycid); -#endif -#endif /* illumos */ - - /* - * Clear the dirty config list. - */ - while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) - vdev_config_clean(vd); - - /* - * Now that the new config has synced transactionally, - * let it become visible to the config cache. - */ - if (spa->spa_config_syncing != NULL) { - spa_config_set(spa, spa->spa_config_syncing); - spa->spa_config_txg = txg; - spa->spa_config_syncing = NULL; - } - - dsl_pool_sync_done(dp, txg); - - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_enter(&spa->spa_alloc_locks[i]); - VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); - mutex_exit(&spa->spa_alloc_locks[i]); - } - - /* - * Update usable space statistics. - */ - while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) - != NULL) - vdev_sync_done(vd, txg); - - spa_update_dspace(spa); - - /* - * It had better be the case that we didn't dirty anything - * since vdev_config_sync(). - */ - ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); - ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); - ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); - - while (zfs_pause_spa_sync) - delay(1); - - spa->spa_sync_pass = 0; - - /* - * Update the last synced uberblock here. We want to do this at - * the end of spa_sync() so that consumers of spa_last_synced_txg() - * will be guaranteed that all the processing associated with - * that txg has been completed. - */ - spa->spa_ubsync = spa->spa_uberblock; - spa_config_exit(spa, SCL_CONFIG, FTAG); - - spa_handle_ignored_writes(spa); - - /* - * If any async tasks have been requested, kick them off. - */ - spa_async_dispatch(spa); - spa_async_dispatch_vd(spa); -} - -/* - * Sync all pools. We don't want to hold the namespace lock across these - * operations, so we take a reference on the spa_t and drop the lock during the - * sync. 
- */ -void -spa_sync_allpools(void) -{ - spa_t *spa = NULL; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE || - !spa_writeable(spa) || spa_suspended(spa)) - continue; - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - txg_wait_synced(spa_get_dsl(spa), 0); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - } - mutex_exit(&spa_namespace_lock); -} - -/* - * ========================================================================== - * Miscellaneous routines - * ========================================================================== - */ - -/* - * Remove all pools in the system. - */ -void -spa_evict_all(void) -{ - spa_t *spa; - - /* - * Remove all cached state. All pools should be closed now, - * so every spa in the AVL tree should be unreferenced. - */ - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(NULL)) != NULL) { - /* - * Stop async tasks. The async thread may need to detach - * a device that's been replaced, which requires grabbing - * spa_namespace_lock, so we must drop it here. - */ - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - spa_async_suspend(spa); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); - - if (spa->spa_state != POOL_STATE_UNINITIALIZED) { - spa_unload(spa); - spa_deactivate(spa); - } - spa_remove(spa); - } - mutex_exit(&spa_namespace_lock); -} - -vdev_t * -spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) -{ - vdev_t *vd; - int i; - - if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) - return (vd); - - if (aux) { - for (i = 0; i < spa->spa_l2cache.sav_count; i++) { - vd = spa->spa_l2cache.sav_vdevs[i]; - if (vd->vdev_guid == guid) - return (vd); - } - - for (i = 0; i < spa->spa_spares.sav_count; i++) { - vd = spa->spa_spares.sav_vdevs[i]; - if (vd->vdev_guid == guid) - return (vd); - } - } - - return (NULL); -} - -void -spa_upgrade(spa_t *spa, uint64_t version) -{ - ASSERT(spa_writeable(spa)); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - - /* - * This should only be called for a non-faulted pool, and since a - * future version would result in an unopenable pool, this shouldn't be - * possible. - */ - ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); - ASSERT3U(version, >=, spa->spa_uberblock.ub_version); - - spa->spa_uberblock.ub_version = version; - vdev_config_dirty(spa->spa_root_vdev); - - spa_config_exit(spa, SCL_ALL, FTAG); - - txg_wait_synced(spa_get_dsl(spa), 0); -} - -boolean_t -spa_has_spare(spa_t *spa, uint64_t guid) -{ - int i; - uint64_t spareguid; - spa_aux_vdev_t *sav = &spa->spa_spares; - - for (i = 0; i < sav->sav_count; i++) - if (sav->sav_vdevs[i]->vdev_guid == guid) - return (B_TRUE); - - for (i = 0; i < sav->sav_npending; i++) { - if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, - &spareguid) == 0 && spareguid == guid) - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * Check if a pool has an active shared spare device. 
- * Note: reference count of an active spare is 2, as a spare and as a replace - */ -static boolean_t -spa_has_active_shared_spare(spa_t *spa) -{ - int i, refcnt; - uint64_t pool; - spa_aux_vdev_t *sav = &spa->spa_spares; - - for (i = 0; i < sav->sav_count; i++) { - if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, - &refcnt) && pool != 0ULL && pool == spa_guid(spa) && - refcnt > 2) - return (B_TRUE); - } - - return (B_FALSE); -} - -sysevent_t * -spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) -{ - sysevent_t *ev = NULL; -#ifdef _KERNEL - sysevent_attr_list_t *attr = NULL; - sysevent_value_t value; - - ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", - SE_SLEEP); - ASSERT(ev != NULL); - - value.value_type = SE_DATA_TYPE_STRING; - value.value.sv_string = spa_name(spa); - if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) - goto done; - - value.value_type = SE_DATA_TYPE_UINT64; - value.value.sv_uint64 = spa_guid(spa); - if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) - goto done; - - if (vd) { - value.value_type = SE_DATA_TYPE_UINT64; - value.value.sv_uint64 = vd->vdev_guid; - if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, - SE_SLEEP) != 0) - goto done; - - if (vd->vdev_path) { - value.value_type = SE_DATA_TYPE_STRING; - value.value.sv_string = vd->vdev_path; - if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, - &value, SE_SLEEP) != 0) - goto done; - } - } - - if (hist_nvl != NULL) { - fnvlist_merge((nvlist_t *)attr, hist_nvl); - } - - if (sysevent_attach_attributes(ev, attr) != 0) - goto done; - attr = NULL; - -done: - if (attr) - sysevent_free_attr(attr); - -#endif - return (ev); -} - -void -spa_event_post(sysevent_t *ev) -{ -#ifdef _KERNEL - sysevent_id_t eid; - - (void) log_sysevent(ev, SE_SLEEP, &eid); - sysevent_free(ev); -#endif -} - -void -spa_event_discard(sysevent_t *ev) -{ -#ifdef _KERNEL - sysevent_free(ev); -#endif -} - -/* - * Post a sysevent corresponding to the given event. The 'name' must be one of - * the event definitions in sys/sysevent/eventdefs.h. The payload will be - * filled in from the spa and (optionally) the vdev and history nvl. This - * doesn't do anything in the userland libzpool, as we don't want consumers to - * misinterpret ztest or zdb as real changes. - */ -void -spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) -{ - spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c +++ /dev/null @@ -1,623 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -/* - * Storage Pool Checkpoint - * - * A storage pool checkpoint can be thought of as a pool-wide snapshot or - * a stable version of extreme rewind that guarantees no blocks from the - * checkpointed state will have been overwritten. It remembers the entire - * state of the storage pool (e.g. snapshots, dataset names, etc..) from the - * point that it was taken and the user can rewind back to that point even if - * they applied destructive operations on their datasets or even enabled new - * zpool on-disk features. If a pool has a checkpoint that is no longer - * needed, the user can discard it. - * - * == On disk data structures used == - * - * - The pool has a new feature flag and a new entry in the MOS. The feature - * flag is set to active when we create the checkpoint and remains active - * until the checkpoint is fully discarded. The entry in the MOS config - * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that - * references the state of the pool when we take the checkpoint. The entry - * remains populated until we start discarding the checkpoint or we rewind - * back to it. - * - * - Each vdev contains a vdev-wide space map while the pool has a checkpoint, - * which persists until the checkpoint is fully discarded. The space map - * contains entries that have been freed in the current state of the pool - * but we want to keep around in case we decide to rewind to the checkpoint. - * [see vdev_checkpoint_sm] - * - * - Each metaslab's ms_sm space map behaves the same as without the - * checkpoint, with the only exception being the scenario when we free - * blocks that belong to the checkpoint. In this case, these blocks remain - * ALLOCATED in the metaslab's space map and they are added as FREE in the - * vdev's checkpoint space map. - * - * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that - * the uberblock was checkpointed. For normal uberblocks this field is 0. - * - * == Overview of operations == - * - * - To create a checkpoint, we first wait for the current TXG to be synced, - * so we can use the most recently synced uberblock (spa_ubsync) as the - * checkpointed uberblock. Then we use an early synctask to place that - * uberblock in MOS config, increment the feature flag for the checkpoint - * (marking it active), and setting spa_checkpoint_txg (see its use below) - * to the TXG of the checkpointed uberblock. We use an early synctask for - * the aforementioned operations to ensure that no blocks were dirtied - * between the current TXG and the TXG of the checkpointed uberblock - * (e.g the previous txg). - * - * - When a checkpoint exists, we need to ensure that the blocks that - * belong to the checkpoint are freed but never reused. This means that - * these blocks should never end up in the ms_allocatable or the ms_freeing - * trees of a metaslab. Therefore, whenever there is a checkpoint the new - * ms_checkpointing tree is used in addition to the aforementioned ones. - * - * Whenever a block is freed and we find out that it is referenced by the - * checkpoint (we find out by comparing its birth to spa_checkpoint_txg), - * we place it in the ms_checkpointing tree instead of the ms_freeingtree. 
- * This way, we divide the blocks that are being freed into checkpointed - * and not-checkpointed blocks. - * - * In order to persist these frees, we write the extents from the - * ms_freeingtree to the ms_sm as usual, and the extents from the - * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these - * checkpointed extents will remain allocated in the metaslab's ms_sm space - * map, and therefore won't be reused [see metaslab_sync()]. In addition, - * when we discard the checkpoint, we can find the entries that have - * actually been freed in vdev_checkpoint_sm. - * [see spa_checkpoint_discard_thread_sync()] - * - * - To discard the checkpoint we use an early synctask to delete the - * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0, - * and wakeup the discarding zthr thread (an open-context async thread). - * We use an early synctask to ensure that the operation happens before any - * new data end up in the checkpoint's data structures. - * - * Once the synctask is done and the discarding zthr is awake, we discard - * the checkpointed data over multiple TXGs by having the zthr prefetching - * entries from vdev_checkpoint_sm and then starting a synctask that places - * them as free blocks in to their respective ms_allocatable and ms_sm - * structures. - * [see spa_checkpoint_discard_thread()] - * - * When there are no entries left in the vdev_checkpoint_sm of all - * top-level vdevs, a final synctask runs that decrements the feature flag. - * - * - To rewind to the checkpoint, we first use the current uberblock and - * open the MOS so we can access the checkpointed uberblock from the MOS - * config. After we retrieve the checkpointed uberblock, we use it as the - * current uberblock for the pool by writing it to disk with an updated - * TXG, opening its version of the MOS, and moving on as usual from there. - * [see spa_ld_checkpoint_rewind()] - * - * An important note on rewinding to the checkpoint has to do with how we - * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL - * blocks that have not been claimed by the time we took the checkpoint - * as they should no longer be valid. - * [see comment in zil_claim()] - * - * == Miscellaneous information == - * - * - In the hypothetical event that we take a checkpoint, remove a vdev, - * and attempt to rewind, the rewind would fail as the checkpointed - * uberblock would reference data in the removed device. For this reason - * and others of similar nature, we disallow the following operations that - * can change the config: - * vdev removal and attach/detach, mirror splitting, and pool reguid. - * - * - As most of the checkpoint logic is implemented in the SPA and doesn't - * distinguish datasets when it comes to space accounting, having a - * checkpoint can potentially break the boundaries set by dataset - * reservations. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * The following parameter limits the amount of memory to be used for the - * prefetching of the checkpoint space map done on each vdev while - * discarding the checkpoint. - * - * The reason it exists is because top-level vdevs with long checkpoint - * space maps can potentially take up a lot of memory depending on the - * amount of checkpointed data that has been freed within them while - * the pool had a checkpoint. 
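For readers tracing the checkpoint lifecycle described above from userland, a minimal consumer sketch follows. It is illustrative only and assumes the libzfs_core wrappers lzc_pool_checkpoint() and lzc_pool_checkpoint_discard() exist with the signatures used here (pool name in, errno-style value out); the helper name is hypothetical.

/*
 * Illustrative sketch (assumed API): create a pool checkpoint, perform the
 * risky administrative work it is meant to protect, then discard it.
 */
#include <libzfs_core.h>

static int
checkpoint_roundtrip(const char *pool)
{
	int err;

	if ((err = libzfs_core_init()) != 0)
		return (err);

	/* Assumed to reach the kernel-side create path via an ioctl. */
	err = lzc_pool_checkpoint(pool);
	if (err == 0) {
		/* ... apply the changes the checkpoint should protect ... */
		err = lzc_pool_checkpoint_discard(pool);
	}

	libzfs_core_fini();
	return (err);
}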
- */ -uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024; - -int -spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) -{ - if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); - - bzero(pcs, sizeof (pool_checkpoint_stat_t)); - - int error = zap_contains(spa_meta_objset(spa), - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); - ASSERT(error == 0 || error == ENOENT); - - if (error == ENOENT) - pcs->pcs_state = CS_CHECKPOINT_DISCARDING; - else - pcs->pcs_state = CS_CHECKPOINT_EXISTS; - - pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; - pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; - - return (0); -} - -static void -spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = arg; - - spa->spa_checkpoint_info.sci_timestamp = 0; - - spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); - - spa_history_log_internal(spa, "spa discard checkpoint", tx, - "finished discarding checkpointed state from the pool"); -} - -typedef struct spa_checkpoint_discard_sync_callback_arg { - vdev_t *sdc_vd; - uint64_t sdc_txg; - uint64_t sdc_entry_limit; -} spa_checkpoint_discard_sync_callback_arg_t; - -static int -spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) -{ - spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; - vdev_t *vd = sdc->sdc_vd; - metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; - uint64_t end = sme->sme_offset + sme->sme_run; - - if (sdc->sdc_entry_limit == 0) - return (EINTR); - - /* - * Since the space map is not condensed, we know that - * none of its entries is crossing the boundaries of - * its respective metaslab. - * - * That said, there is no fundamental requirement that - * the checkpoint's space map entries should not cross - * metaslab boundaries. So if needed we could add code - * that handles metaslab-crossing segments in the future. - */ - VERIFY3U(sme->sme_type, ==, SM_FREE); - VERIFY3U(sme->sme_offset, >=, ms->ms_start); - VERIFY3U(end, <=, ms->ms_start + ms->ms_size); - - /* - * At this point we should not be processing any - * other frees concurrently, so the lock is technically - * unnecessary. We use the lock anyway though to - * potentially save ourselves from future headaches. 
- */ - mutex_enter(&ms->ms_lock); - if (range_tree_is_empty(ms->ms_freeing)) - vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); - range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); - mutex_exit(&ms->ms_lock); - - ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, - sme->sme_run); - ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); - - vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; - vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; - sdc->sdc_entry_limit--; - - return (0); -} - -static void -spa_checkpoint_accounting_verify(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t ckpoint_sm_space_sum = 0; - uint64_t vs_ckpoint_space_sum = 0; - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - - if (vd->vdev_checkpoint_sm != NULL) { - ckpoint_sm_space_sum += - -space_map_allocated(vd->vdev_checkpoint_sm); - vs_ckpoint_space_sum += - vd->vdev_stat.vs_checkpoint_space; - ASSERT3U(ckpoint_sm_space_sum, ==, - vs_ckpoint_space_sum); - } else { - ASSERT0(vd->vdev_stat.vs_checkpoint_space); - } - } - ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum); -} - -static void -spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) -{ - vdev_t *vd = arg; - int error; - - /* - * The space map callback is applied only to non-debug entries. - * Because the number of debug entries is less or equal to the - * number of non-debug entries, we want to ensure that we only - * read what we prefetched from open-context. - * - * Thus, we set the maximum entries that the space map callback - * will be applied to be half the entries that could fit in the - * imposed memory limit. - * - * Note that since this is a conservative estimate we also - * assume the worst case scenario in our computation where each - * entry is two-word. - */ - uint64_t max_entry_limit = - (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; - - /* - * Iterate from the end of the space map towards the beginning, - * placing its entries on ms_freeing and removing them from the - * space map. The iteration stops if one of the following - * conditions is true: - * - * 1] We reached the beginning of the space map. At this point - * the space map should be completely empty and - * space_map_incremental_destroy should have returned 0. - * The next step would be to free and close the space map - * and remove its entry from its vdev's top zap. This allows - * spa_checkpoint_discard_thread() to move on to the next vdev. - * - * 2] We reached the memory limit (amount of memory used to hold - * space map entries in memory) and space_map_incremental_destroy - * returned EINTR. This means that there are entries remaining - * in the space map that will be cleared in a future invocation - * of this function by spa_checkpoint_discard_thread(). 
- */ - spa_checkpoint_discard_sync_callback_arg_t sdc; - sdc.sdc_vd = vd; - sdc.sdc_txg = tx->tx_txg; - sdc.sdc_entry_limit = max_entry_limit; - - uint64_t words_before = - space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); - - error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, - spa_checkpoint_discard_sync_callback, &sdc, tx); - - uint64_t words_after = - space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); - -#ifdef DEBUG - spa_checkpoint_accounting_verify(vd->vdev_spa); -#endif - - zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " - "deleted %llu words - %llu words are left", - tx->tx_txg, vd->vdev_id, (words_before - words_after), - words_after); - - if (error != EINTR) { - if (error != 0) { - zfs_panic_recover("zfs: error %d was returned " - "while incrementally destroying the checkpoint " - "space map of vdev %llu\n", - error, vd->vdev_id); - } - ASSERT0(words_after); - ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); - ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); - - space_map_free(vd->vdev_checkpoint_sm, tx); - space_map_close(vd->vdev_checkpoint_sm); - vd->vdev_checkpoint_sm = NULL; - - VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), - vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); - } -} - -static boolean_t -spa_checkpoint_discard_is_done(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - ASSERT(!spa_has_checkpoint(spa)); - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) - return (B_FALSE); - ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); - } - - return (B_TRUE); -} - -/* ARGSUSED */ -boolean_t -spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - - if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (B_FALSE); - - if (spa_has_checkpoint(spa)) - return (B_FALSE); - - return (B_TRUE); -} - -void -spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - vdev_t *rvd = spa->spa_root_vdev; - - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - - while (vd->vdev_checkpoint_sm != NULL) { - space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; - int numbufs; - dmu_buf_t **dbp; - - if (zthr_iscancelled(zthr)) - return; - - ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); - - uint64_t size = MIN(space_map_length(checkpoint_sm), - zfs_spa_discard_memory_limit); - uint64_t offset = - space_map_length(checkpoint_sm) - size; - - /* - * Ensure that the part of the space map that will - * be destroyed by the synctask, is prefetched in - * memory before the synctask runs. 
- */ - int error = dmu_buf_hold_array_by_bonus( - checkpoint_sm->sm_dbuf, offset, size, - B_TRUE, FTAG, &numbufs, &dbp); - if (error != 0) { - zfs_panic_recover("zfs: error %d was returned " - "while prefetching checkpoint space map " - "entries of vdev %llu\n", - error, vd->vdev_id); - } - - VERIFY0(dsl_sync_task(spa->spa_name, NULL, - spa_checkpoint_discard_thread_sync, vd, - 0, ZFS_SPACE_CHECK_NONE)); - - dmu_buf_rele_array(dbp, numbufs, FTAG); - } - } - - VERIFY(spa_checkpoint_discard_is_done(spa)); - VERIFY0(spa->spa_checkpoint_info.sci_dspace); - VERIFY0(dsl_sync_task(spa->spa_name, NULL, - spa_checkpoint_discard_complete_sync, spa, - 0, ZFS_SPACE_CHECK_NONE)); -} - - -/* ARGSUSED */ -static int -spa_checkpoint_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (SET_ERROR(ENOTSUP)); - - if (!spa_top_vdevs_spacemap_addressable(spa)) - return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); - - if (spa->spa_vdev_removal != NULL) - return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); - - if (spa->spa_checkpoint_txg != 0) - return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); - - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); - - return (0); -} - -/* ARGSUSED */ -static void -spa_checkpoint_sync(void *arg, dmu_tx_t *tx) -{ - dsl_pool_t *dp = dmu_tx_pool(tx); - spa_t *spa = dp->dp_spa; - uberblock_t checkpoint = spa->spa_ubsync; - - /* - * At this point, there should not be a checkpoint in the MOS. - */ - ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); - - ASSERT0(spa->spa_checkpoint_info.sci_timestamp); - ASSERT0(spa->spa_checkpoint_info.sci_dspace); - - /* - * Since the checkpointed uberblock is the one that just got synced - * (we use spa_ubsync), its txg must be equal to the txg number of - * the txg we are syncing, minus 1. - */ - ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); - - /* - * Once the checkpoint is in place, we need to ensure that none of - * its blocks will be marked for reuse after it has been freed. - * When there is a checkpoint and a block is freed, we compare its - * birth txg to the txg of the checkpointed uberblock to see if the - * block is part of the checkpoint or not. Therefore, we have to set - * spa_checkpoint_txg before any frees happen in this txg (which is - * why this is done as an early_synctask as explained in the comment - * in spa_checkpoint()). - */ - spa->spa_checkpoint_txg = checkpoint.ub_txg; - spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; - - checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; - VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, - sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), - &checkpoint, tx)); - - /* - * Increment the feature refcount and thus activate the feature. - * Note that the feature will be deactivated when we've - * completely discarded all checkpointed state (both vdev - * space maps and uberblock). - */ - spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); - - spa_history_log_internal(spa, "spa checkpoint", tx, - "checkpointed uberblock txg=%llu", checkpoint.ub_txg); -} - -/* - * Create a checkpoint for the pool. 
- */ -int -spa_checkpoint(const char *pool) -{ - int error; - spa_t *spa; - - error = spa_open(pool, &spa, FTAG); - if (error != 0) - return (error); - - mutex_enter(&spa->spa_vdev_top_lock); - - /* - * Wait for current syncing txg to finish so the latest synced - * uberblock (spa_ubsync) has all the changes that we expect - * to see if we were to revert later to the checkpoint. In other - * words we want the checkpointed uberblock to include/reference - * all the changes that were pending at the time that we issued - * the checkpoint command. - */ - txg_wait_synced(spa_get_dsl(spa), 0); - - /* - * As the checkpointed uberblock references blocks from the previous - * txg (spa_ubsync) we want to ensure that are not freeing any of - * these blocks in the same txg that the following synctask will - * run. Thus, we run it as an early synctask, so the dirty changes - * that are synced to disk afterwards during zios and other synctasks - * do not reuse checkpointed blocks. - */ - error = dsl_early_sync_task(pool, spa_checkpoint_check, - spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL); - - mutex_exit(&spa->spa_vdev_top_lock); - - spa_close(spa, FTAG); - return (error); -} - -/* ARGSUSED */ -static int -spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) - return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); - - if (spa->spa_checkpoint_txg == 0) - return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); - - VERIFY0(zap_contains(spa_meta_objset(spa), - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT)); - - return (0); -} - -/* ARGSUSED */ -static void -spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ZPOOL_CHECKPOINT, tx)); - - spa->spa_checkpoint_txg = 0; - - zthr_wakeup(spa->spa_checkpoint_discard_zthr); - - spa_history_log_internal(spa, "spa discard checkpoint", tx, - "started discarding checkpointed state from the pool"); -} - -/* - * Discard the checkpoint from a pool. - */ -int -spa_checkpoint_discard(const char *pool) -{ - /* - * Similarly to spa_checkpoint(), we want our synctask to run - * before any pending dirty data are written to disk so they - * won't end up in the checkpoint's data structures (e.g. - * ms_checkpointing and vdev_checkpoint_sm) and re-create any - * space maps that the discarding open-context thread has - * deleted. - * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread] - */ - return (dsl_early_sync_task(pool, spa_checkpoint_discard_check, - spa_checkpoint_discard_sync, NULL, 0, - ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ /dev/null @@ -1,594 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2017 Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#include -#endif - -/* - * Pool configuration repository. - * - * Pool configuration is stored as a packed nvlist on the filesystem. By - * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot - * (when the ZFS module is loaded). Pools can also have the 'cachefile' - * property set that allows them to be stored in an alternate location until - * the control of external software. - * - * For each cache file, we have a single nvlist which holds all the - * configuration information. When the module loads, we read this information - * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is - * maintained independently in spa.c. Whenever the namespace is modified, or - * the configuration of a pool is changed, we call spa_write_cachefile(), which - * walks through all the active pools and writes the configuration to disk. - */ - -static uint64_t spa_config_generation = 1; - -/* - * This can be overridden in userland to preserve an alternate namespace for - * userland pools when doing testing. - */ -const char *spa_config_path = ZPOOL_CACHE; - -/* - * Called when the module is first loaded, this routine loads the configuration - * file into the SPA namespace. It does not actually open or load the pools; it - * only populates the namespace. - */ -void -spa_config_load(void) -{ - void *buf = NULL; - nvlist_t *nvlist, *child; - nvpair_t *nvpair; - char *pathname; - struct _buf *file; - uint64_t fsize; - - /* - * Open the configuration file. - */ - pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); - - file = kobj_open_file(pathname); - - kmem_free(pathname, MAXPATHLEN); - - if (file == (struct _buf *)-1) - return; - - if (kobj_get_filesize(file, &fsize) != 0) - goto out; - - buf = kmem_alloc(fsize, KM_SLEEP); - - /* - * Read the nvlist from the file. - */ - if (kobj_read_file(file, buf, fsize, 0) < 0) - goto out; - - /* - * Unpack the nvlist. - */ - if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) - goto out; - - /* - * Iterate over all elements in the nvlist, creating a new spa_t for - * each one with the specified configuration. 
- */ - mutex_enter(&spa_namespace_lock); - nvpair = NULL; - while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { - if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) - continue; - - child = fnvpair_value_nvlist(nvpair); - - if (spa_lookup(nvpair_name(nvpair)) != NULL) - continue; - (void) spa_add(nvpair_name(nvpair), child, NULL); - } - mutex_exit(&spa_namespace_lock); - - nvlist_free(nvlist); - -out: - if (buf != NULL) - kmem_free(buf, fsize); - - kobj_close_file(file); -} - -static void -spa_config_clean(nvlist_t *nvl) -{ - nvlist_t **child; - nvlist_t *nvroot = NULL; - uint_t c, children; - - if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, - &children) == 0) { - for (c = 0; c < children; c++) - spa_config_clean(child[c]); - } - - if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0) - spa_config_clean(nvroot); - - nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY); - nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY); -} - -static int -spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) -{ - size_t buflen; - char *buf; - vnode_t *vp; - int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; - char *temp; - int err; - - /* - * If the nvlist is empty (NULL), then remove the old cachefile. - */ - if (nvl == NULL) { - err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); - return (err); - } - - /* - * Pack the configuration into a buffer. - */ - buf = fnvlist_pack(nvl, &buflen); - temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - - /* - * Write the configuration to disk. We need to do the traditional - * 'write to temporary file, sync, move over original' to make sure we - * always have a consistent view of the data. - */ - (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); - - err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0); - if (err == 0) { - err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, NULL); - if (err == 0) - err = VOP_FSYNC(vp, FSYNC, kcred, NULL); - if (err == 0) - err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE); - (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); - } - - (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); - - fnvlist_pack_free(buf, buflen); - kmem_free(temp, MAXPATHLEN); - return (err); -} - -/* - * Synchronize pool configuration to disk. This must be called with the - * namespace lock held. Synchronizing the pool cache is typically done after - * the configuration has been synced to the MOS. This exposes a window where - * the MOS config will have been updated but the cache file has not. If - * the system were to crash at that instant then the cached config may not - * contain the correct information to open the pool and an explicit import - * would be required. - */ -void -spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) -{ - spa_config_dirent_t *dp, *tdp; - nvlist_t *nvl; - boolean_t ccw_failure; - int error; - char *pool_name; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - if (rootdir == NULL || !(spa_mode_global & FWRITE)) - return; - - /* - * Iterate over all cachefiles for the pool, past or present. When the - * cachefile is changed, the new one is pushed onto this list, allowing - * us to update previous cachefiles that no longer contain this pool. 
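The cachefile update path above relies on the usual crash-safe sequence: pack the nvlist, write it to a temporary file, fsync, then rename over the previous cachefile. A hedged userland analogue of that sequence, using plain POSIX calls rather than the kernel vnode interface, is sketched below; the helper name and fixed-size path buffer are illustrative assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative analogue of the temp-file/fsync/rename update pattern. */
static int
write_cache_atomically(const char *path, const void *buf, size_t len)
{
	char tmp[1024];
	int fd;

	(void) snprintf(tmp, sizeof (tmp), "%s.tmp", path);
	if ((fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC, 0644)) == -1)
		return (-1);
	if (write(fd, buf, len) != (ssize_t)len || fsync(fd) == -1 ||
	    rename(tmp, path) == -1) {
		(void) close(fd);
		(void) unlink(tmp);
		return (-1);
	}
	return (close(fd));
}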
- */ - ccw_failure = B_FALSE; - for (dp = list_head(&target->spa_config_list); dp != NULL; - dp = list_next(&target->spa_config_list, dp)) { - spa_t *spa = NULL; - if (dp->scd_path == NULL) - continue; - - /* - * Iterate over all pools, adding any matching pools to 'nvl'. - */ - nvl = NULL; - while ((spa = spa_next(spa)) != NULL) { - nvlist_t *nvroot = NULL; - /* - * Skip over our own pool if we're about to remove - * ourselves from the spa namespace or any pool that - * is readonly. Since we cannot guarantee that a - * readonly pool would successfully import upon reboot, - * we don't allow them to be written to the cache file. - */ - if ((spa == target && removing) || - (spa_state(spa) == POOL_STATE_ACTIVE && - !spa_writeable(spa))) - continue; - - mutex_enter(&spa->spa_props_lock); - tdp = list_head(&spa->spa_config_list); - if (spa->spa_config == NULL || - tdp->scd_path == NULL || - strcmp(tdp->scd_path, dp->scd_path) != 0) { - mutex_exit(&spa->spa_props_lock); - continue; - } - - if (nvl == NULL) - nvl = fnvlist_alloc(); - - if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { - pool_name = fnvlist_lookup_string( - spa->spa_config, ZPOOL_CONFIG_POOL_NAME); - } else { - pool_name = spa_name(spa); - } - - fnvlist_add_nvlist(nvl, pool_name, - spa->spa_config); - mutex_exit(&spa->spa_props_lock); - - if (nvlist_lookup_nvlist(nvl, pool_name, &nvroot) == 0) - spa_config_clean(nvroot); - } - - error = spa_config_write(dp, nvl); - if (error != 0) - ccw_failure = B_TRUE; - nvlist_free(nvl); - } - - if (ccw_failure) { - /* - * Keep trying so that configuration data is - * written if/when any temporary filesystem - * resource issues are resolved. - */ - if (target->spa_ccw_fail_time == 0) { - zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, - target, NULL, NULL, 0, 0); - } - target->spa_ccw_fail_time = gethrtime(); - spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); - } else { - /* - * Do not rate limit future attempts to update - * the config cache. - */ - target->spa_ccw_fail_time = 0; - } - - /* - * Remove any config entries older than the current one. - */ - dp = list_head(&target->spa_config_list); - while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { - list_remove(&target->spa_config_list, tdp); - if (tdp->scd_path != NULL) - spa_strfree(tdp->scd_path); - kmem_free(tdp, sizeof (spa_config_dirent_t)); - } - - spa_config_generation++; - - if (postsysevent) - spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); -} - -/* - * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, - * and we don't want to allow the local zone to see all the pools anyway. - * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration - * information for all pool visible within the zone. 
- */ -nvlist_t * -spa_all_configs(uint64_t *generation) -{ - nvlist_t *pools; - spa_t *spa = NULL; - - if (*generation == spa_config_generation) - return (NULL); - - pools = fnvlist_alloc(); - - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (INGLOBALZONE(curthread) || - zone_dataset_visible(spa_name(spa), NULL)) { - mutex_enter(&spa->spa_props_lock); - fnvlist_add_nvlist(pools, spa_name(spa), - spa->spa_config); - mutex_exit(&spa->spa_props_lock); - } - } - *generation = spa_config_generation; - mutex_exit(&spa_namespace_lock); - - return (pools); -} - -void -spa_config_set(spa_t *spa, nvlist_t *config) -{ - mutex_enter(&spa->spa_props_lock); - if (spa->spa_config != NULL && spa->spa_config != config) - nvlist_free(spa->spa_config); - spa->spa_config = config; - mutex_exit(&spa->spa_props_lock); -} - -/* - * Generate the pool's configuration based on the current in-core state. - * - * We infer whether to generate a complete config or just one top-level config - * based on whether vd is the root vdev. - */ -nvlist_t * -spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) -{ - nvlist_t *config, *nvroot; - vdev_t *rvd = spa->spa_root_vdev; - unsigned long hostid = 0; - boolean_t locked = B_FALSE; - uint64_t split_guid; - char *pool_name; - - if (vd == NULL) { - vd = rvd; - locked = B_TRUE; - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); - } - - ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == - (SCL_CONFIG | SCL_STATE)); - - /* - * If txg is -1, report the current value of spa->spa_config_txg. - */ - if (txg == -1ULL) - txg = spa->spa_config_txg; - - /* - * Originally, users had to handle spa namespace collisions by either - * exporting the already imported pool or by specifying a new name for - * the pool with a conflicting name. In the case of root pools from - * virtual guests, neither approach to collision resolution is - * reasonable. This is addressed by extending the new name syntax with - * an option to specify that the new name is temporary. When specified, - * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us - * to use the previous name, which we do below. 
- */ - if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { - pool_name = fnvlist_lookup_string(spa->spa_config, - ZPOOL_CONFIG_POOL_NAME); - } else { - pool_name = spa_name(spa); - } - - config = fnvlist_alloc(); - - fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); - fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); - fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); - fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); - fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); - if (spa->spa_comment != NULL) { - fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, - spa->spa_comment); - } - - hostid = spa_get_hostid(); - if (hostid != 0) { - fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); - } - fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename); - - int config_gen_flags = 0; - if (vd != rvd) { - fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, - vd->vdev_top->vdev_guid); - fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, - vd->vdev_guid); - if (vd->vdev_isspare) { - fnvlist_add_uint64(config, - ZPOOL_CONFIG_IS_SPARE, 1ULL); - } - if (vd->vdev_islog) { - fnvlist_add_uint64(config, - ZPOOL_CONFIG_IS_LOG, 1ULL); - } - vd = vd->vdev_top; /* label contains top config */ - } else { - /* - * Only add the (potentially large) split information - * in the mos config, and not in the vdev labels - */ - if (spa->spa_config_splitting != NULL) - fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, - spa->spa_config_splitting); - fnvlist_add_boolean(config, - ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); - - config_gen_flags |= VDEV_CONFIG_MOS; - } - - /* - * Add the top-level config. We even add this on pools which - * don't support holes in the namespace. - */ - vdev_top_config_generate(spa, config); - - /* - * If we're splitting, record the original pool's guid. - */ - if (spa->spa_config_splitting != NULL && - nvlist_lookup_uint64(spa->spa_config_splitting, - ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { - fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, - split_guid); - } - - nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); - fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); - nvlist_free(nvroot); - - /* - * Store what's necessary for reading the MOS in the label. 
- */ - fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, - spa->spa_label_features); - - if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { - ddt_histogram_t *ddh; - ddt_stat_t *dds; - ddt_object_t *ddo; - - ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); - ddt_get_dedup_histogram(spa, ddh); - fnvlist_add_uint64_array(config, - ZPOOL_CONFIG_DDT_HISTOGRAM, - (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); - kmem_free(ddh, sizeof (ddt_histogram_t)); - - ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); - ddt_get_dedup_object_stats(spa, ddo); - fnvlist_add_uint64_array(config, - ZPOOL_CONFIG_DDT_OBJ_STATS, - (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); - kmem_free(ddo, sizeof (ddt_object_t)); - - dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); - ddt_get_dedup_stats(spa, dds); - fnvlist_add_uint64_array(config, - ZPOOL_CONFIG_DDT_STATS, - (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); - kmem_free(dds, sizeof (ddt_stat_t)); - } - - if (locked) - spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); - - return (config); -} - -/* - * Update all disk labels, generate a fresh config based on the current - * in-core state, and sync the global config cache (do not sync the config - * cache if this is a booting rootpool). - */ -void -spa_config_update(spa_t *spa, int what) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t txg; - int c; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - txg = spa_last_synced_txg(spa) + 1; - if (what == SPA_CONFIG_UPDATE_POOL) { - vdev_config_dirty(rvd); - } else { - /* - * If we have top-level vdevs that were added but have - * not yet been prepared for allocation, do that now. - * (It's safe now because the config cache is up to date, - * so it will be able to translate the new DVAs.) - * See comments in spa_vdev_add() for full details. - */ - for (c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - - /* - * Explicitly skip vdevs that are indirect or - * log vdevs that are being removed. The reason - * is that both of those can have vdev_ms_array - * set to 0 and we wouldn't want to change their - * metaslab size nor call vdev_expand() on them. - */ - if (!vdev_is_concrete(tvd) || - (tvd->vdev_islog && tvd->vdev_removing)) - continue; - - if (tvd->vdev_ms_array == 0) { - vdev_ashift_optimize(tvd); - vdev_metaslab_set_size(tvd); - } - vdev_expand(tvd, txg); - } - } - spa_config_exit(spa, SCL_ALL, FTAG); - - /* - * Wait for the mosconfig to be regenerated and synced. - */ - txg_wait_synced(spa->spa_dsl_pool, txg); - - /* - * Update the global config cache to reflect the new mosconfig. - */ - spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); - - if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. - */ - -/* - * Routines to manage the on-disk persistent error log. - * - * Each pool stores a log of all logical data errors seen during normal - * operation. This is actually the union of two distinct logs: the last log, - * and the current log. All errors seen are logged to the current log. When a - * scrub completes, the current log becomes the last log, the last log is thrown - * out, and the current log is reinitialized. This way, if an error is somehow - * corrected, a new scrub will show that that it no longer exists, and will be - * deleted from the log when the scrub completes. - * - * The log is stored using a ZAP object whose key is a string form of the - * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an - * optional 'objset:object' human-readable string describing the data. When an - * error is first logged, this string will be empty, indicating that no name is - * known. This prevents us from having to issue a potentially large amount of - * I/O to discover the object name during an error path. Instead, we do the - * calculation when the data is requested, storing the result so future queries - * will be faster. - * - * This log is then shipped into an nvlist where the key is the dataset name and - * the value is the object name. Userland is then responsible for uniquifying - * this list and displaying it to the user. - */ - -#include -#include -#include -#include -#include - - -/* - * Convert a bookmark to a string. - */ -static void -bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) -{ - (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", - (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); -} - -/* - * Convert a string to a bookmark - */ -#ifdef _KERNEL -static void -name_to_bookmark(char *buf, zbookmark_phys_t *zb) -{ - zb->zb_objset = zfs_strtonum(buf, &buf); - ASSERT(*buf == ':'); - zb->zb_object = zfs_strtonum(buf + 1, &buf); - ASSERT(*buf == ':'); - zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); - ASSERT(*buf == ':'); - zb->zb_blkid = zfs_strtonum(buf + 1, &buf); - ASSERT(*buf == '\0'); -} -#endif - -/* - * Log an uncorrectable error to the persistent error log. We add it to the - * spa's list of pending errors. The changes are actually synced out to disk - * during spa_errlog_sync(). - */ -void -spa_log_error(spa_t *spa, zio_t *zio) -{ - zbookmark_phys_t *zb = &zio->io_logical->io_bookmark; - spa_error_entry_t search; - spa_error_entry_t *new; - avl_tree_t *tree; - avl_index_t where; - - /* - * If we are trying to import a pool, ignore any errors, as we won't be - * writing to the pool any time soon. - */ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) - return; - - mutex_enter(&spa->spa_errlist_lock); - - /* - * If we have had a request to rotate the log, log it to the next list - * instead of the current one. 
- */ - if (spa->spa_scrub_active || spa->spa_scrub_finished) - tree = &spa->spa_errlist_scrub; - else - tree = &spa->spa_errlist_last; - - search.se_bookmark = *zb; - if (avl_find(tree, &search, &where) != NULL) { - mutex_exit(&spa->spa_errlist_lock); - return; - } - - new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); - new->se_bookmark = *zb; - avl_insert(tree, new, where); - - mutex_exit(&spa->spa_errlist_lock); -} - -/* - * Return the number of errors currently in the error log. This is actually the - * sum of both the last log and the current log, since we don't know the union - * of these logs until we reach userland. - */ -uint64_t -spa_get_errlog_size(spa_t *spa) -{ - uint64_t total = 0, count; - - mutex_enter(&spa->spa_errlog_lock); - if (spa->spa_errlog_scrub != 0 && - zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, - &count) == 0) - total += count; - - if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && - zap_count(spa->spa_meta_objset, spa->spa_errlog_last, - &count) == 0) - total += count; - mutex_exit(&spa->spa_errlog_lock); - - mutex_enter(&spa->spa_errlist_lock); - total += avl_numnodes(&spa->spa_errlist_last); - total += avl_numnodes(&spa->spa_errlist_scrub); - mutex_exit(&spa->spa_errlist_lock); - - return (total); -} - -#ifdef _KERNEL -static int -process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) -{ - zap_cursor_t zc; - zap_attribute_t za; - zbookmark_phys_t zb; - - if (obj == 0) - return (0); - - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - - if (*count == 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(ENOMEM)); - } - - name_to_bookmark(za.za_name, &zb); - - if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(EFAULT)); - } - - *count -= 1; - } - - zap_cursor_fini(&zc); - - return (0); -} - -static int -process_error_list(avl_tree_t *list, void *addr, size_t *count) -{ - spa_error_entry_t *se; - - for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { - - if (*count == 0) - return (SET_ERROR(ENOMEM)); - - if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) - return (SET_ERROR(EFAULT)); - - *count -= 1; - } - - return (0); -} -#endif - -/* - * Copy all known errors to userland as an array of bookmarks. This is - * actually a union of the on-disk last log and current log, as well as any - * pending error requests. - * - * Because the act of reading the on-disk log could cause errors to be - * generated, we have two separate locks: one for the error log and one for the - * in-core error lists. We only need the error list lock to log and error, so - * we grab the error log lock while we read the on-disk logs, and only pick up - * the error list lock when we are finished. 
- */ -int -spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) -{ - int ret = 0; - -#ifdef _KERNEL - mutex_enter(&spa->spa_errlog_lock); - - ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); - - if (!ret && !spa->spa_scrub_finished) - ret = process_error_log(spa, spa->spa_errlog_last, uaddr, - count); - - mutex_enter(&spa->spa_errlist_lock); - if (!ret) - ret = process_error_list(&spa->spa_errlist_scrub, uaddr, - count); - if (!ret) - ret = process_error_list(&spa->spa_errlist_last, uaddr, - count); - mutex_exit(&spa->spa_errlist_lock); - - mutex_exit(&spa->spa_errlog_lock); -#endif - - return (ret); -} - -/* - * Called when a scrub completes. This simply set a bit which tells which AVL - * tree to add new errors. spa_errlog_sync() is responsible for actually - * syncing the changes to the underlying objects. - */ -void -spa_errlog_rotate(spa_t *spa) -{ - mutex_enter(&spa->spa_errlist_lock); - spa->spa_scrub_finished = B_TRUE; - mutex_exit(&spa->spa_errlist_lock); -} - -/* - * Discard any pending errors from the spa_t. Called when unloading a faulted - * pool, as the errors encountered during the open cannot be synced to disk. - */ -void -spa_errlog_drain(spa_t *spa) -{ - spa_error_entry_t *se; - void *cookie; - - mutex_enter(&spa->spa_errlist_lock); - - cookie = NULL; - while ((se = avl_destroy_nodes(&spa->spa_errlist_last, - &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); - cookie = NULL; - while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, - &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); - - mutex_exit(&spa->spa_errlist_lock); -} - -/* - * Process a list of errors into the current on-disk log. - */ -static void -sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) -{ - spa_error_entry_t *se; - char buf[64]; - void *cookie; - - if (avl_numnodes(t) != 0) { - /* create log if necessary */ - if (*obj == 0) - *obj = zap_create(spa->spa_meta_objset, - DMU_OT_ERROR_LOG, DMU_OT_NONE, - 0, tx); - - /* add errors to the current log */ - for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { - char *name = se->se_name ? se->se_name : ""; - - bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); - - (void) zap_update(spa->spa_meta_objset, - *obj, buf, 1, strlen(name) + 1, name, tx); - } - - /* purge the error list */ - cookie = NULL; - while ((se = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); - } -} - -/* - * Sync the error log out to disk. This is a little tricky because the act of - * writing the error log requires the spa_errlist_lock. So, we need to lock the - * error lists, take a copy of the lists, and then reinitialize them. Then, we - * drop the error list lock and take the error log lock, at which point we - * do the errlog processing. Then, if we encounter an I/O error during this - * process, we can successfully add the error to the list. Note that this will - * result in the perpetual recycling of errors, but it is an unlikely situation - * and not a performance critical operation. - */ -void -spa_errlog_sync(spa_t *spa, uint64_t txg) -{ - dmu_tx_t *tx; - avl_tree_t scrub, last; - int scrub_finished; - - mutex_enter(&spa->spa_errlist_lock); - - /* - * Bail out early under normal circumstances. 
- */ - if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && - avl_numnodes(&spa->spa_errlist_last) == 0 && - !spa->spa_scrub_finished) { - mutex_exit(&spa->spa_errlist_lock); - return; - } - - spa_get_errlists(spa, &last, &scrub); - scrub_finished = spa->spa_scrub_finished; - spa->spa_scrub_finished = B_FALSE; - - mutex_exit(&spa->spa_errlist_lock); - mutex_enter(&spa->spa_errlog_lock); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - /* - * Sync out the current list of errors. - */ - sync_error_list(spa, &last, &spa->spa_errlog_last, tx); - - /* - * Rotate the log if necessary. - */ - if (scrub_finished) { - if (spa->spa_errlog_last != 0) - VERIFY(dmu_object_free(spa->spa_meta_objset, - spa->spa_errlog_last, tx) == 0); - spa->spa_errlog_last = spa->spa_errlog_scrub; - spa->spa_errlog_scrub = 0; - - sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); - } - - /* - * Sync out any pending scrub errors. - */ - sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); - - /* - * Update the MOS to reflect the new values. - */ - (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, - &spa->spa_errlog_last, tx); - (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, - &spa->spa_errlog_scrub, tx); - - dmu_tx_commit(tx); - - mutex_exit(&spa->spa_errlog_lock); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ /dev/null @@ -1,628 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Joyent, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zfs_comutil.h" -#ifdef _KERNEL -#include -#include -#endif - -/* - * Routines to manage the on-disk history log. - * - * The history log is stored as a dmu object containing - * tuples. - * - * Where "record nvlist" is a nvlist containing uint64_ts and strings, and - * "packed record length" is the packed length of the "record nvlist" stored - * as a little endian uint64_t. - * - * The log is implemented as a ring buffer, though the original creation - * of the pool ('zpool create') is never overwritten. - * - * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer - * of 'spa_history' stores the offsets for logging/retrieving history as - * 'spa_history_phys_t'. 
'sh_pool_create_len' is the ending offset in bytes of - * where the 'zpool create' record is stored. This allows us to never - * overwrite the original creation of the pool. 'sh_phys_max_off' is the - * physical ending offset in bytes of the log. This tells you the length of - * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record - * is added, 'sh_eof' is incremented by the the size of the record. - * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). - * This is where the consumer should start reading from after reading in - * the 'zpool create' portion of the log. - * - * 'sh_records_lost' keeps track of how many records have been overwritten - * and permanently lost. - */ - -/* convert a logical offset to physical */ -static uint64_t -spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) -{ - uint64_t phys_len; - - phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; - return ((log_off - shpp->sh_pool_create_len) % phys_len - + shpp->sh_pool_create_len); -} - -void -spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) -{ - dmu_buf_t *dbp; - spa_history_phys_t *shpp; - objset_t *mos = spa->spa_meta_objset; - - ASSERT(spa->spa_history == 0); - spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, - SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, - sizeof (spa_history_phys_t), tx); - - VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_HISTORY, sizeof (uint64_t), 1, - &spa->spa_history, tx) == 0); - - VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); - - shpp = dbp->db_data; - dmu_buf_will_dirty(dbp, tx); - - /* - * Figure out maximum size of history log. We set it at - * 0.1% of pool size, with a max of 1G and min of 128KB. - */ - shpp->sh_phys_max_off = - metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; - shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); - shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); - - dmu_buf_rele(dbp, FTAG); -} - -/* - * Change 'sh_bof' to the beginning of the next record. 
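As a worked example of the logical-to-physical mapping performed by spa_history_log_to_phys() above (numbers chosen purely for illustration): with sh_pool_create_len = 1024 and sh_phys_max_off = 10240, phys_len is 9216 bytes, so logical offset 19456 maps to ((19456 - 1024) % 9216) + 1024 = 1024. The write wraps around to just past the preserved 'zpool create' record, which is why that record is never overwritten even as the ring buffer recycles older entries.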
- */ -static int -spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) -{ - objset_t *mos = spa->spa_meta_objset; - uint64_t firstread, reclen, phys_bof; - char buf[sizeof (reclen)]; - int err; - - phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); - firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); - - if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf, DMU_READ_PREFETCH)) != 0) - return (err); - if (firstread != sizeof (reclen)) { - if ((err = dmu_read(mos, spa->spa_history, - shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread, DMU_READ_PREFETCH)) != 0) - return (err); - } - - reclen = LE_64(*((uint64_t *)buf)); - shpp->sh_bof += reclen + sizeof (reclen); - shpp->sh_records_lost++; - return (0); -} - -static int -spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, - dmu_tx_t *tx) -{ - uint64_t firstwrite, phys_eof; - objset_t *mos = spa->spa_meta_objset; - int err; - - ASSERT(MUTEX_HELD(&spa->spa_history_lock)); - - /* see if we need to reset logical BOF */ - while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - - (shpp->sh_eof - shpp->sh_bof) <= len) { - if ((err = spa_history_advance_bof(spa, shpp)) != 0) { - return (err); - } - } - - phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); - firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); - shpp->sh_eof += len; - dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); - - len -= firstwrite; - if (len > 0) { - /* write out the rest at the beginning of physical file */ - dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, - len, (char *)buf + firstwrite, tx); - } - - return (0); -} - -static char * -spa_history_zone(void) -{ -#ifdef _KERNEL - /* XXX: pr_hostname can be changed by default from within a jail! */ - if (jailed(curthread->td_ucred)) - return (curthread->td_ucred->cr_prison->pr_hostname); -#endif - return (NULL); -} - -/* - * Post a history sysevent. - * - * The nvlist_t* passed into this function will be transformed into a new - * nvlist where: - * - * 1. Nested nvlists will be flattened to a single level - * 2. Keys will have their names normalized (to remove any problematic - * characters, such as whitespace) - * - * The nvlist_t passed into this function will duplicated and should be freed - * by caller. 
- * - */ -static void -spa_history_log_notify(spa_t *spa, nvlist_t *nvl) -{ - nvlist_t *hist_nvl = fnvlist_alloc(); - uint64_t uint64; - char *string; - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string); - - if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) - fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64); - - if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0) - fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64); - - spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT); - - nvlist_free(hist_nvl); -} - -/* - * Write out a history event. - */ -/*ARGSUSED*/ -static void -spa_history_log_sync(void *arg, dmu_tx_t *tx) -{ - nvlist_t *nvl = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = spa->spa_meta_objset; - dmu_buf_t *dbp; - spa_history_phys_t *shpp; - size_t reclen; - uint64_t le_len; - char *record_packed = NULL; - int ret; - - /* - * If we have an older pool that doesn't have a command - * history object, create it now. - */ - mutex_enter(&spa->spa_history_lock); - if (!spa->spa_history) - spa_history_create_obj(spa, tx); - mutex_exit(&spa->spa_history_lock); - - /* - * Get the offset of where we need to write via the bonus buffer. - * Update the offset when the write completes. 
- */ - VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - shpp = dbp->db_data; - - dmu_buf_will_dirty(dbp, tx); - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbp, &doi); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); - } -#endif - - fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); -#ifdef _KERNEL - fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename); -#endif - if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { - zfs_dbgmsg("command: %s", - fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD)); - } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { - if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { - zfs_dbgmsg("txg %lld %s %s (id %llu) %s", - fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), - fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), - fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), - fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), - fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); - } else { - zfs_dbgmsg("txg %lld %s %s", - fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), - fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), - fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); - } - /* - * The history sysevent is posted only for internal history - * messages to show what has happened, not how it happened. For - * example, the following command: - * - * # zfs destroy -r tank/foo - * - * will result in one sysevent posted per dataset that is - * destroyed as a result of the command - which could be more - * than one event in total. By contrast, if the sysevent was - * posted as a result of the ZPOOL_HIST_CMD key being present - * it would result in only one sysevent being posted with the - * full command line arguments, requiring the consumer to know - * how to parse and understand zfs(1M) command invocations. - */ - spa_history_log_notify(spa, nvl); - } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { - zfs_dbgmsg("ioctl %s", - fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); - } - - record_packed = fnvlist_pack(nvl, &reclen); - - mutex_enter(&spa->spa_history_lock); - - /* write out the packed length as little endian */ - le_len = LE_64((uint64_t)reclen); - ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); - if (!ret) - ret = spa_history_write(spa, record_packed, reclen, shpp, tx); - - /* The first command is the create, which we keep forever */ - if (ret == 0 && shpp->sh_pool_create_len == 0 && - nvlist_exists(nvl, ZPOOL_HIST_CMD)) { - shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; - } - - mutex_exit(&spa->spa_history_lock); - fnvlist_pack_free(record_packed, reclen); - dmu_buf_rele(dbp, FTAG); - fnvlist_free(nvl); -} - -/* - * Write out a history event. 
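On disk, each history entry written by spa_history_log_sync() is an 8-byte little-endian length followed by the packed nvlist. A minimal consumer-side sketch for walking a buffer returned by spa_history_get(); buf and len are assumed to have been filled in by that call:

	uint64_t off = 0;

	while (off + sizeof (uint64_t) <= len) {
		uint64_t reclen;
		nvlist_t *rec;

		/* The record length is stored little-endian. */
		bcopy(buf + off, &reclen, sizeof (reclen));
		reclen = LE_64(reclen);
		off += sizeof (reclen);
		if (reclen == 0 || off + reclen > len)
			break;		/* partial record at the end of the chunk */

		/* The payload is the packed nvlist produced by fnvlist_pack(). */
		if (nvlist_unpack(buf + off, reclen, &rec, 0) == 0) {
			/* ... inspect ZPOOL_HIST_CMD, ZPOOL_HIST_TXG, etc. ... */
			nvlist_free(rec);
		}
		off += reclen;
	}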
- */ -int -spa_history_log(spa_t *spa, const char *msg) -{ - int err; - nvlist_t *nvl = fnvlist_alloc(); - - fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); - err = spa_history_log_nvl(spa, nvl); - fnvlist_free(nvl); - return (err); -} - -int -spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) -{ - int err = 0; - dmu_tx_t *tx; - nvlist_t *nvarg; - - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) - return (EINVAL); - - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) - return (SET_ERROR(EINVAL)); - - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - - nvarg = fnvlist_dup(nvl); - if (spa_history_zone() != NULL) { - fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, - spa_history_zone()); - } - fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); - - /* Kick this off asynchronously; errors are ignored. */ - dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, - nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); - dmu_tx_commit(tx); - - /* spa_history_log_sync will free nvl */ - return (err); - -} - -/* - * Read out the command history. - */ -int -spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) -{ - objset_t *mos = spa->spa_meta_objset; - dmu_buf_t *dbp; - uint64_t read_len, phys_read_off, phys_eof; - uint64_t leftover = 0; - spa_history_phys_t *shpp; - int err; - - /* - * If the command history doesn't exist (older pool), - * that's ok, just return ENOENT. - */ - if (!spa->spa_history) - return (SET_ERROR(ENOENT)); - - /* - * The history is logged asynchronously, so when they request - * the first chunk of history, make sure everything has been - * synced to disk so that we get it. - */ - if (*offp == 0 && spa_writeable(spa)) - txg_wait_synced(spa_get_dsl(spa), 0); - - if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) - return (err); - shpp = dbp->db_data; - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbp, &doi); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); - } -#endif - - mutex_enter(&spa->spa_history_lock); - phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); - - if (*offp < shpp->sh_pool_create_len) { - /* read in just the zpool create history */ - phys_read_off = *offp; - read_len = MIN(*len, shpp->sh_pool_create_len - - phys_read_off); - } else { - /* - * Need to reset passed in offset to BOF if the passed in - * offset has since been overwritten. - */ - *offp = MAX(*offp, shpp->sh_bof); - phys_read_off = spa_history_log_to_phys(*offp, shpp); - - /* - * Read up to the minimum of what the user passed down or - * the EOF (physical or logical). If we hit physical EOF, - * use 'leftover' to read from the physical BOF. 
- */ - if (phys_read_off <= phys_eof) { - read_len = MIN(*len, phys_eof - phys_read_off); - } else { - read_len = MIN(*len, - shpp->sh_phys_max_off - phys_read_off); - if (phys_read_off + *len > shpp->sh_phys_max_off) { - leftover = MIN(*len - read_len, - phys_eof - shpp->sh_pool_create_len); - } - } - } - - /* offset for consumer to use next */ - *offp += read_len + leftover; - - /* tell the consumer how much you actually read */ - *len = read_len + leftover; - - if (read_len == 0) { - mutex_exit(&spa->spa_history_lock); - dmu_buf_rele(dbp, FTAG); - return (0); - } - - err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, - DMU_READ_PREFETCH); - if (leftover && err == 0) { - err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len, DMU_READ_PREFETCH); - } - mutex_exit(&spa->spa_history_lock); - - dmu_buf_rele(dbp, FTAG); - return (err); -} - -/* - * The nvlist will be consumed by this call. - */ -static void -log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, - dmu_tx_t *tx, const char *fmt, va_list adx) -{ - char *msg; - va_list adx2; - - /* - * If this is part of creating a pool, not everything is - * initialized yet, so don't bother logging the internal events. - * Likewise if the pool is not writeable. - */ - if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { - fnvlist_free(nvl); - return; - } - - va_copy(adx2, adx); - - msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); - (void) vsprintf(msg, fmt, adx2); - fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); - strfree(msg); - - va_end(adx2); - - fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); - fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); - - if (dmu_tx_is_syncing(tx)) { - spa_history_log_sync(nvl, tx); - } else { - dsl_sync_task_nowait(spa_get_dsl(spa), - spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); - } - /* spa_history_log_sync() will free nvl */ -} - -void -spa_history_log_internal(spa_t *spa, const char *operation, - dmu_tx_t *tx, const char *fmt, ...) -{ - dmu_tx_t *htx = tx; - va_list adx; - - /* create a tx if we didn't get one */ - if (tx == NULL) { - htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - if (dmu_tx_assign(htx, TXG_WAIT) != 0) { - dmu_tx_abort(htx); - return; - } - } - - va_start(adx, fmt); - log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx); - va_end(adx); - - /* if we didn't get a tx from the caller, commit the one we made */ - if (tx == NULL) - dmu_tx_commit(htx); -} - -void -spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, - dmu_tx_t *tx, const char *fmt, ...) -{ - va_list adx; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - nvlist_t *nvl = fnvlist_alloc(); - - ASSERT(tx != NULL); - - dsl_dataset_name(ds, namebuf); - fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); - fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); - - va_start(adx, fmt); - log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); - va_end(adx); -} - -void -spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, - dmu_tx_t *tx, const char *fmt, ...) 
-{ - va_list adx; - char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - nvlist_t *nvl = fnvlist_alloc(); - - ASSERT(tx != NULL); - - dsl_dir_name(dd, namebuf); - fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); - fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, - dsl_dir_phys(dd)->dd_head_dataset_obj); - - va_start(adx, fmt); - log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); - va_end(adx); -} - -void -spa_history_log_version(spa_t *spa, const char *operation) -{ - spa_history_log_internal(spa, operation, NULL, - "pool version %llu; software version %llu/%llu; uts %s %s %s %s", - (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, - utsname.nodename, utsname.release, utsname.version, - utsname.machine); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ /dev/null @@ -1,2523 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017 Datto Inc. - * Copyright (c) 2017, Intel Corporation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zfs_prop.h" -#include - -#if defined(__FreeBSD__) && defined(_KERNEL) -#include -#include -#endif - -/* - * SPA locking - * - * There are four basic locks for managing spa_t structures: - * - * spa_namespace_lock (global mutex) - * - * This lock must be acquired to do any of the following: - * - * - Lookup a spa_t by name - * - Add or remove a spa_t from the namespace - * - Increase spa_refcount from non-zero - * - Check if spa_refcount is zero - * - Rename a spa_t - * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export - * - * It does not need to handle recursion. A create or destroy may - * reference objects (files or zvols) in other pools, but by - * definition they must have an existing reference, and will never need - * to lookup a spa_t by name. - * - * spa_refcount (per-spa zfs_refcount_t protected by mutex) - * - * This reference count keep track of any active users of the spa_t. 
The - * spa_t cannot be destroyed or freed while this is non-zero. Internally, - * the refcount is never really 'zero' - opening a pool implicitly keeps - * some references in the DMU. Internally we check against spa_minref, but - * present the image of a zero/non-zero value to consumers. - * - * spa_config_lock[] (per-spa array of rwlocks) - * - * This protects the spa_t from config changes, and must be held in - * the following circumstances: - * - * - RW_READER to perform I/O to the spa - * - RW_WRITER to change the vdev config - * - * The locking order is fairly straightforward: - * - * spa_namespace_lock -> spa_refcount - * - * The namespace lock must be acquired to increase the refcount from 0 - * or to check if it is zero. - * - * spa_refcount -> spa_config_lock[] - * - * There must be at least one valid reference on the spa_t to acquire - * the config lock. - * - * spa_namespace_lock -> spa_config_lock[] - * - * The namespace lock must always be taken before the config lock. - * - * - * The spa_namespace_lock can be acquired directly and is globally visible. - * - * The namespace is manipulated using the following functions, all of which - * require the spa_namespace_lock to be held. - * - * spa_lookup() Lookup a spa_t by name. - * - * spa_add() Create a new spa_t in the namespace. - * - * spa_remove() Remove a spa_t from the namespace. This also - * frees up any memory associated with the spa_t. - * - * spa_next() Returns the next spa_t in the system, or the - * first if NULL is passed. - * - * spa_evict_all() Shutdown and remove all spa_t structures in - * the system. - * - * spa_guid_exists() Determine whether a pool/device guid exists. - * - * The spa_refcount is manipulated using the following functions: - * - * spa_open_ref() Adds a reference to the given spa_t. Must be - * called with spa_namespace_lock held if the - * refcount is currently zero. - * - * spa_close() Remove a reference from the spa_t. This will - * not free the spa_t or remove it from the - * namespace. No locking is required. - * - * spa_refcount_zero() Returns true if the refcount is currently - * zero. Must be called with spa_namespace_lock - * held. - * - * The spa_config_lock[] is an array of rwlocks, ordered as follows: - * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. - * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). - * - * To read the configuration, it suffices to hold one of these locks as reader. - * To modify the configuration, you must hold all locks as writer. To modify - * vdev state without altering the vdev tree's topology (e.g. online/offline), - * you must hold SCL_STATE and SCL_ZIO as writer. - * - * We use these distinct config locks to avoid recursive lock entry. - * For example, spa_sync() (which holds SCL_CONFIG as reader) induces - * block allocations (SCL_ALLOC), which may require reading space maps - * from disk (dmu_read() -> zio_read() -> SCL_ZIO). - * - * The spa config locks cannot be normal rwlocks because we need the - * ability to hand off ownership. For example, SCL_ZIO is acquired - * by the issuing thread and later released by an interrupt thread. - * They do, however, obey the usual write-wanted semantics to prevent - * writer (i.e. system administrator) starvation. - * - * The lock acquisition rules are as follows: - * - * SCL_CONFIG - * Protects changes to the vdev tree topology, such as vdev - * add/remove/attach/detach. Protects the dirty config list - * (spa_config_dirty_list) and the set of spares and l2arc devices. 
- * - * SCL_STATE - * Protects changes to pool state and vdev state, such as vdev - * online/offline/fault/degrade/clear. Protects the dirty state list - * (spa_state_dirty_list) and global pool state (spa_state). - * - * SCL_ALLOC - * Protects changes to metaslab groups and classes. - * Held as reader by metaslab_alloc() and metaslab_claim(). - * - * SCL_ZIO - * Held by bp-level zios (those which have no io_vd upon entry) - * to prevent changes to the vdev tree. The bp-level zio implicitly - * protects all of its vdev child zios, which do not hold SCL_ZIO. - * - * SCL_FREE - * Protects changes to metaslab groups and classes. - * Held as reader by metaslab_free(). SCL_FREE is distinct from - * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free - * blocks in zio_done() while another i/o that holds either - * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. - * - * SCL_VDEV - * Held as reader to prevent changes to the vdev tree during trivial - * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the - * other locks, and lower than all of them, to ensure that it's safe - * to acquire regardless of caller context. - * - * In addition, the following rules apply: - * - * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. - * The lock ordering is SCL_CONFIG > spa_props_lock. - * - * (b) I/O operations on leaf vdevs. For any zio operation that takes - * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), - * or zio_write_phys() -- the caller must ensure that the config cannot - * cannot change in the interim, and that the vdev cannot be reopened. - * SCL_STATE as reader suffices for both. - * - * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). - * - * spa_vdev_enter() Acquire the namespace lock and the config lock - * for writing. - * - * spa_vdev_exit() Release the config lock, wait for all I/O - * to complete, sync the updated configs to the - * cache, and release the namespace lock. - * - * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). - * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual - * locking is, always, based on spa_namespace_lock and spa_config_lock[]. - */ - -static avl_tree_t spa_namespace_avl; -kmutex_t spa_namespace_lock; -static kcondvar_t spa_namespace_cv; -static int spa_active_count; -int spa_max_replication_override = SPA_DVAS_PER_BP; - -static kmutex_t spa_spare_lock; -static avl_tree_t spa_spare_avl; -static kmutex_t spa_l2cache_lock; -static avl_tree_t spa_l2cache_avl; - -kmem_cache_t *spa_buffer_pool; -int spa_mode_global; - -#ifdef ZFS_DEBUG -/* - * Everything except dprintf, spa, and indirect_remap is on by default - * in debug builds. - */ -int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP); -#else -int zfs_flags = 0; -#endif - -/* - * zfs_recover can be set to nonzero to attempt to recover from - * otherwise-fatal errors, typically caused by on-disk corruption. When - * set, calls to zfs_panic_recover() will turn into warning messages. - * This should only be used as a last resort, as it typically results - * in leaked space, or worse. - */ -boolean_t zfs_recover = B_FALSE; - -/* - * If destroy encounters an EIO while reading metadata (e.g. indirect - * blocks), space referenced by the missing metadata can not be freed. - * Normally this causes the background destroy to become "stalled", as - * it is unable to make forward progress. 
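An illustrative caller-side sequence that respects the lock ordering documented above (spa_namespace_lock -> spa_refcount -> spa_config_lock[]); the pool name is a placeholder:

	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup("tank")) != NULL)
		spa_open_ref(spa, FTAG);	/* refcount may be zero here */
	mutex_exit(&spa_namespace_lock);

	if (spa != NULL) {
		/* With a reference held, the config lock may be taken. */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		/* ... examine the vdev tree ... */
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}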
While in this stalled state, - * all remaining space to free from the error-encountering filesystem is - * "temporarily leaked". Set this flag to cause it to ignore the EIO, - * permanently leak the space from indirect blocks that can not be read, - * and continue to free everything else that it can. - * - * The default, "stalling" behavior is useful if the storage partially - * fails (i.e. some but not all i/os fail), and then later recovers. In - * this case, we will be able to continue pool operations while it is - * partially failed, and when it recovers, we can continue to free the - * space, with no leaks. However, note that this case is actually - * fairly rare. - * - * Typically pools either (a) fail completely (but perhaps temporarily, - * e.g. a top-level vdev going offline), or (b) have localized, - * permanent errors (e.g. disk returns the wrong data due to bit flip or - * firmware bug). In case (a), this setting does not matter because the - * pool will be suspended and the sync thread will not be able to make - * forward progress regardless. In case (b), because the error is - * permanent, the best we can do is leak the minimum amount of space, - * which is what setting this flag will do. Therefore, it is reasonable - * for this flag to normally be set, but we chose the more conservative - * approach of not setting it, so that there is no possibility of - * leaking space in the "partial temporary" failure case. - */ -boolean_t zfs_free_leak_on_eio = B_FALSE; - -/* - * Expiration time in milliseconds. This value has two meanings. First it is - * used to determine when the spa_deadman() logic should fire. By default the - * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. - * Secondly, the value determines if an I/O is considered "hung". Any I/O that - * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting - * in a system panic. - */ -uint64_t zfs_deadman_synctime_ms = 1000000ULL; - -/* - * Check time in milliseconds. This defines the frequency at which we check - * for hung I/O. - */ -uint64_t zfs_deadman_checktime_ms = 5000ULL; - -/* - * Default value of -1 for zfs_deadman_enabled is resolved in - * zfs_deadman_init() - */ -int zfs_deadman_enabled = -1; - -/* - * The worst case is single-sector max-parity RAID-Z blocks, in which - * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) - * times the size; so just assume that. Add to this the fact that - * we can have up to 3 DVAs per bp, and one more factor of 2 because - * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, - * the worst case is: - * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 - */ -int spa_asize_inflation = 24; - -#if defined(__FreeBSD__) && defined(_KERNEL) -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0, - "Try to recover from otherwise-fatal errors."); - -static int -sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) -{ - int err, val; - - val = zfs_flags; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - /* - * ZFS_DEBUG_MODIFY must be enabled prior to boot so all - * arc buffers in the system have the necessary additional - * checksum data. However, it is safe to disable at any - * time. 
- */ - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - val &= ~ZFS_DEBUG_MODIFY; - zfs_flags = val; - - return (0); -} - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), - sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RWTUN, - &zfs_deadman_synctime_ms, 0, - "Stalled ZFS I/O expiration time in milliseconds"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RWTUN, - &zfs_deadman_checktime_ms, 0, - "Period of checks for stalled ZFS I/O in milliseconds"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RWTUN, - &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN, - &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes"); -#endif - -#ifndef illumos -#ifdef _KERNEL -static void -zfs_deadman_init() -{ - /* - * If we are not i386 or amd64 or in a virtual machine, - * disable ZFS deadman thread by default - */ - if (zfs_deadman_enabled == -1) { -#if defined(__amd64__) || defined(__i386__) - zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0; -#else - zfs_deadman_enabled = 0; -#endif - } -} -#endif /* _KERNEL */ -#endif /* !illumos */ - -/* - * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in - * the pool to be consumed. This ensures that we don't run the pool - * completely out of space, due to unaccounted changes (e.g. to the MOS). - * It also limits the worst-case time to allocate space. If we have - * less than this amount of free space, most ZPL operations (e.g. write, - * create) will return ENOSPC. - * - * Certain operations (e.g. file removal, most administrative actions) can - * use half the slop space. They will only return ENOSPC if less than half - * the slop space is free. Typically, once the pool has less than the slop - * space free, the user will use these operations to free up space in the pool. - * These are the operations that call dsl_pool_adjustedsize() with the netfree - * argument set to TRUE. - * - * Operations that are almost guaranteed to free up space in the absence of - * a pool checkpoint can use up to three quarters of the slop space - * (e.g zfs destroy). - * - * A very restricted set of operations are always permitted, regardless of - * the amount of free space. These are the operations that call - * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net - * increase in the amount of space used, it is possible to run the pool - * completely out of space, causing it to be permanently read-only. - * - * Note that on very small pools, the slop space will be larger than - * 3.2%, in an effort to have it be at least spa_min_slop (128MB), - * but we never allow it to be more than half the pool size. - * - * See also the comments in zfs_space_check_t. - */ -int spa_slop_shift = 5; -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN, - &spa_slop_shift, 0, - "Shift value of reserved space (1/(2^spa_slop_shift))."); -uint64_t spa_min_slop = 128 * 1024 * 1024; -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN, - &spa_min_slop, 0, - "Minimal value of reserved space"); - -int spa_allocators = 4; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_allocators, CTLFLAG_RWTUN, - &spa_allocators, 0, - "Number of allocators per metaslab group"); - -/*PRINTFLIKE2*/ -void -spa_load_failed(spa_t *spa, const char *fmt, ...) 
-{ - va_list adx; - char buf[256]; - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, - spa->spa_trust_config ? "trusted" : "untrusted", buf); -} - -/*PRINTFLIKE2*/ -void -spa_load_note(spa_t *spa, const char *fmt, ...) -{ - va_list adx; - char buf[256]; - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, - spa->spa_trust_config ? "trusted" : "untrusted", buf); -} - -/* - * By default dedup and user data indirects land in the special class - */ -int zfs_ddt_data_is_special = B_TRUE; -int zfs_user_indirect_is_special = B_TRUE; - -/* - * The percentage of special class final space reserved for metadata only. - * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only - * let metadata into the class. - */ -int zfs_special_class_metadata_reserve_pct = 25; - -#if defined(__FreeBSD__) && defined(_KERNEL) -SYSCTL_INT(_vfs_zfs, OID_AUTO, ddt_data_is_special, CTLFLAG_RWTUN, - &zfs_ddt_data_is_special, 0, - "Whether DDT data is eligible for the special class vdevs"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, user_indirect_is_special, CTLFLAG_RWTUN, - &zfs_user_indirect_is_special, 0, - "Whether indirect blocks are eligible for the special class vdevs"); -SYSCTL_INT(_vfs_zfs, OID_AUTO, special_class_metadata_reserve_pct, - CTLFLAG_RWTUN, &zfs_special_class_metadata_reserve_pct, 0, - "Percentage of space in the special class reserved solely for metadata"); -#endif - -/* - * ========================================================================== - * SPA config locking - * ========================================================================== - */ -static void -spa_config_lock_init(spa_t *spa) -{ - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); - zfs_refcount_create_untracked(&scl->scl_count); - scl->scl_writer = NULL; - scl->scl_write_wanted = 0; - } -} - -static void -spa_config_lock_destroy(spa_t *spa) -{ - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - mutex_destroy(&scl->scl_lock); - cv_destroy(&scl->scl_cv); - zfs_refcount_destroy(&scl->scl_count); - ASSERT(scl->scl_writer == NULL); - ASSERT(scl->scl_write_wanted == 0); - } -} - -int -spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) -{ - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (!(locks & (1 << i))) - continue; - mutex_enter(&scl->scl_lock); - if (rw == RW_READER) { - if (scl->scl_writer || scl->scl_write_wanted) { - mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks & ((1 << i) - 1), - tag); - return (0); - } - } else { - ASSERT(scl->scl_writer != curthread); - if (!zfs_refcount_is_zero(&scl->scl_count)) { - mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks & ((1 << i) - 1), - tag); - return (0); - } - scl->scl_writer = curthread; - } - (void) zfs_refcount_add(&scl->scl_count, tag); - mutex_exit(&scl->scl_lock); - } - return (1); -} - -void -spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) -{ - int wlocks_held = 0; - - ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); - - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (scl->scl_writer == curthread) - wlocks_held |= (1 << i); - if (!(locks & (1 << 
i))) - continue; - mutex_enter(&scl->scl_lock); - if (rw == RW_READER) { - while (scl->scl_writer || scl->scl_write_wanted) { - cv_wait(&scl->scl_cv, &scl->scl_lock); - } - } else { - ASSERT(scl->scl_writer != curthread); - while (!zfs_refcount_is_zero(&scl->scl_count)) { - scl->scl_write_wanted++; - cv_wait(&scl->scl_cv, &scl->scl_lock); - scl->scl_write_wanted--; - } - scl->scl_writer = curthread; - } - (void) zfs_refcount_add(&scl->scl_count, tag); - mutex_exit(&scl->scl_lock); - } - ASSERT3U(wlocks_held, <=, locks); -} - -void -spa_config_exit(spa_t *spa, int locks, void *tag) -{ - for (int i = SCL_LOCKS - 1; i >= 0; i--) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (!(locks & (1 << i))) - continue; - mutex_enter(&scl->scl_lock); - ASSERT(!zfs_refcount_is_zero(&scl->scl_count)); - if (zfs_refcount_remove(&scl->scl_count, tag) == 0) { - ASSERT(scl->scl_writer == NULL || - scl->scl_writer == curthread); - scl->scl_writer = NULL; /* OK in either case */ - cv_broadcast(&scl->scl_cv); - } - mutex_exit(&scl->scl_lock); - } -} - -int -spa_config_held(spa_t *spa, int locks, krw_t rw) -{ - int locks_held = 0; - - for (int i = 0; i < SCL_LOCKS; i++) { - spa_config_lock_t *scl = &spa->spa_config_lock[i]; - if (!(locks & (1 << i))) - continue; - if ((rw == RW_READER && - !zfs_refcount_is_zero(&scl->scl_count)) || - (rw == RW_WRITER && scl->scl_writer == curthread)) - locks_held |= 1 << i; - } - - return (locks_held); -} - -/* - * ========================================================================== - * SPA namespace functions - * ========================================================================== - */ - -/* - * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. - * Returns NULL if no matching spa_t is found. - */ -spa_t * -spa_lookup(const char *name) -{ - static spa_t search; /* spa_t is large; don't allocate on stack */ - spa_t *spa; - avl_index_t where; - char *cp; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); - - /* - * If it's a full dataset name, figure out the pool name and - * just use that. - */ - cp = strpbrk(search.spa_name, "/@#"); - if (cp != NULL) - *cp = '\0'; - - spa = avl_find(&spa_namespace_avl, &search, &where); - - return (spa); -} - -/* - * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. - * If the zfs_deadman_enabled flag is set then it inspects all vdev queues - * looking for potentially hung I/Os. - */ -static void -spa_deadman(void *arg, int pending) -{ - spa_t *spa = arg; - - /* - * Disable the deadman timer if the pool is suspended. - */ - if (spa_suspended(spa)) { -#ifdef illumos - VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); -#else - /* Nothing. just don't schedule any future callouts. */ -#endif - return; - } - - zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", - (gethrtime() - spa->spa_sync_starttime) / NANOSEC, - ++spa->spa_deadman_calls); - if (zfs_deadman_enabled) - vdev_deadman(spa->spa_root_vdev); -#ifdef __FreeBSD__ -#ifdef _KERNEL - callout_schedule(&spa->spa_deadman_cycid, - hz * zfs_deadman_checktime_ms / MILLISEC); -#endif -#endif -} - -#if defined(__FreeBSD__) && defined(_KERNEL) -static void -spa_deadman_timeout(void *arg) -{ - spa_t *spa = arg; - - taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task); -} -#endif - -/* - * Create an uninitialized spa_t with the given name. Requires - * spa_namespace_lock. 
The caller must ensure that the spa_t doesn't already - * exist by calling spa_lookup() first. - */ -spa_t * -spa_add(const char *name, nvlist_t *config, const char *altroot) -{ - spa_t *spa; - spa_config_dirent_t *dp; -#ifdef illumos - cyc_handler_t hdlr; - cyc_time_t when; -#endif - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); - - mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); - - cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); - - for (int t = 0; t < TXG_SIZE; t++) - bplist_create(&spa->spa_free_bplist[t]); - - (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); - spa->spa_state = POOL_STATE_UNINITIALIZED; - spa->spa_freeze_txg = UINT64_MAX; - spa->spa_final_txg = UINT64_MAX; - spa->spa_load_max_txg = UINT64_MAX; - spa->spa_proc = &p0; - spa->spa_proc_state = SPA_PROC_NONE; - spa->spa_trust_config = B_TRUE; - -#ifdef illumos - hdlr.cyh_func = spa_deadman; - hdlr.cyh_arg = spa; - hdlr.cyh_level = CY_LOW_LEVEL; -#endif - - spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); - -#ifdef illumos - /* - * This determines how often we need to check for hung I/Os after - * the cyclic has already fired. Since checking for hung I/Os is - * an expensive operation we don't want to check too frequently. - * Instead wait for 5 seconds before checking again. - */ - when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms); - when.cyt_when = CY_INFINITY; - mutex_enter(&cpu_lock); - spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); - mutex_exit(&cpu_lock); -#else /* !illumos */ -#ifdef _KERNEL - /* - * callout(9) does not provide a way to initialize a callout with - * a function and an argument, so we use callout_reset() to schedule - * the callout in the very distant future. Even if that event ever - * fires, it should be okayas we won't have any active zio-s. - * But normally spa_sync() will reschedule the callout with a proper - * timeout. - * callout(9) does not allow the callback function to sleep but - * vdev_deadman() needs to acquire vq_lock and illumos mutexes are - * emulated using sx(9). For this reason spa_deadman_timeout() - * will schedule spa_deadman() as task on a taskqueue that allows - * sleeping. - */ - TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa); - callout_init(&spa->spa_deadman_cycid, 1); - callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0, - spa_deadman_timeout, spa, 0); -#endif -#endif - zfs_refcount_create(&spa->spa_refcount); - spa_config_lock_init(spa); - - avl_add(&spa_namespace_avl, spa); - - /* - * Set the alternate root, if there is one. 
- */ - if (altroot) { - spa->spa_root = spa_strdup(altroot); - spa_active_count++; - } - - spa->spa_alloc_count = spa_allocators; - spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * - sizeof (kmutex_t), KM_SLEEP); - spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * - sizeof (avl_tree_t), KM_SLEEP); - for (int i = 0; i < spa->spa_alloc_count; i++) { - mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); - avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); - } - - /* - * Every pool starts with the default cachefile - */ - list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), - offsetof(spa_config_dirent_t, scd_link)); - - dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); - dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); - list_insert_head(&spa->spa_config_list, dp); - - VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - - if (config != NULL) { - nvlist_t *features; - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, - &features) == 0) { - VERIFY(nvlist_dup(features, &spa->spa_label_features, - 0) == 0); - } - - VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); - } - - if (spa->spa_label_features == NULL) { - VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - } - - spa->spa_min_ashift = INT_MAX; - spa->spa_max_ashift = 0; - - /* - * As a pool is being created, treat all features as disabled by - * setting SPA_FEATURE_DISABLED for all entries in the feature - * refcount cache. - */ - for (int i = 0; i < SPA_FEATURES; i++) { - spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; - } - - list_create(&spa->spa_leaf_list, sizeof (vdev_t), - offsetof(vdev_t, vdev_leaf_node)); - - return (spa); -} - -/* - * Removes a spa_t from the namespace, freeing up any memory used. Requires - * spa_namespace_lock. This is called only after the spa_t has been closed and - * deactivated. 
- */ -void -spa_remove(spa_t *spa) -{ - spa_config_dirent_t *dp; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); - - nvlist_free(spa->spa_config_splitting); - - avl_remove(&spa_namespace_avl, spa); - cv_broadcast(&spa_namespace_cv); - - if (spa->spa_root) { - spa_strfree(spa->spa_root); - spa_active_count--; - } - - while ((dp = list_head(&spa->spa_config_list)) != NULL) { - list_remove(&spa->spa_config_list, dp); - if (dp->scd_path != NULL) - spa_strfree(dp->scd_path); - kmem_free(dp, sizeof (spa_config_dirent_t)); - } - - for (int i = 0; i < spa->spa_alloc_count; i++) { - avl_destroy(&spa->spa_alloc_trees[i]); - mutex_destroy(&spa->spa_alloc_locks[i]); - } - kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * - sizeof (kmutex_t)); - kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * - sizeof (avl_tree_t)); - - list_destroy(&spa->spa_config_list); - list_destroy(&spa->spa_leaf_list); - - nvlist_free(spa->spa_label_features); - nvlist_free(spa->spa_load_info); - nvlist_free(spa->spa_feat_stats); - spa_config_set(spa, NULL); - -#ifdef illumos - mutex_enter(&cpu_lock); - if (spa->spa_deadman_cycid != CYCLIC_NONE) - cyclic_remove(spa->spa_deadman_cycid); - mutex_exit(&cpu_lock); - spa->spa_deadman_cycid = CYCLIC_NONE; -#else /* !illumos */ -#ifdef _KERNEL - callout_drain(&spa->spa_deadman_cycid); - taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task); -#endif -#endif - - zfs_refcount_destroy(&spa->spa_refcount); - - spa_config_lock_destroy(spa); - - for (int t = 0; t < TXG_SIZE; t++) - bplist_destroy(&spa->spa_free_bplist[t]); - - zio_checksum_templates_free(spa); - - cv_destroy(&spa->spa_async_cv); - cv_destroy(&spa->spa_evicting_os_cv); - cv_destroy(&spa->spa_proc_cv); - cv_destroy(&spa->spa_scrub_io_cv); - cv_destroy(&spa->spa_suspend_cv); - - mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_errlist_lock); - mutex_destroy(&spa->spa_errlog_lock); - mutex_destroy(&spa->spa_evicting_os_lock); - mutex_destroy(&spa->spa_history_lock); - mutex_destroy(&spa->spa_proc_lock); - mutex_destroy(&spa->spa_props_lock); - mutex_destroy(&spa->spa_cksum_tmpls_lock); - mutex_destroy(&spa->spa_scrub_lock); - mutex_destroy(&spa->spa_suspend_lock); - mutex_destroy(&spa->spa_vdev_top_lock); - mutex_destroy(&spa->spa_feat_stats_lock); - - kmem_free(spa, sizeof (spa_t)); -} - -/* - * Given a pool, return the next pool in the namespace, or NULL if there is - * none. If 'prev' is NULL, return the first pool. - */ -spa_t * -spa_next(spa_t *prev) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - if (prev) - return (AVL_NEXT(&spa_namespace_avl, prev)); - else - return (avl_first(&spa_namespace_avl)); -} - -/* - * ========================================================================== - * SPA refcount functions - * ========================================================================== - */ - -/* - * Add a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. - */ -void -spa_open_ref(spa_t *spa, void *tag) -{ - ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); - (void) zfs_refcount_add(&spa->spa_refcount, tag); -} - -/* - * Remove a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. 
- */ -void -spa_close(spa_t *spa, void *tag) -{ - ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); - (void) zfs_refcount_remove(&spa->spa_refcount, tag); -} - -/* - * Remove a reference to the given spa_t held by a dsl dir that is - * being asynchronously released. Async releases occur from a taskq - * performing eviction of dsl datasets and dirs. The namespace lock - * isn't held and the hold by the object being evicted may contribute to - * spa_minref (e.g. dataset or directory released during pool export), - * so the asserts in spa_close() do not apply. - */ -void -spa_async_close(spa_t *spa, void *tag) -{ - (void) zfs_refcount_remove(&spa->spa_refcount, tag); -} - -/* - * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against spa_minref, which is the - * number of references acquired when opening a pool - */ -boolean_t -spa_refcount_zero(spa_t *spa) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); -} - -/* - * ========================================================================== - * SPA spare and l2cache tracking - * ========================================================================== - */ - -/* - * Hot spares and cache devices are tracked using the same code below, - * for 'auxiliary' devices. - */ - -typedef struct spa_aux { - uint64_t aux_guid; - uint64_t aux_pool; - avl_node_t aux_avl; - int aux_count; -} spa_aux_t; - -static inline int -spa_aux_compare(const void *a, const void *b) -{ - const spa_aux_t *sa = (const spa_aux_t *)a; - const spa_aux_t *sb = (const spa_aux_t *)b; - - return (AVL_CMP(sa->aux_guid, sb->aux_guid)); -} - -void -spa_aux_add(vdev_t *vd, avl_tree_t *avl) -{ - avl_index_t where; - spa_aux_t search; - spa_aux_t *aux; - - search.aux_guid = vd->vdev_guid; - if ((aux = avl_find(avl, &search, &where)) != NULL) { - aux->aux_count++; - } else { - aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); - aux->aux_guid = vd->vdev_guid; - aux->aux_count = 1; - avl_insert(avl, aux, where); - } -} - -void -spa_aux_remove(vdev_t *vd, avl_tree_t *avl) -{ - spa_aux_t search; - spa_aux_t *aux; - avl_index_t where; - - search.aux_guid = vd->vdev_guid; - aux = avl_find(avl, &search, &where); - - ASSERT(aux != NULL); - - if (--aux->aux_count == 0) { - avl_remove(avl, aux); - kmem_free(aux, sizeof (spa_aux_t)); - } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { - aux->aux_pool = 0ULL; - } -} - -boolean_t -spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) -{ - spa_aux_t search, *found; - - search.aux_guid = guid; - found = avl_find(avl, &search, NULL); - - if (pool) { - if (found) - *pool = found->aux_pool; - else - *pool = 0ULL; - } - - if (refcnt) { - if (found) - *refcnt = found->aux_count; - else - *refcnt = 0; - } - - return (found != NULL); -} - -void -spa_aux_activate(vdev_t *vd, avl_tree_t *avl) -{ - spa_aux_t search, *found; - avl_index_t where; - - search.aux_guid = vd->vdev_guid; - found = avl_find(avl, &search, &where); - ASSERT(found != NULL); - ASSERT(found->aux_pool == 0ULL); - - found->aux_pool = spa_guid(vd->vdev_spa); -} - -/* - * Spares are tracked globally due to the following constraints: - * - * - A spare may be part of multiple pools. - * - A spare may be added to a pool even if it's actively in use within - * another pool. - * - A spare in use in any pool can only be the source of a replacement if - * the target is a spare in the same pool. 
- * - * We keep track of all spares on the system through the use of a reference - * counted AVL tree. When a vdev is added as a spare, or used as a replacement - * spare, then we bump the reference count in the AVL tree. In addition, we set - * the 'vdev_isspare' member to indicate that the device is a spare (active or - * inactive). When a spare is made active (used to replace a device in the - * pool), we also keep track of which pool its been made a part of. - * - * The 'spa_spare_lock' protects the AVL tree. These functions are normally - * called under the spa_namespace lock as part of vdev reconfiguration. The - * separate spare lock exists for the status query path, which does not need to - * be completely consistent with respect to other vdev configuration changes. - */ - -static int -spa_spare_compare(const void *a, const void *b) -{ - return (spa_aux_compare(a, b)); -} - -void -spa_spare_add(vdev_t *vd) -{ - mutex_enter(&spa_spare_lock); - ASSERT(!vd->vdev_isspare); - spa_aux_add(vd, &spa_spare_avl); - vd->vdev_isspare = B_TRUE; - mutex_exit(&spa_spare_lock); -} - -void -spa_spare_remove(vdev_t *vd) -{ - mutex_enter(&spa_spare_lock); - ASSERT(vd->vdev_isspare); - spa_aux_remove(vd, &spa_spare_avl); - vd->vdev_isspare = B_FALSE; - mutex_exit(&spa_spare_lock); -} - -boolean_t -spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) -{ - boolean_t found; - - mutex_enter(&spa_spare_lock); - found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); - mutex_exit(&spa_spare_lock); - - return (found); -} - -void -spa_spare_activate(vdev_t *vd) -{ - mutex_enter(&spa_spare_lock); - ASSERT(vd->vdev_isspare); - spa_aux_activate(vd, &spa_spare_avl); - mutex_exit(&spa_spare_lock); -} - -/* - * Level 2 ARC devices are tracked globally for the same reasons as spares. - * Cache devices currently only support one pool per cache device, and so - * for these devices the aux reference count is currently unused beyond 1. - */ - -static int -spa_l2cache_compare(const void *a, const void *b) -{ - return (spa_aux_compare(a, b)); -} - -void -spa_l2cache_add(vdev_t *vd) -{ - mutex_enter(&spa_l2cache_lock); - ASSERT(!vd->vdev_isl2cache); - spa_aux_add(vd, &spa_l2cache_avl); - vd->vdev_isl2cache = B_TRUE; - mutex_exit(&spa_l2cache_lock); -} - -void -spa_l2cache_remove(vdev_t *vd) -{ - mutex_enter(&spa_l2cache_lock); - ASSERT(vd->vdev_isl2cache); - spa_aux_remove(vd, &spa_l2cache_avl); - vd->vdev_isl2cache = B_FALSE; - mutex_exit(&spa_l2cache_lock); -} - -boolean_t -spa_l2cache_exists(uint64_t guid, uint64_t *pool) -{ - boolean_t found; - - mutex_enter(&spa_l2cache_lock); - found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); - mutex_exit(&spa_l2cache_lock); - - return (found); -} - -void -spa_l2cache_activate(vdev_t *vd) -{ - mutex_enter(&spa_l2cache_lock); - ASSERT(vd->vdev_isl2cache); - spa_aux_activate(vd, &spa_l2cache_avl); - mutex_exit(&spa_l2cache_lock); -} - -/* - * ========================================================================== - * SPA vdev locking - * ========================================================================== - */ - -/* - * Lock the given spa_t for the purpose of adding or removing a vdev. - * Grabs the global spa_namespace_lock plus the spa config lock for writing. - * It returns the next transaction group for the spa_t. - */ -uint64_t -spa_vdev_enter(spa_t *spa) -{ - mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); - return (spa_vdev_config_enter(spa)); -} - -/* - * Internal implementation for spa_vdev_enter(). 
Used when a vdev - * operation requires multiple syncs (i.e. removing a device) while - * keeping the spa_namespace_lock held. - */ -uint64_t -spa_vdev_config_enter(spa_t *spa) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - - return (spa_last_synced_txg(spa) + 1); -} - -/* - * Used in combination with spa_vdev_config_enter() to allow the syncing - * of multiple transactions without releasing the spa_namespace_lock. - */ -void -spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - int config_changed = B_FALSE; - - ASSERT(txg > spa_last_synced_txg(spa)); - - spa->spa_pending_vdev = NULL; - - /* - * Reassess the DTLs. - */ - vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); - - if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { - config_changed = B_TRUE; - spa->spa_config_generation++; - } - - /* - * Verify the metaslab classes. - */ - ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); - - spa_config_exit(spa, SCL_ALL, spa); - - /* - * Panic the system if the specified tag requires it. This - * is useful for ensuring that configurations are updated - * transactionally. - */ - if (zio_injection_enabled) - zio_handle_panic_injection(spa, tag, 0); - - /* - * Note: this txg_wait_synced() is important because it ensures - * that there won't be more than one config change per txg. - * This allows us to use the txg as the generation number. - */ - if (error == 0) - txg_wait_synced(spa->spa_dsl_pool, txg); - - if (vd != NULL) { - ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); - if (vd->vdev_ops->vdev_op_leaf) { - mutex_enter(&vd->vdev_initialize_lock); - vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); - mutex_exit(&vd->vdev_initialize_lock); - } - - spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); - vdev_free(vd); - spa_config_exit(spa, SCL_ALL, spa); - } - - /* - * If the config changed, update the config cache. - */ - if (config_changed) - spa_write_cachefile(spa, B_FALSE, B_TRUE); -} - -/* - * Unlock the spa_t after adding or removing a vdev. Besides undoing the - * locking of spa_vdev_enter(), we also want make sure the transactions have - * synced to disk, and then update the global configuration cache with the new - * information. - */ -int -spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) -{ - spa_vdev_config_exit(spa, vd, txg, error, FTAG); - mutex_exit(&spa_namespace_lock); - mutex_exit(&spa->spa_vdev_top_lock); - - return (error); -} - -/* - * Lock the given spa_t for the purpose of changing vdev state. - */ -void -spa_vdev_state_enter(spa_t *spa, int oplocks) -{ - int locks = SCL_STATE_ALL | oplocks; - - /* - * Root pools may need to read of the underlying devfs filesystem - * when opening up a vdev. Unfortunately if we're holding the - * SCL_ZIO lock it will result in a deadlock when we try to issue - * the read from the root filesystem. Instead we "prefetch" - * the associated vnodes that we need prior to opening the - * underlying devices and cache them so that we can prevent - * any I/O when we are doing the actual open. 
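Callers of the pair above are expected to bracket vdev tree changes with spa_vdev_enter() and spa_vdev_exit(); a minimal sketch of that calling pattern, with the actual tree manipulation elided:

	uint64_t txg = spa_vdev_enter(spa);	/* namespace lock + SCL_ALL writer */
	int error = 0;

	/* ... add, remove or replace a vdev under the write lock ... */

	/* Drops the locks, waits for the txg to sync, updates the cachefile. */
	return (spa_vdev_exit(spa, NULL, txg, error));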
- */ - if (spa_is_root(spa)) { - int low = locks & ~(SCL_ZIO - 1); - int high = locks & ~low; - - spa_config_enter(spa, high, spa, RW_WRITER); - vdev_hold(spa->spa_root_vdev); - spa_config_enter(spa, low, spa, RW_WRITER); - } else { - spa_config_enter(spa, locks, spa, RW_WRITER); - } - spa->spa_vdev_locks = locks; -} - -int -spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) -{ - boolean_t config_changed = B_FALSE; - - if (vd != NULL || error == 0) - vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, - 0, 0, B_FALSE); - - if (vd != NULL) { - vdev_state_dirty(vd->vdev_top); - config_changed = B_TRUE; - spa->spa_config_generation++; - } - - if (spa_is_root(spa)) - vdev_rele(spa->spa_root_vdev); - - ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); - spa_config_exit(spa, spa->spa_vdev_locks, spa); - - /* - * If anything changed, wait for it to sync. This ensures that, - * from the system administrator's perspective, zpool(1M) commands - * are synchronous. This is important for things like zpool offline: - * when the command completes, you expect no further I/O from ZFS. - */ - if (vd != NULL) - txg_wait_synced(spa->spa_dsl_pool, 0); - - /* - * If the config changed, update the config cache. - */ - if (config_changed) { - mutex_enter(&spa_namespace_lock); - spa_write_cachefile(spa, B_FALSE, B_TRUE); - mutex_exit(&spa_namespace_lock); - } - - return (error); -} - -/* - * ========================================================================== - * Miscellaneous functions - * ========================================================================== - */ - -void -spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) -{ - if (!nvlist_exists(spa->spa_label_features, feature)) { - fnvlist_add_boolean(spa->spa_label_features, feature); - /* - * When we are creating the pool (tx_txg==TXG_INITIAL), we can't - * dirty the vdev config because lock SCL_CONFIG is not held. - * Thankfully, in this case we don't need to dirty the config - * because it will be written out anyway when we finish - * creating the pool. - */ - if (tx->tx_txg != TXG_INITIAL) - vdev_config_dirty(spa->spa_root_vdev); - } -} - -void -spa_deactivate_mos_feature(spa_t *spa, const char *feature) -{ - if (nvlist_remove_all(spa->spa_label_features, feature) == 0) - vdev_config_dirty(spa->spa_root_vdev); -} - -/* - * Return the spa_t associated with given pool_guid, if it exists. If - * device_guid is non-zero, determine whether the pool exists *and* contains - * a device with the specified device_guid. - */ -spa_t * -spa_by_guid(uint64_t pool_guid, uint64_t device_guid) -{ - spa_t *spa; - avl_tree_t *t = &spa_namespace_avl; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { - if (spa->spa_state == POOL_STATE_UNINITIALIZED) - continue; - if (spa->spa_root_vdev == NULL) - continue; - if (spa_guid(spa) == pool_guid) { - if (device_guid == 0) - break; - - if (vdev_lookup_by_guid(spa->spa_root_vdev, - device_guid) != NULL) - break; - - /* - * Check any devices we may be in the process of adding. - */ - if (spa->spa_pending_vdev) { - if (vdev_lookup_by_guid(spa->spa_pending_vdev, - device_guid) != NULL) - break; - } - } - } - - return (spa); -} - -/* - * Determine whether a pool with the given pool_guid exists. 
- */ -boolean_t -spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) -{ - return (spa_by_guid(pool_guid, device_guid) != NULL); -} - -char * -spa_strdup(const char *s) -{ - size_t len; - char *new; - - len = strlen(s); - new = kmem_alloc(len + 1, KM_SLEEP); - bcopy(s, new, len); - new[len] = '\0'; - - return (new); -} - -void -spa_strfree(char *s) -{ - kmem_free(s, strlen(s) + 1); -} - -uint64_t -spa_get_random(uint64_t range) -{ - uint64_t r; - - ASSERT(range != 0); - - if (range == 1) - return (0); - - (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); - - return (r % range); -} - -uint64_t -spa_generate_guid(spa_t *spa) -{ - uint64_t guid = spa_get_random(-1ULL); - - if (spa != NULL) { - while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); - } else { - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); - } - - return (guid); -} - -void -snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) -{ - char type[256]; - char *checksum = NULL; - char *compress = NULL; - - if (bp != NULL) { - if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { - dmu_object_byteswap_t bswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); - (void) snprintf(type, sizeof (type), "bswap %s %s", - DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? - "metadata" : "data", - dmu_ot_byteswap[bswap].ob_name); - } else { - (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, - sizeof (type)); - } - if (!BP_IS_EMBEDDED(bp)) { - checksum = - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; - } - compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; - } - - SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, - compress); -} - -void -spa_freeze(spa_t *spa) -{ - uint64_t freeze_txg = 0; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - if (spa->spa_freeze_txg == UINT64_MAX) { - freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; - spa->spa_freeze_txg = freeze_txg; - } - spa_config_exit(spa, SCL_ALL, FTAG); - if (freeze_txg != 0) - txg_wait_synced(spa_get_dsl(spa), freeze_txg); -} - -void -zfs_panic_recover(const char *fmt, ...) -{ - va_list adx; - - va_start(adx, fmt); - vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); - va_end(adx); -} - -/* - * This is a stripped-down version of strtoull, suitable only for converting - * lowercase hexadecimal numbers that don't overflow. 
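A usage note for the stripped-down hex parser defined just below: only lowercase hex digits are consumed, and the first non-hex character ends the conversion, with its position reported through nptr. The input strings here are illustrative only:

	char *end;
	uint64_t v;

	v = zfs_strtonum("1a2b", &end);		/* v == 0x1a2b, *end == '\0' */
	v = zfs_strtonum("ff/extra", &end);	/* v == 0xff, end points at "/extra" */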
- */ -uint64_t -zfs_strtonum(const char *str, char **nptr) -{ - uint64_t val = 0; - char c; - int digit; - - while ((c = *str) != '\0') { - if (c >= '0' && c <= '9') - digit = c - '0'; - else if (c >= 'a' && c <= 'f') - digit = 10 + c - 'a'; - else - break; - - val *= 16; - val += digit; - - str++; - } - - if (nptr) - *nptr = (char *)str; - - return (val); -} - -void -spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) -{ - /* - * We bump the feature refcount for each special vdev added to the pool - */ - ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); - spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); -} - -/* - * ========================================================================== - * Accessor functions - * ========================================================================== - */ - -boolean_t -spa_shutting_down(spa_t *spa) -{ - return (spa->spa_async_suspended); -} - -dsl_pool_t * -spa_get_dsl(spa_t *spa) -{ - return (spa->spa_dsl_pool); -} - -boolean_t -spa_is_initializing(spa_t *spa) -{ - return (spa->spa_is_initializing); -} - -boolean_t -spa_indirect_vdevs_loaded(spa_t *spa) -{ - return (spa->spa_indirect_vdevs_loaded); -} - -blkptr_t * -spa_get_rootblkptr(spa_t *spa) -{ - return (&spa->spa_ubsync.ub_rootbp); -} - -void -spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) -{ - spa->spa_uberblock.ub_rootbp = *bp; -} - -void -spa_altroot(spa_t *spa, char *buf, size_t buflen) -{ - if (spa->spa_root == NULL) - buf[0] = '\0'; - else - (void) strncpy(buf, spa->spa_root, buflen); -} - -int -spa_sync_pass(spa_t *spa) -{ - return (spa->spa_sync_pass); -} - -char * -spa_name(spa_t *spa) -{ - return (spa->spa_name); -} - -uint64_t -spa_guid(spa_t *spa) -{ - dsl_pool_t *dp = spa_get_dsl(spa); - uint64_t guid; - - /* - * If we fail to parse the config during spa_load(), we can go through - * the error path (which posts an ereport) and end up here with no root - * vdev. We stash the original pool guid in 'spa_config_guid' to handle - * this case. - */ - if (spa->spa_root_vdev == NULL) - return (spa->spa_config_guid); - - guid = spa->spa_last_synced_guid != 0 ? - spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; - - /* - * Return the most recently synced out guid unless we're - * in syncing context. - */ - if (dp && dsl_pool_sync_context(dp)) - return (spa->spa_root_vdev->vdev_guid); - else - return (guid); -} - -uint64_t -spa_load_guid(spa_t *spa) -{ - /* - * This is a GUID that exists solely as a reference for the - * purposes of the arc. It is generated at load time, and - * is never written to persistent storage. - */ - return (spa->spa_load_guid); -} - -uint64_t -spa_last_synced_txg(spa_t *spa) -{ - return (spa->spa_ubsync.ub_txg); -} - -uint64_t -spa_first_txg(spa_t *spa) -{ - return (spa->spa_first_txg); -} - -uint64_t -spa_syncing_txg(spa_t *spa) -{ - return (spa->spa_syncing_txg); -} - -/* - * Return the last txg where data can be dirtied. The final txgs - * will be used to just clear out any deferred frees that remain. - */ -uint64_t -spa_final_dirty_txg(spa_t *spa) -{ - return (spa->spa_final_txg - TXG_DEFER_SIZE); -} - -pool_state_t -spa_state(spa_t *spa) -{ - return (spa->spa_state); -} - -spa_load_state_t -spa_load_state(spa_t *spa) -{ - return (spa->spa_load_state); -} - -uint64_t -spa_freeze_txg(spa_t *spa) -{ - return (spa->spa_freeze_txg); -} - -/* ARGSUSED */ -uint64_t -spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) -{ - return (lsize * spa_asize_inflation); -} - -/* - * Return the amount of slop space in bytes. 
It is 1/32 of the pool (3.2%), - * or at least 128MB, unless that would cause it to be more than half the - * pool size. - * - * See the comment above spa_slop_shift for details. - */ -uint64_t -spa_get_slop_space(spa_t *spa) -{ - uint64_t space = spa_get_dspace(spa); - return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); -} - -uint64_t -spa_get_dspace(spa_t *spa) -{ - return (spa->spa_dspace); -} - -uint64_t -spa_get_checkpoint_space(spa_t *spa) -{ - return (spa->spa_checkpoint_info.sci_dspace); -} - -void -spa_update_dspace(spa_t *spa) -{ - spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + - ddt_get_dedup_dspace(spa); - if (spa->spa_vdev_removal != NULL) { - /* - * We can't allocate from the removing device, so - * subtract its size. This prevents the DMU/DSL from - * filling up the (now smaller) pool while we are in the - * middle of removing the device. - * - * Note that the DMU/DSL doesn't actually know or care - * how much space is allocated (it does its own tracking - * of how much space has been logically used). So it - * doesn't matter that the data we are moving may be - * allocated twice (on the old device and the new - * device). - */ - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vdev_t *vd = - vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - spa->spa_dspace -= spa_deflate(spa) ? - vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; - spa_config_exit(spa, SCL_VDEV, FTAG); - } -} - -/* - * Return the failure mode that has been set to this pool. The default - * behavior will be to block all I/Os when a complete failure occurs. - */ -uint8_t -spa_get_failmode(spa_t *spa) -{ - return (spa->spa_failmode); -} - -boolean_t -spa_suspended(spa_t *spa) -{ - return (spa->spa_suspended != ZIO_SUSPEND_NONE); -} - -uint64_t -spa_version(spa_t *spa) -{ - return (spa->spa_ubsync.ub_version); -} - -boolean_t -spa_deflate(spa_t *spa) -{ - return (spa->spa_deflate); -} - -metaslab_class_t * -spa_normal_class(spa_t *spa) -{ - return (spa->spa_normal_class); -} - -metaslab_class_t * -spa_log_class(spa_t *spa) -{ - return (spa->spa_log_class); -} - -metaslab_class_t * -spa_special_class(spa_t *spa) -{ - return (spa->spa_special_class); -} - -metaslab_class_t * -spa_dedup_class(spa_t *spa) -{ - return (spa->spa_dedup_class); -} - -/* - * Locate an appropriate allocation class - */ -metaslab_class_t * -spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, - uint_t level, uint_t special_smallblk) -{ - if (DMU_OT_IS_ZIL(objtype)) { - if (spa->spa_log_class->mc_groups != 0) - return (spa_log_class(spa)); - else - return (spa_normal_class(spa)); - } - - boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; - - if (DMU_OT_IS_DDT(objtype)) { - if (spa->spa_dedup_class->mc_groups != 0) - return (spa_dedup_class(spa)); - else if (has_special_class && zfs_ddt_data_is_special) - return (spa_special_class(spa)); - else - return (spa_normal_class(spa)); - } - - /* Indirect blocks for user data can land in special if allowed */ - if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { - if (has_special_class && zfs_user_indirect_is_special) - return (spa_special_class(spa)); - else - return (spa_normal_class(spa)); - } - - if (DMU_OT_IS_METADATA(objtype) || level > 0) { - if (has_special_class) - return (spa_special_class(spa)); - else - return (spa_normal_class(spa)); - } - - /* - * Allow small file blocks in special class in some cases (like - * for the dRAID vdev feature). 
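The slop calculation quoted above is terse, so a worked example may help. The following standalone sketch assumes the defaults implied by the comment, spa_slop_shift = 5 (1/32 of the pool) and spa_min_slop = 128MB; it only illustrates the formula and is not the in-tree code.

#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static const int spa_slop_shift = 5;			/* 1/32 of the pool */
static const uint64_t spa_min_slop = 128ULL << 20;	/* 128MB floor */

/* Mirrors the shape of spa_get_slop_space() shown above. */
static uint64_t
slop_space(uint64_t dspace)
{
	return (MAX(dspace >> spa_slop_shift,
	    MIN(dspace >> 1, spa_min_slop)));
}

int
main(void)
{
	printf("%ju\n", (uintmax_t)slop_space(10ULL << 40));	/* 10TB pool: 320GB slop */
	printf("%ju\n", (uintmax_t)slop_space(1ULL << 30));	/* 1GB pool: 128MB floor */
	printf("%ju\n", (uintmax_t)slop_space(100ULL << 20));	/* 100MB pool: 50MB, half the pool */
	return (0);
}

In other words, large pools reserve 1/32 of their deflated space, small pools fall back to the 128MB floor, and very small pools are capped at half of their own size.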
But always leave a reserve of - * zfs_special_class_metadata_reserve_pct exclusively for metadata. - */ - if (DMU_OT_IS_FILE(objtype) && - has_special_class && size <= special_smallblk) { - metaslab_class_t *special = spa_special_class(spa); - uint64_t alloc = metaslab_class_get_alloc(special); - uint64_t space = metaslab_class_get_space(special); - uint64_t limit = - (space * (100 - zfs_special_class_metadata_reserve_pct)) - / 100; - - if (alloc < limit) - return (special); - } - - return (spa_normal_class(spa)); -} - -void -spa_evicting_os_register(spa_t *spa, objset_t *os) -{ - mutex_enter(&spa->spa_evicting_os_lock); - list_insert_head(&spa->spa_evicting_os_list, os); - mutex_exit(&spa->spa_evicting_os_lock); -} - -void -spa_evicting_os_deregister(spa_t *spa, objset_t *os) -{ - mutex_enter(&spa->spa_evicting_os_lock); - list_remove(&spa->spa_evicting_os_list, os); - cv_broadcast(&spa->spa_evicting_os_cv); - mutex_exit(&spa->spa_evicting_os_lock); -} - -void -spa_evicting_os_wait(spa_t *spa) -{ - mutex_enter(&spa->spa_evicting_os_lock); - while (!list_is_empty(&spa->spa_evicting_os_list)) - cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); - mutex_exit(&spa->spa_evicting_os_lock); - - dmu_buf_user_evict_wait(); -} - -int -spa_max_replication(spa_t *spa) -{ - /* - * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to - * handle BPs with more than one DVA allocated. Set our max - * replication level accordingly. - */ - if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) - return (1); - return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); -} - -int -spa_prev_software_version(spa_t *spa) -{ - return (spa->spa_prev_software_version); -} - -uint64_t -spa_deadman_synctime(spa_t *spa) -{ - return (spa->spa_deadman_synctime); -} - -struct proc * -spa_proc(spa_t *spa) -{ - return (spa->spa_proc); -} - -uint64_t -dva_get_dsize_sync(spa_t *spa, const dva_t *dva) -{ - uint64_t asize = DVA_GET_ASIZE(dva); - uint64_t dsize = asize; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - if (asize != 0 && spa->spa_deflate) { - uint64_t vdev = DVA_GET_VDEV(dva); - vdev_t *vd = vdev_lookup_top(spa, vdev); - if (vd == NULL) { - panic( - "dva_get_dsize_sync(): bad DVA %llu:%llu", - (u_longlong_t)vdev, (u_longlong_t)asize); - } - dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; - } - - return (dsize); -} - -uint64_t -bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) -{ - uint64_t dsize = 0; - - for (int d = 0; d < BP_GET_NDVAS(bp); d++) - dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); - - return (dsize); -} - -uint64_t -bp_get_dsize(spa_t *spa, const blkptr_t *bp) -{ - uint64_t dsize = 0; - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (int d = 0; d < BP_GET_NDVAS(bp); d++) - dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); - - spa_config_exit(spa, SCL_VDEV, FTAG); - - return (dsize); -} - -uint64_t -spa_dirty_data(spa_t *spa) -{ - return (spa->spa_dsl_pool->dp_dirty_total); -} - -/* - * ========================================================================== - * Initialization and Termination - * ========================================================================== - */ - -static int -spa_name_compare(const void *a1, const void *a2) -{ - const spa_t *s1 = a1; - const spa_t *s2 = a2; - int s; - - s = strcmp(s1->spa_name, s2->spa_name); - - return (AVL_ISIGN(s)); -} - -int -spa_busy(void) -{ - return (spa_active_count); -} - -void -spa_boot_init() -{ - spa_config_load(); -} - -#ifdef _KERNEL -EVENTHANDLER_DEFINE(mountroot, 
spa_boot_init, NULL, 0); -#endif - -void -spa_init(int mode) -{ - mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); - - avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), - offsetof(spa_t, spa_avl)); - - avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), - offsetof(spa_aux_t, aux_avl)); - - avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), - offsetof(spa_aux_t, aux_avl)); - - spa_mode_global = mode; - -#ifdef illumos -#ifdef _KERNEL - spa_arch_init(); -#else - if (spa_mode_global != FREAD && dprintf_find_string("watch")) { - arc_procfd = open("/proc/self/ctl", O_WRONLY); - if (arc_procfd == -1) { - perror("could not enable watchpoints: " - "opening /proc/self/ctl failed: "); - } else { - arc_watch = B_TRUE; - } - } -#endif -#endif /* illumos */ - - zfs_refcount_init(); - unique_init(); - range_tree_init(); - metaslab_alloc_trace_init(); - zio_init(); - lz4_init(); - dmu_init(); - zil_init(); - vdev_cache_stat_init(); - vdev_file_init(); - zfs_prop_init(); - zpool_prop_init(); - zpool_feature_init(); - spa_config_load(); - l2arc_start(); - scan_init(); - dsl_scan_global_init(); -#ifndef illumos -#ifdef _KERNEL - zfs_deadman_init(); -#endif -#endif /* !illumos */ -} - -void -spa_fini(void) -{ - l2arc_stop(); - - spa_evict_all(); - - vdev_file_fini(); - vdev_cache_stat_fini(); - zil_fini(); - dmu_fini(); - lz4_fini(); - zio_fini(); - metaslab_alloc_trace_fini(); - range_tree_fini(); - unique_fini(); - zfs_refcount_fini(); - scan_fini(); - - avl_destroy(&spa_namespace_avl); - avl_destroy(&spa_spare_avl); - avl_destroy(&spa_l2cache_avl); - - cv_destroy(&spa_namespace_cv); - mutex_destroy(&spa_namespace_lock); - mutex_destroy(&spa_spare_lock); - mutex_destroy(&spa_l2cache_lock); -} - -/* - * Return whether this pool has slogs. No locking needed. - * It's not a problem if the wrong answer is returned as it's only for - * performance and not correctness - */ -boolean_t -spa_has_slogs(spa_t *spa) -{ - return (spa->spa_log_class->mc_rotor != NULL); -} - -spa_log_state_t -spa_get_log_state(spa_t *spa) -{ - return (spa->spa_log_state); -} - -void -spa_set_log_state(spa_t *spa, spa_log_state_t state) -{ - spa->spa_log_state = state; -} - -boolean_t -spa_is_root(spa_t *spa) -{ - return (spa->spa_is_root); -} - -boolean_t -spa_writeable(spa_t *spa) -{ - return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config); -} - -/* - * Returns true if there is a pending sync task in any of the current - * syncing txg, the current quiescing txg, or the current open txg. - */ -boolean_t -spa_has_pending_synctask(spa_t *spa) -{ - return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || - !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); -} - -int -spa_mode(spa_t *spa) -{ - return (spa->spa_mode); -} - -uint64_t -spa_bootfs(spa_t *spa) -{ - return (spa->spa_bootfs); -} - -uint64_t -spa_delegation(spa_t *spa) -{ - return (spa->spa_delegation); -} - -objset_t * -spa_meta_objset(spa_t *spa) -{ - return (spa->spa_meta_objset); -} - -enum zio_checksum -spa_dedup_checksum(spa_t *spa) -{ - return (spa->spa_dedup_checksum); -} - -/* - * Reset pool scan stat per scan pass (or reboot). 
- */ -void -spa_scan_stat_init(spa_t *spa) -{ - /* data not stored on disk */ - spa->spa_scan_pass_start = gethrestime_sec(); - if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) - spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; - else - spa->spa_scan_pass_scrub_pause = 0; - spa->spa_scan_pass_scrub_spent_paused = 0; - spa->spa_scan_pass_exam = 0; - spa->spa_scan_pass_issued = 0; - vdev_scan_stat_init(spa->spa_root_vdev); -} - -/* - * Get scan stats for zpool status reports - */ -int -spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) -{ - dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; - - if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) - return (SET_ERROR(ENOENT)); - bzero(ps, sizeof (pool_scan_stat_t)); - - /* data stored on disk */ - ps->pss_func = scn->scn_phys.scn_func; - ps->pss_state = scn->scn_phys.scn_state; - ps->pss_start_time = scn->scn_phys.scn_start_time; - ps->pss_end_time = scn->scn_phys.scn_end_time; - ps->pss_to_examine = scn->scn_phys.scn_to_examine; - ps->pss_to_process = scn->scn_phys.scn_to_process; - ps->pss_processed = scn->scn_phys.scn_processed; - ps->pss_errors = scn->scn_phys.scn_errors; - ps->pss_examined = scn->scn_phys.scn_examined; - ps->pss_issued = - scn->scn_issued_before_pass + spa->spa_scan_pass_issued; - /* data not stored on disk */ - ps->pss_pass_start = spa->spa_scan_pass_start; - ps->pss_pass_exam = spa->spa_scan_pass_exam; - ps->pss_pass_issued = spa->spa_scan_pass_issued; - ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; - ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; - - return (0); -} - -int -spa_maxblocksize(spa_t *spa) -{ - if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SPA_MAXBLOCKSIZE); - else - return (SPA_OLD_MAXBLOCKSIZE); -} - -int -spa_maxdnodesize(spa_t *spa) -{ - if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) - return (DNODE_MAX_SIZE); - else - return (DNODE_MIN_SIZE); -} - -boolean_t -spa_multihost(spa_t *spa) -{ - return (spa->spa_multihost ? B_TRUE : B_FALSE); -} - -unsigned long -spa_get_hostid(void) -{ - unsigned long myhostid; - -#ifdef _KERNEL - myhostid = zone_get_hostid(NULL); -#else /* _KERNEL */ - /* - * We're emulating the system's hostid in userland, so - * we can't use zone_get_hostid(). - */ - (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); -#endif /* _KERNEL */ - - return (myhostid); -} - -/* - * Returns the txg that the last device removal completed. No indirect mappings - * have been added since this txg. - */ -uint64_t -spa_get_last_removal_txg(spa_t *spa) -{ - uint64_t vdevid; - uint64_t ret = -1ULL; - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - /* - * sr_prev_indirect_vdev is only modified while holding all the - * config locks, so it is sufficient to hold SCL_VDEV as reader when - * examining it. - */ - vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; - - while (vdevid != -1ULL) { - vdev_t *vd = vdev_lookup_top(spa, vdevid); - vdev_indirect_births_t *vib = vd->vdev_indirect_births; - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - - /* - * If the removal did not remap any data, we don't care. 
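spa_maxblocksize() above simply gates the larger block size limit on the large_blocks feature. A caller that wants to clamp a requested block size could use a helper along the lines of the sketch below; the helper itself and the 128K/16M constants (standing in for SPA_OLD_MAXBLOCKSIZE and SPA_MAXBLOCKSIZE) are assumptions for illustration, not code from this change.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	OLD_MAXBLOCKSIZE	(128 * 1024)		/* assumed SPA_OLD_MAXBLOCKSIZE */
#define	NEW_MAXBLOCKSIZE	(16 * 1024 * 1024)	/* assumed SPA_MAXBLOCKSIZE */

/* Stand-in for spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS). */
static bool large_blocks_enabled = false;

/* Clamp a requested block size to what the pool currently supports. */
static uint64_t
clamp_blocksize(uint64_t requested)
{
	uint64_t limit = large_blocks_enabled ?
	    NEW_MAXBLOCKSIZE : OLD_MAXBLOCKSIZE;

	return (requested > limit ? limit : requested);
}

int
main(void)
{
	printf("%ju\n", (uintmax_t)clamp_blocksize(1ULL << 24));	/* clamped to 128K */
	return (0);
}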
- */ - if (vdev_indirect_births_count(vib) != 0) { - ret = vdev_indirect_births_last_entry_txg(vib); - break; - } - - vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; - } - spa_config_exit(spa, SCL_VDEV, FTAG); - - IMPLY(ret != -1ULL, - spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); - - return (ret); -} - -boolean_t -spa_trust_config(spa_t *spa) -{ - return (spa->spa_trust_config); -} - -uint64_t -spa_missing_tvds_allowed(spa_t *spa) -{ - return (spa->spa_missing_tvds_allowed); -} - -void -spa_set_missing_tvds(spa_t *spa, uint64_t missing) -{ - spa->spa_missing_tvds = missing; -} - -boolean_t -spa_top_vdevs_spacemap_addressable(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - for (uint64_t c = 0; c < rvd->vdev_children; c++) { - if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) - return (B_FALSE); - } - return (B_TRUE); -} - -boolean_t -spa_has_checkpoint(spa_t *spa) -{ - return (spa->spa_checkpoint_txg != 0); -} - -boolean_t -spa_importing_readonly_checkpoint(spa_t *spa) -{ - return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && - spa->spa_mode == FREAD); -} - -uint64_t -spa_min_claim_txg(spa_t *spa) -{ - uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; - - if (checkpoint_txg != 0) - return (checkpoint_txg + 1); - - return (spa->spa_first_txg); -} - -/* - * If there is a checkpoint, async destroys may consume more space from - * the pool instead of freeing it. In an attempt to save the pool from - * getting suspended when it is about to run out of space, we stop - * processing async destroys. - */ -boolean_t -spa_suspend_async_destroy(spa_t *spa) -{ - dsl_pool_t *dp = spa_get_dsl(spa); - - uint64_t unreserved = dsl_pool_unreserved_space(dp, - ZFS_SPACE_CHECK_EXTRA_RESERVED); - uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; - uint64_t avail = (unreserved > used) ? (unreserved - used) : 0; - - if (spa_has_checkpoint(spa) && avail == 0) - return (B_TRUE); - - return (B_FALSE); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ /dev/null @@ -1,1073 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); - -/* - * Note on space map block size: - * - * The data for a given space map can be kept on blocks of any size. 
- * Larger blocks entail fewer I/O operations, but they also cause the - * DMU to keep more data in-core, and also to waste more I/O bandwidth - * when only a few blocks have changed since the last transaction group. - */ - -/* - * Enabled whenever we want to stress test the use of double-word - * space map entries. - */ -boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; - -/* - * Override the default indirect block size of 128K, instead using 16K for - * spacemaps (2^14 bytes). This dramatically reduces write inflation since - * appending to a spacemap typically has to write one data block (4KB) and one - * or two indirect blocks (16K-32K, rather than 128K). - */ -int space_map_ibs = 14; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, - &space_map_ibs, 0, "Space map indirect block shift"); - -boolean_t -sm_entry_is_debug(uint64_t e) -{ - return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX); -} - -boolean_t -sm_entry_is_single_word(uint64_t e) -{ - uint8_t prefix = SM_PREFIX_DECODE(e); - return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX); -} - -boolean_t -sm_entry_is_double_word(uint64_t e) -{ - return (SM_PREFIX_DECODE(e) == SM2_PREFIX); -} - -/* - * Iterate through the space map, invoking the callback on each (non-debug) - * space map entry. Stop after reading 'end' bytes of the space map. - */ -int -space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) -{ - uint64_t blksz = sm->sm_blksz; - - ASSERT3U(blksz, !=, 0); - ASSERT3U(end, <=, space_map_length(sm)); - ASSERT0(P2PHASE(end, sizeof (uint64_t))); - - dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end, - ZIO_PRIORITY_SYNC_READ); - - int error = 0; - for (uint64_t block_base = 0; block_base < end && error == 0; - block_base += blksz) { - dmu_buf_t *db; - error = dmu_buf_hold(sm->sm_os, space_map_object(sm), - block_base, FTAG, &db, DMU_READ_PREFETCH); - if (error != 0) - return (error); - - uint64_t *block_start = db->db_data; - uint64_t block_length = MIN(end - block_base, blksz); - uint64_t *block_end = block_start + - (block_length / sizeof (uint64_t)); - - VERIFY0(P2PHASE(block_length, sizeof (uint64_t))); - VERIFY3U(block_length, !=, 0); - ASSERT3U(blksz, ==, db->db_size); - - for (uint64_t *block_cursor = block_start; - block_cursor < block_end && error == 0; block_cursor++) { - uint64_t e = *block_cursor; - - if (sm_entry_is_debug(e)) /* Skip debug entries */ - continue; - - uint64_t raw_offset, raw_run, vdev_id; - maptype_t type; - if (sm_entry_is_single_word(e)) { - type = SM_TYPE_DECODE(e); - vdev_id = SM_NO_VDEVID; - raw_offset = SM_OFFSET_DECODE(e); - raw_run = SM_RUN_DECODE(e); - } else { - /* it is a two-word entry */ - ASSERT(sm_entry_is_double_word(e)); - raw_run = SM2_RUN_DECODE(e); - vdev_id = SM2_VDEV_DECODE(e); - - /* move on to the second word */ - block_cursor++; - e = *block_cursor; - VERIFY3P(block_cursor, <=, block_end); - - type = SM2_TYPE_DECODE(e); - raw_offset = SM2_OFFSET_DECODE(e); - } - - uint64_t entry_offset = (raw_offset << sm->sm_shift) + - sm->sm_start; - uint64_t entry_run = raw_run << sm->sm_shift; - - VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); - ASSERT3U(entry_offset, >=, sm->sm_start); - ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size); - ASSERT3U(entry_run, <=, sm->sm_size); - ASSERT3U(entry_offset + entry_run, <=, - sm->sm_start + sm->sm_size); - - space_map_entry_t sme = { - .sme_type = type, - .sme_vdev = vdev_id, - .sme_offset = entry_offset, - .sme_run = entry_run 
- }; - error = callback(&sme, arg); - } - dmu_buf_rele(db, FTAG); - } - return (error); -} - -/* - * Reads the entries from the last block of the space map into - * buf in reverse order. Populates nwords with number of words - * in the last block. - * - * Refer to block comment within space_map_incremental_destroy() - * to understand why this function is needed. - */ -static int -space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, - uint64_t bufsz, uint64_t *nwords) -{ - int error = 0; - dmu_buf_t *db; - - /* - * Find the offset of the last word in the space map and use - * that to read the last block of the space map with - * dmu_buf_hold(). - */ - uint64_t last_word_offset = - sm->sm_phys->smp_length - sizeof (uint64_t); - error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, - FTAG, &db, DMU_READ_NO_PREFETCH); - if (error != 0) - return (error); - - ASSERT3U(sm->sm_object, ==, db->db_object); - ASSERT3U(sm->sm_blksz, ==, db->db_size); - ASSERT3U(bufsz, >=, db->db_size); - ASSERT(nwords != NULL); - - uint64_t *words = db->db_data; - *nwords = - (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); - - ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); - - uint64_t n = *nwords; - uint64_t j = n - 1; - for (uint64_t i = 0; i < n; i++) { - uint64_t entry = words[i]; - if (sm_entry_is_double_word(entry)) { - /* - * Since we are populating the buffer backwards - * we have to be extra careful and add the two - * words of the double-word entry in the right - * order. - */ - ASSERT3U(j, >, 0); - buf[j - 1] = entry; - - i++; - ASSERT3U(i, <, n); - entry = words[i]; - buf[j] = entry; - j -= 2; - } else { - ASSERT(sm_entry_is_debug(entry) || - sm_entry_is_single_word(entry)); - buf[j] = entry; - j--; - } - } - - /* - * Assert that we wrote backwards all the - * way to the beginning of the buffer. - */ - ASSERT3S(j, ==, -1); - - dmu_buf_rele(db, FTAG); - return (error); -} - -/* - * Note: This function performs destructive actions - specifically - * it deletes entries from the end of the space map. Thus, callers - * should ensure that they are holding the appropriate locks for - * the space map that they provide. - */ -int -space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, - dmu_tx_t *tx) -{ - uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); - uint64_t *buf = zio_buf_alloc(bufsz); - - dmu_buf_will_dirty(sm->sm_dbuf, tx); - - /* - * Ideally we would want to iterate from the beginning of the - * space map to the end in incremental steps. The issue with this - * approach is that we don't have any field on-disk that points - * us where to start between each step. We could try zeroing out - * entries that we've destroyed, but this doesn't work either as - * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]). - * - * As a result, we destroy its entries incrementally starting from - * the end after applying the callback to each of them. - * - * The problem with this approach is that we cannot literally - * iterate through the words in the space map backwards as we - * can't distinguish two-word space map entries from their second - * word. Thus we do the following: - * - * 1] We get all the entries from the last block of the space map - * and put them into a buffer in reverse order. This way the - * last entry comes first in the buffer, the second to last is - * second, etc. - * 2] We iterate through the entries in the buffer and we apply - * the callback to each one. 
As we move from entry to entry we - * we decrease the size of the space map, deleting effectively - * each entry. - * 3] If there are no more entries in the space map or the callback - * returns a value other than 0, we stop iterating over the - * space map. If there are entries remaining and the callback - * returned 0, we go back to step [1]. - */ - int error = 0; - while (space_map_length(sm) > 0 && error == 0) { - uint64_t nwords = 0; - error = space_map_reversed_last_block_entries(sm, buf, bufsz, - &nwords); - if (error != 0) - break; - - ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t)); - - for (uint64_t i = 0; i < nwords; i++) { - uint64_t e = buf[i]; - - if (sm_entry_is_debug(e)) { - sm->sm_phys->smp_length -= sizeof (uint64_t); - continue; - } - - int words = 1; - uint64_t raw_offset, raw_run, vdev_id; - maptype_t type; - if (sm_entry_is_single_word(e)) { - type = SM_TYPE_DECODE(e); - vdev_id = SM_NO_VDEVID; - raw_offset = SM_OFFSET_DECODE(e); - raw_run = SM_RUN_DECODE(e); - } else { - ASSERT(sm_entry_is_double_word(e)); - words = 2; - - raw_run = SM2_RUN_DECODE(e); - vdev_id = SM2_VDEV_DECODE(e); - - /* move to the second word */ - i++; - e = buf[i]; - - ASSERT3P(i, <=, nwords); - - type = SM2_TYPE_DECODE(e); - raw_offset = SM2_OFFSET_DECODE(e); - } - - uint64_t entry_offset = - (raw_offset << sm->sm_shift) + sm->sm_start; - uint64_t entry_run = raw_run << sm->sm_shift; - - VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); - VERIFY3U(entry_offset, >=, sm->sm_start); - VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size); - VERIFY3U(entry_run, <=, sm->sm_size); - VERIFY3U(entry_offset + entry_run, <=, - sm->sm_start + sm->sm_size); - - space_map_entry_t sme = { - .sme_type = type, - .sme_vdev = vdev_id, - .sme_offset = entry_offset, - .sme_run = entry_run - }; - error = callback(&sme, arg); - if (error != 0) - break; - - if (type == SM_ALLOC) - sm->sm_phys->smp_alloc -= entry_run; - else - sm->sm_phys->smp_alloc += entry_run; - sm->sm_phys->smp_length -= words * sizeof (uint64_t); - } - } - - if (space_map_length(sm) == 0) { - ASSERT0(error); - ASSERT0(space_map_allocated(sm)); - } - - zio_buf_free(buf, bufsz); - return (error); -} - -typedef struct space_map_load_arg { - space_map_t *smla_sm; - range_tree_t *smla_rt; - maptype_t smla_type; -} space_map_load_arg_t; - -static int -space_map_load_callback(space_map_entry_t *sme, void *arg) -{ - space_map_load_arg_t *smla = arg; - if (sme->sme_type == smla->smla_type) { - VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=, - smla->smla_sm->sm_size); - range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); - } else { - range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); - } - - return (0); -} - -/* - * Load the spacemap into the rangetree, like space_map_load. But only - * read the first 'length' bytes of the spacemap. - */ -int -space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t length) -{ - space_map_load_arg_t smla; - - VERIFY0(range_tree_space(rt)); - - if (maptype == SM_FREE) - range_tree_add(rt, sm->sm_start, sm->sm_size); - - smla.smla_rt = rt; - smla.smla_sm = sm; - smla.smla_type = maptype; - int err = space_map_iterate(sm, length, - space_map_load_callback, &smla); - - if (err != 0) - range_tree_vacate(rt, NULL, NULL); - - return (err); -} - -/* - * Load the space map disk into the specified range tree. Segments of maptype - * are added to the range tree, other segment types are removed. 
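space_map_load_length() above is itself just a space_map_iterate() consumer; any other walk over the on-disk log follows the same callback pattern. The sketch below assumes the declarations from sys/space_map.h in this tree (space_map_entry_t, SM_ALLOC, sm_cb_t, space_map_iterate, space_map_length) and is not a standalone program; the sm_tally names are made up for the example.

/*
 * Tally how many bytes the space map log records as allocated and freed.
 */
typedef struct sm_tally {
	uint64_t	st_alloc;
	uint64_t	st_free;
} sm_tally_t;

static int
sm_tally_cb(space_map_entry_t *sme, void *arg)
{
	sm_tally_t *st = arg;

	if (sme->sme_type == SM_ALLOC)
		st->st_alloc += sme->sme_run;
	else
		st->st_free += sme->sme_run;

	return (0);		/* a non-zero return stops the iteration */
}

static int
sm_tally(space_map_t *sm, sm_tally_t *st)
{
	st->st_alloc = st->st_free = 0;
	return (space_map_iterate(sm, space_map_length(sm),
	    sm_tally_cb, st));
}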
- */ -int -space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) -{ - return (space_map_load_length(sm, rt, maptype, space_map_length(sm))); -} - -void -space_map_histogram_clear(space_map_t *sm) -{ - if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) - return; - - bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram)); -} - -boolean_t -space_map_histogram_verify(space_map_t *sm, range_tree_t *rt) -{ - /* - * Verify that the in-core range tree does not have any - * ranges smaller than our sm_shift size. - */ - for (int i = 0; i < sm->sm_shift; i++) { - if (rt->rt_histogram[i] != 0) - return (B_FALSE); - } - return (B_TRUE); -} - -void -space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) -{ - int idx = 0; - - ASSERT(dmu_tx_is_syncing(tx)); - VERIFY3U(space_map_object(sm), !=, 0); - - if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) - return; - - dmu_buf_will_dirty(sm->sm_dbuf, tx); - - ASSERT(space_map_histogram_verify(sm, rt)); - /* - * Transfer the content of the range tree histogram to the space - * map histogram. The space map histogram contains 32 buckets ranging - * between 2^sm_shift to 2^(32+sm_shift-1). The range tree, - * however, can represent ranges from 2^0 to 2^63. Since the space - * map only cares about allocatable blocks (minimum of sm_shift) we - * can safely ignore all ranges in the range tree smaller than sm_shift. - */ - for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - - /* - * Since the largest histogram bucket in the space map is - * 2^(32+sm_shift-1), we need to normalize the values in - * the range tree for any bucket larger than that size. For - * example given an sm_shift of 9, ranges larger than 2^40 - * would get normalized as if they were 1TB ranges. Assume - * the range tree had a count of 5 in the 2^44 (16TB) bucket, - * the calculation below would normalize this to 5 * 2^4 (16). - */ - ASSERT3U(i, >=, idx + sm->sm_shift); - sm->sm_phys->smp_histogram[idx] += - rt->rt_histogram[i] << (i - idx - sm->sm_shift); - - /* - * Increment the space map's index as long as we haven't - * reached the maximum bucket size. Accumulate all ranges - * larger than the max bucket size into the last bucket. - */ - if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { - ASSERT3U(idx + sm->sm_shift, ==, i); - idx++; - ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); - } - } -} - -static void -space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) -{ - dmu_buf_will_dirty(sm->sm_dbuf, tx); - - uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | - SM_DEBUG_ACTION_ENCODE(maptype) | - SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | - SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - - dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length, - sizeof (dentry), &dentry, tx); - - sm->sm_phys->smp_length += sizeof (dentry); -} - -/* - * Writes one or more entries given a segment. - * - * Note: The function may release the dbuf from the pointer initially - * passed to it, and return a different dbuf. Also, the space map's - * dbuf must be dirty for the changes in sm_phys to take effect. 
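The bucket folding in space_map_histogram_add() is easier to see with numbers. The standalone sketch below reproduces the loop with sm_shift = 9 (the value used in the comment's example) and made-up bucket counts; the 64 and 32 bucket sizes correspond to RANGE_TREE_HISTOGRAM_SIZE and SPACE_MAP_HISTOGRAM_SIZE.

#include <stdint.h>
#include <stdio.h>

#define	RT_HIST_SIZE	64	/* range tree: buckets 2^0 .. 2^63 */
#define	SM_HIST_SIZE	32	/* space map: 2^shift .. 2^(shift+31) */

int
main(void)
{
	uint64_t rt_hist[RT_HIST_SIZE] = { 0 };
	uint64_t sm_hist[SM_HIST_SIZE] = { 0 };
	int shift = 9;		/* 512-byte minimum allocation */
	int idx = 0;

	rt_hist[12] = 7;	/* seven segments in [4K, 8K) */
	rt_hist[44] = 5;	/* five segments in [16T, 32T) */

	for (int i = shift; i < RT_HIST_SIZE; i++) {
		/* Segments larger than the last bucket are scaled down. */
		sm_hist[idx] += rt_hist[i] << (i - idx - shift);
		if (idx < SM_HIST_SIZE - 1)
			idx++;
	}

	/* Bucket 3 (2^12) holds 7; the last bucket holds 5 << 4 = 80. */
	printf("%ju %ju\n", (uintmax_t)sm_hist[3], (uintmax_t)sm_hist[31]);
	return (0);
}

Segments that fit a space map bucket are copied across one for one; segments past the last bucket are scaled, so the five 16TB-class segments land in the final bucket with a weight of 5 * 2^4 = 80.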
- */ -static void -space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, - uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) -{ - ASSERT3U(words, !=, 0); - ASSERT3U(words, <=, 2); - - /* ensure the vdev_id can be represented by the space map */ - ASSERT3U(vdev_id, <=, SM_NO_VDEVID); - - /* - * if this is a single word entry, ensure that no vdev was - * specified. - */ - IMPLY(words == 1, vdev_id == SM_NO_VDEVID); - - dmu_buf_t *db = *dbp; - ASSERT3U(db->db_size, ==, sm->sm_blksz); - - uint64_t *block_base = db->db_data; - uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); - uint64_t *block_cursor = block_base + - (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); - - ASSERT3P(block_cursor, <=, block_end); - - uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; - - ASSERT3U(rs->rs_start, >=, sm->sm_start); - ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); - ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); - ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); - - while (size != 0) { - ASSERT3P(block_cursor, <=, block_end); - - /* - * If we are at the end of this block, flush it and start - * writing again from the beginning. - */ - if (block_cursor == block_end) { - dmu_buf_rele(db, tag); - - uint64_t next_word_offset = sm->sm_phys->smp_length; - VERIFY0(dmu_buf_hold(sm->sm_os, - space_map_object(sm), next_word_offset, - tag, &db, DMU_READ_PREFETCH)); - dmu_buf_will_dirty(db, tx); - - /* update caller's dbuf */ - *dbp = db; - - ASSERT3U(db->db_size, ==, sm->sm_blksz); - - block_base = db->db_data; - block_cursor = block_base; - block_end = block_base + - (db->db_size / sizeof (uint64_t)); - } - - /* - * If we are writing a two-word entry and we only have one - * word left on this block, just pad it with an empty debug - * entry and write the two-word entry in the next block. - */ - uint64_t *next_entry = block_cursor + 1; - if (next_entry == block_end && words > 1) { - ASSERT3U(words, ==, 2); - *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | - SM_DEBUG_ACTION_ENCODE(0) | - SM_DEBUG_SYNCPASS_ENCODE(0) | - SM_DEBUG_TXG_ENCODE(0); - block_cursor++; - sm->sm_phys->smp_length += sizeof (uint64_t); - ASSERT3P(block_cursor, ==, block_end); - continue; - } - - uint64_t run_len = MIN(size, run_max); - switch (words) { - case 1: - *block_cursor = SM_OFFSET_ENCODE(start) | - SM_TYPE_ENCODE(maptype) | - SM_RUN_ENCODE(run_len); - block_cursor++; - break; - case 2: - /* write the first word of the entry */ - *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) | - SM2_RUN_ENCODE(run_len) | - SM2_VDEV_ENCODE(vdev_id); - block_cursor++; - - /* move on to the second word of the entry */ - ASSERT3P(block_cursor, <, block_end); - *block_cursor = SM2_TYPE_ENCODE(maptype) | - SM2_OFFSET_ENCODE(start); - block_cursor++; - break; - default: - panic("%d-word space map entries are not supported", - words); - break; - } - sm->sm_phys->smp_length += words * sizeof (uint64_t); - - start += run_len; - size -= run_len; - } - ASSERT0(size); - -} - -/* - * Note: The space map's dbuf must be dirty for the changes in sm_phys to - * take effect. 
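One detail of space_map_write_seg() worth calling out is the block boundary rule: a two-word entry is never split across blocks, so a lone trailing word is burned on an empty debug entry. The standalone sketch below just restates that rule as arithmetic; the helper name and the 4K block size are illustrative, not taken from the tree.

#include <stdint.h>
#include <stdio.h>

/*
 * Given the space map block size and current length in bytes, return how
 * many padding words must be written before an entry of entry_words words.
 */
static uint64_t
pad_words_before_entry(uint64_t blksz, uint64_t length, int entry_words)
{
	uint64_t words_left = (blksz - (length % blksz)) / sizeof (uint64_t);

	/* A one-word entry always fits; a two-word entry needs two words. */
	if (entry_words == 2 && words_left == 1)
		return (1);
	return (0);
}

int
main(void)
{
	/* 4K blocks: with 4088 bytes used, only one word is left. */
	printf("%ju\n", (uintmax_t)pad_words_before_entry(4096, 4088, 2));	/* 1 */
	printf("%ju\n", (uintmax_t)pad_words_before_entry(4096, 4080, 2));	/* 0 */
	return (0);
}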
- */ -static void -space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t vdev_id, dmu_tx_t *tx) -{ - spa_t *spa = tx->tx_pool->dp_spa; - dmu_buf_t *db; - - space_map_write_intro_debug(sm, maptype, tx); - -#ifdef DEBUG - /* - * We do this right after we write the intro debug entry - * because the estimate does not take it into account. - */ - uint64_t initial_objsize = sm->sm_phys->smp_length; - uint64_t estimated_growth = - space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); - uint64_t estimated_final_objsize = initial_objsize + estimated_growth; -#endif - - /* - * Find the offset right after the last word in the space map - * and use that to get a hold of the last block, so we can - * start appending to it. - */ - uint64_t next_word_offset = sm->sm_phys->smp_length; - VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), - next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); - ASSERT3U(db->db_size, ==, sm->sm_blksz); - - dmu_buf_will_dirty(db, tx); - - avl_tree_t *t = &rt->rt_root; - for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - uint8_t words = 1; - - /* - * We only write two-word entries when both of the following - * are true: - * - * [1] The feature is enabled. - * [2] The offset or run is too big for a single-word entry, - * or the vdev_id is set (meaning not equal to - * SM_NO_VDEVID). - * - * Note that for purposes of testing we've added the case that - * we write two-word entries occasionally when the feature is - * enabled and zfs_force_some_double_word_sm_entries has been - * set. - */ - if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) && - (offset >= (1ULL << SM_OFFSET_BITS) || - length > SM_RUN_MAX || - vdev_id != SM_NO_VDEVID || - (zfs_force_some_double_word_sm_entries && - spa_get_random(100) == 0))) - words = 2; - - space_map_write_seg(sm, rs, maptype, vdev_id, words, - &db, FTAG, tx); - } - - dmu_buf_rele(db, FTAG); - -#ifdef DEBUG - /* - * We expect our estimation to be based on the worst case - * scenario [see comment in space_map_estimate_optimal_size()]. - * Therefore we expect the actual objsize to be equal or less - * than whatever we estimated it to be. - */ - ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length); -#endif -} - -/* - * Note: This function manipulates the state of the given space map but - * does not hold any locks implicitly. Thus the caller is responsible - * for synchronizing writes to the space map. - */ -void -space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t vdev_id, dmu_tx_t *tx) -{ - objset_t *os = sm->sm_os; - - ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); - VERIFY3U(space_map_object(sm), !=, 0); - - dmu_buf_will_dirty(sm->sm_dbuf, tx); - - /* - * This field is no longer necessary since the in-core space map - * now contains the object number but is maintained for backwards - * compatibility. 
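The conditions listed above for choosing a two-word entry can be read as a single predicate. The sketch below pulls them out as such, assuming the SM_OFFSET_BITS, SM_RUN_MAX, SM_NO_VDEVID and SPA_FEATURE_SPACEMAP_V2 definitions from this tree and ignoring the zfs_force_some_double_word_sm_entries test hook; the function name is made up for the example.

static boolean_t
sm_entry_needs_two_words(spa_t *spa, uint64_t offset, uint64_t length,
    uint64_t vdev_id)
{
	/* Without the spacemap_v2 feature only one-word entries exist. */
	if (!spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2))
		return (B_FALSE);

	/* Anything a single word cannot express forces the wide format. */
	return (offset >= (1ULL << SM_OFFSET_BITS) ||
	    length > SM_RUN_MAX ||
	    vdev_id != SM_NO_VDEVID);
}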
- */ - sm->sm_phys->smp_object = sm->sm_object; - - if (range_tree_is_empty(rt)) { - VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); - return; - } - - if (maptype == SM_ALLOC) - sm->sm_phys->smp_alloc += range_tree_space(rt); - else - sm->sm_phys->smp_alloc -= range_tree_space(rt); - - uint64_t nodes = avl_numnodes(&rt->rt_root); - uint64_t rt_space = range_tree_space(rt); - - space_map_write_impl(sm, rt, maptype, vdev_id, tx); - - /* - * Ensure that the space_map's accounting wasn't changed - * while we were in the middle of writing it out. - */ - VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); - VERIFY3U(range_tree_space(rt), ==, rt_space); -} - -static int -space_map_open_impl(space_map_t *sm) -{ - int error; - u_longlong_t blocks; - - error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf); - if (error) - return (error); - - dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); - sm->sm_phys = sm->sm_dbuf->db_data; - return (0); -} - -int -space_map_open(space_map_t **smp, objset_t *os, uint64_t object, - uint64_t start, uint64_t size, uint8_t shift) -{ - space_map_t *sm; - int error; - - ASSERT(*smp == NULL); - ASSERT(os != NULL); - ASSERT(object != 0); - - sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); - - sm->sm_start = start; - sm->sm_size = size; - sm->sm_shift = shift; - sm->sm_os = os; - sm->sm_object = object; - - error = space_map_open_impl(sm); - if (error != 0) { - space_map_close(sm); - return (error); - } - *smp = sm; - - return (0); -} - -void -space_map_close(space_map_t *sm) -{ - if (sm == NULL) - return; - - if (sm->sm_dbuf != NULL) - dmu_buf_rele(sm->sm_dbuf, sm); - sm->sm_dbuf = NULL; - sm->sm_phys = NULL; - - kmem_free(sm, sizeof (*sm)); -} - -void -space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) -{ - objset_t *os = sm->sm_os; - spa_t *spa = dmu_objset_spa(os); - dmu_object_info_t doi; - - ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); - ASSERT(dmu_tx_is_syncing(tx)); - VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa)); - - dmu_object_info_from_db(sm->sm_dbuf, &doi); - - /* - * If the space map has the wrong bonus size (because - * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or - * the wrong block size (because space_map_blksz has changed), - * free and re-allocate its object with the updated sizes. - * - * Otherwise, just truncate the current object. - */ - if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && - doi.doi_bonus_size != sizeof (space_map_phys_t)) || - doi.doi_data_block_size != blocksize || - doi.doi_metadata_block_size != 1 << space_map_ibs) { - zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " - "object[%llu]: old bonus %u, old blocksz %u", - dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, - doi.doi_bonus_size, doi.doi_data_block_size); - - space_map_free(sm, tx); - dmu_buf_rele(sm->sm_dbuf, sm); - - sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx); - VERIFY0(space_map_open_impl(sm)); - } else { - VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx)); - - /* - * If the spacemap is reallocated, its histogram - * will be reset. Do the same in the common case so that - * bugs related to the uncommon case do not go unnoticed. 
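For callers outside this file, the functions above combine into a short open, load, close sequence. The sketch below shows one plausible shape of such a caller, assuming sys/space_map.h and sys/range_tree.h from this tree and a range tree the caller has already created; load_free_segments is a made-up name, not an API added by this change.

/*
 * Populate rt with the free segments recorded in space map object smobj.
 * With SM_FREE the tree is seeded with the whole range and the on-disk
 * log is then replayed on top of it.
 */
static int
load_free_segments(objset_t *os, uint64_t smobj, uint64_t start,
    uint64_t size, uint8_t shift, range_tree_t *rt)
{
	space_map_t *sm = NULL;
	int error;

	error = space_map_open(&sm, os, smobj, start, size, shift);
	if (error != 0)
		return (error);

	error = space_map_load(sm, rt, SM_FREE);

	space_map_close(sm);
	return (error);
}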
- */ - bzero(sm->sm_phys->smp_histogram, - sizeof (sm->sm_phys->smp_histogram)); - } - - dmu_buf_will_dirty(sm->sm_dbuf, tx); - sm->sm_phys->smp_length = 0; - sm->sm_phys->smp_alloc = 0; -} - -uint64_t -space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(os); - uint64_t object; - int bonuslen; - - if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { - spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); - bonuslen = sizeof (space_map_phys_t); - ASSERT3U(bonuslen, <=, dmu_bonus_max()); - } else { - bonuslen = SPACE_MAP_SIZE_V0; - } - - object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, - space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); - - return (object); -} - -void -space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx) -{ - spa_t *spa = dmu_objset_spa(os); - if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { - dmu_object_info_t doi; - - VERIFY0(dmu_object_info(os, smobj, &doi)); - if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) { - spa_feature_decr(spa, - SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); - } - } - - VERIFY0(dmu_object_free(os, smobj, tx)); -} - -void -space_map_free(space_map_t *sm, dmu_tx_t *tx) -{ - if (sm == NULL) - return; - - space_map_free_obj(sm->sm_os, space_map_object(sm), tx); - sm->sm_object = 0; -} - -/* - * Given a range tree, it makes a worst-case estimate of how much - * space would the tree's segments take if they were written to - * the given space map. - */ -uint64_t -space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, - uint64_t vdev_id) -{ - spa_t *spa = dmu_objset_spa(sm->sm_os); - uint64_t shift = sm->sm_shift; - uint64_t *histogram = rt->rt_histogram; - uint64_t entries_for_seg = 0; - - /* - * In order to get a quick estimate of the optimal size that this - * range tree would have on-disk as a space map, we iterate through - * its histogram buckets instead of iterating through its nodes. - * - * Note that this is a highest-bound/worst-case estimate for the - * following reasons: - * - * 1] We assume that we always add a debug padding for each block - * we write and we also assume that we start at the last word - * of a block attempting to write a two-word entry. - * 2] Rounding up errors due to the way segments are distributed - * in the buckets of the range tree's histogram. - * 3] The activation of zfs_force_some_double_word_sm_entries - * (tunable) when testing. - * - * = Math and Rounding Errors = - * - * rt_histogram[i] bucket of a range tree represents the number - * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given - * that, we want to divide the buckets into groups: Buckets that - * can be represented using a single-word entry, ones that can - * be represented with a double-word entry, and ones that can - * only be represented with multiple two-word entries. - * - * [Note that if the new encoding feature is not enabled there - * are only two groups: single-word entry buckets and multiple - * single-word entry buckets. The information below assumes - * two-word entries enabled, but it can easily applied when - * the feature is not enabled] - * - * To find the highest bucket that can be represented with a - * single-word entry we look at the maximum run that such entry - * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that - * the run of a space map entry is shifted by sm_shift, thus we - * add it to the exponent]. 
This way, excluding the value of the - * maximum run that can be represented by a single-word entry, - * all runs that are smaller exist in buckets 0 to - * SM_RUN_BITS + shift - 1. - * - * To find the highest bucket that can be represented with a - * double-word entry, we follow the same approach. Finally, any - * bucket higher than that are represented with multiple two-word - * entries. To be more specific, if the highest bucket whose - * segments can be represented with a single two-word entry is X, - * then bucket X+1 will need 2 two-word entries for each of its - * segments, X+2 will need 4, X+3 will need 8, ...etc. - * - * With all of the above we make our estimation based on bucket - * groups. There is a rounding error though. As we mentioned in - * the example with the one-word entry, the maximum run that can - * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is - * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of - * that length fall into the next bucket (and bucket group) where - * we start counting two-word entries and this is one more reason - * why the estimated size may end up being bigger than the actual - * size written. - */ - uint64_t size = 0; - uint64_t idx = 0; - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) || - (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) { - - /* - * If we are trying to force some double word entries just - * assume the worst-case of every single word entry being - * written as a double word entry. - */ - uint64_t entry_size = - (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) && - zfs_force_some_double_word_sm_entries) ? - (2 * sizeof (uint64_t)) : sizeof (uint64_t); - - uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1; - for (; idx <= single_entry_max_bucket; idx++) - size += histogram[idx] * entry_size; - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { - for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { - ASSERT3U(idx, >=, single_entry_max_bucket); - entries_for_seg = - 1ULL << (idx - single_entry_max_bucket); - size += histogram[idx] * - entries_for_seg * entry_size; - } - return (size); - } - } - - ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)); - - uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1; - for (; idx <= double_entry_max_bucket; idx++) - size += histogram[idx] * 2 * sizeof (uint64_t); - - for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { - ASSERT3U(idx, >=, double_entry_max_bucket); - entries_for_seg = 1ULL << (idx - double_entry_max_bucket); - size += histogram[idx] * - entries_for_seg * 2 * sizeof (uint64_t); - } - - /* - * Assume the worst case where we start with the padding at the end - * of the current block and we add an extra padding entry at the end - * of all subsequent blocks. - */ - size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t); - - return (size); -} - -uint64_t -space_map_object(space_map_t *sm) -{ - return (sm != NULL ? sm->sm_object : 0); -} - -int64_t -space_map_allocated(space_map_t *sm) -{ - return (sm != NULL ? sm->sm_phys->smp_alloc : 0); -} - -uint64_t -space_map_length(space_map_t *sm) -{ - return (sm != NULL ? 
sm->sm_phys->smp_length : 0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include - -/* - * Space reference trees. - * - * A range tree is a collection of integers. Every integer is either - * in the tree, or it's not. A space reference tree generalizes - * the idea: it allows its members to have arbitrary reference counts, - * as opposed to the implicit reference count of 0 or 1 in a range tree. - * This representation comes in handy when computing the union or - * intersection of multiple space maps. For example, the union of - * N range trees is the subset of the reference tree with refcnt >= 1. - * The intersection of N range trees is the subset with refcnt >= N. - * - * [It's very much like a Fourier transform. Unions and intersections - * are hard to perform in the 'range tree domain', so we convert the trees - * into the 'reference count domain', where it's trivial, then invert.] - * - * vdev_dtl_reassess() uses computations of this form to determine - * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev - * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev - * has an outage wherever refcnt >= vdev_children. - */ -static int -space_reftree_compare(const void *x1, const void *x2) -{ - const space_ref_t *sr1 = (const space_ref_t *)x1; - const space_ref_t *sr2 = (const space_ref_t *)x2; - - int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); - if (likely(cmp)) - return (cmp); - - return (AVL_PCMP(sr1, sr2)); -} - -void -space_reftree_create(avl_tree_t *t) -{ - avl_create(t, space_reftree_compare, - sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); -} - -void -space_reftree_destroy(avl_tree_t *t) -{ - space_ref_t *sr; - void *cookie = NULL; - - while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(sr, sizeof (*sr)); - - avl_destroy(t); -} - -static void -space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) -{ - space_ref_t *sr; - - sr = kmem_alloc(sizeof (*sr), KM_SLEEP); - sr->sr_offset = offset; - sr->sr_refcnt = refcnt; - - avl_add(t, sr); -} - -void -space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, - int64_t refcnt) -{ - space_reftree_add_node(t, start, refcnt); - space_reftree_add_node(t, end, -refcnt); -} - -/* - * Convert (or add) a range tree into a reference tree. 
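The union and intersection behaviour described above can be demonstrated without the AVL machinery. The standalone sketch below replaces the reference tree with a sorted list of signed edges and performs the same sweep that space_reftree_generate_map() performs: with minref = 1 it produces the union of the input ranges, with minref = 2 their intersection. The input ranges and the edge_t type are made up for the example.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct edge {
	uint64_t	e_offset;
	int64_t		e_refcnt;	/* +refcnt at start, -refcnt at end */
} edge_t;

static int
edge_cmp(const void *a, const void *b)
{
	const edge_t *ea = a, *eb = b;

	if (ea->e_offset == eb->e_offset)
		return (0);
	return (ea->e_offset < eb->e_offset ? -1 : 1);
}

/* Sweep the edges and print every range whose refcount is >= minref. */
static void
generate(edge_t *edges, int n, int64_t minref)
{
	int64_t refcnt = 0;
	uint64_t start = 0;
	int active = 0;

	qsort(edges, n, sizeof (edge_t), edge_cmp);
	for (int i = 0; i < n; i++) {
		refcnt += edges[i].e_refcnt;
		if (refcnt >= minref && !active) {
			start = edges[i].e_offset;
			active = 1;
		} else if (refcnt < minref && active) {
			printf("[%ju, %ju) ", (uintmax_t)start,
			    (uintmax_t)edges[i].e_offset);
			active = 0;
		}
	}
	printf("\n");
}

int
main(void)
{
	/* A = [0,10) u [20,30) and B = [5,25), each added with refcnt 1. */
	edge_t edges[] = {
		{ 0, 1 }, { 10, -1 }, { 20, 1 }, { 30, -1 },	/* tree A */
		{ 5, 1 }, { 25, -1 },				/* tree B */
	};
	int n = sizeof (edges) / sizeof (edges[0]);

	generate(edges, n, 1);	/* union:        [0, 30) */
	generate(edges, n, 2);	/* intersection: [5, 10) [20, 25) */
	return (0);
}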
- */ -void -space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) -{ - range_seg_t *rs; - - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) - space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); -} - -/* - * Convert a reference tree into a range tree. The range tree will contain - * all members of the reference tree for which refcnt >= minref. - */ -void -space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref) -{ - uint64_t start = -1ULL; - int64_t refcnt = 0; - space_ref_t *sr; - - range_tree_vacate(rt, NULL, NULL); - - for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { - refcnt += sr->sr_refcnt; - if (refcnt >= minref) { - if (start == -1ULL) { - start = sr->sr_offset; - } - } else { - if (start != -1ULL) { - uint64_t end = sr->sr_offset; - ASSERT(start <= end); - if (end > start) - range_tree_add(rt, start, end - start); - start = -1ULL; - } - } - } - ASSERT(refcnt == 0); - ASSERT(start == -1ULL); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/abd.h +++ /dev/null @@ -1,154 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _ABD_H -#define _ABD_H - -#include -#ifdef illumos -#include -#else -#include -#endif -#include -#include -#ifdef _KERNEL -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum abd_flags { - ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ - ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ - ABD_FLAG_META = 1 << 2 /* does this represent FS metadata? */ -} abd_flags_t; - -typedef struct abd { - abd_flags_t abd_flags; - uint_t abd_size; /* excludes scattered abd_offset */ - struct abd *abd_parent; - zfs_refcount_t abd_children; - union { - struct abd_scatter { - uint_t abd_offset; - uint_t abd_chunk_size; - void *abd_chunks[]; - } abd_scatter; - struct abd_linear { - void *abd_buf; - } abd_linear; - } abd_u; -} abd_t; - -typedef int abd_iter_func_t(void *, size_t, void *); -typedef int abd_iter_func2_t(void *, void *, size_t, void *); - -extern boolean_t zfs_abd_scatter_enabled; - -inline boolean_t -abd_is_linear(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? 
B_TRUE : B_FALSE); -} - -/* - * Allocations and deallocations - */ - -abd_t *abd_alloc(size_t, boolean_t); -abd_t *abd_alloc_linear(size_t, boolean_t); -abd_t *abd_alloc_for_io(size_t, boolean_t); -abd_t *abd_alloc_sametype(abd_t *, size_t); -void abd_free(abd_t *); -abd_t *abd_get_offset(abd_t *, size_t); -abd_t *abd_get_from_buf(void *, size_t); -void abd_put(abd_t *); - -/* - * Conversion to and from a normal buffer - */ - -void *abd_to_buf(abd_t *); -void *abd_borrow_buf(abd_t *, size_t); -void *abd_borrow_buf_copy(abd_t *, size_t); -void abd_return_buf(abd_t *, void *, size_t); -void abd_return_buf_copy(abd_t *, void *, size_t); -void abd_take_ownership_of_buf(abd_t *, boolean_t); -void abd_release_ownership_of_buf(abd_t *); - -/* - * ABD operations - */ - -int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); -int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, - abd_iter_func2_t *, void *); -void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); -void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); -void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); -int abd_cmp(abd_t *, abd_t *, size_t); -int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); -void abd_zero_off(abd_t *, size_t, size_t); - -/* - * Wrappers for calls with offsets of 0 - */ - -inline void -abd_copy(abd_t *dabd, abd_t *sabd, size_t size) -{ - abd_copy_off(dabd, sabd, 0, 0, size); -} - -inline void -abd_copy_from_buf(abd_t *abd, const void *buf, size_t size) -{ - abd_copy_from_buf_off(abd, buf, 0, size); -} - -inline void -abd_copy_to_buf(void* buf, abd_t *abd, size_t size) -{ - abd_copy_to_buf_off(buf, abd, 0, size); -} - -inline int -abd_cmp_buf(abd_t *abd, const void *buf, size_t size) -{ - return (abd_cmp_buf_off(abd, buf, 0, size)); -} - -inline void -abd_zero(abd_t *abd, size_t size) -{ - abd_zero_off(abd, 0, size); -} - -/* - * Module lifecycle - */ - -void abd_init(void); -void abd_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _ABD_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/aggsum.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_AGGSUM_H -#define _SYS_AGGSUM_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct aggsum_bucket { - kmutex_t asc_lock; - int64_t asc_delta; - uint64_t asc_borrowed; - uint64_t asc_pad[2]; /* pad out to cache line (64 bytes) */ -} aggsum_bucket_t __aligned(CACHE_LINE_SIZE); - -/* - * Fan out over FANOUT cpus. 
- */ -typedef struct aggsum { - kmutex_t as_lock; - int64_t as_lower_bound; - int64_t as_upper_bound; - uint_t as_numbuckets; - aggsum_bucket_t *as_buckets; -} aggsum_t; - -void aggsum_init(aggsum_t *, uint64_t); -void aggsum_fini(aggsum_t *); -int64_t aggsum_lower_bound(aggsum_t *); -int64_t aggsum_upper_bound(aggsum_t *); -int aggsum_compare(aggsum_t *, uint64_t); -uint64_t aggsum_value(aggsum_t *); -void aggsum_add(aggsum_t *, int64_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_AGGSUM_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ /dev/null @@ -1,290 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - */ - -#ifndef _SYS_ARC_H -#define _SYS_ARC_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include - -/* - * Used by arc_flush() to inform arc_evict_state() that it should evict - * all available buffers from the arc state being passed in. - */ -#define ARC_EVICT_ALL -1ULL - -#define HDR_SET_LSIZE(hdr, x) do { \ - ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ - (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ -_NOTE(CONSTCOND) } while (0) - -#define HDR_SET_PSIZE(hdr, x) do { \ - ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ - (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ -_NOTE(CONSTCOND) } while (0) - -#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) -#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) - -typedef struct arc_buf_hdr arc_buf_hdr_t; -typedef struct arc_buf arc_buf_t; -typedef struct arc_prune arc_prune_t; - -/* - * Because the ARC can store encrypted data, errors (not due to bugs) may arise - * while transforming data into its desired format - specifically, when - * decrypting, the key may not be present, or the HMAC may not be correct - * which signifies deliberate tampering with the on-disk state - * (assuming that the checksum was correct). If any error occurs, the "buf" - * parameter will be NULL. 
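[Illustration] The aggsum interface removed above spreads one logical counter across per-CPU buckets so writers rarely contend, and only folds the buckets together when an exact value is needed. A rough user-space sketch of that fan-out, with invented names and fixed sizing rather than the kernel's CPU-count-based layout:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NBUCKETS	8	/* the kernel derives this from the CPU count */

struct toy_aggsum {
	_Atomic int64_t buckets[NBUCKETS];
};

static void
toy_aggsum_add(struct toy_aggsum *as, unsigned cpu, int64_t delta)
{
	/* Each "CPU" updates its own bucket; no global lock is taken. */
	atomic_fetch_add_explicit(&as->buckets[cpu % NBUCKETS], delta,
	    memory_order_relaxed);
}

static int64_t
toy_aggsum_value(struct toy_aggsum *as)
{
	/* The expensive path: fold every bucket into one total. */
	int64_t sum = 0;

	for (int i = 0; i < NBUCKETS; i++)
		sum += atomic_load_explicit(&as->buckets[i],
		    memory_order_relaxed);
	return (sum);
}

int
main(void)
{
	static struct toy_aggsum as;

	toy_aggsum_add(&as, 0, 100);
	toy_aggsum_add(&as, 3, -25);
	toy_aggsum_add(&as, 5, 7);
	printf("value = %jd\n", (intmax_t)toy_aggsum_value(&as));	/* 82 */
	return (0);
}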
- */ -typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, - const blkptr_t *bp, arc_buf_t *buf, void *priv); -typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); -typedef void arc_prune_func_t(int64_t bytes, void *priv); - -/* Shared module parameters */ -extern uint64_t zfs_arc_average_blocksize; - -/* generic arc_done_func_t's which you can use */ -arc_read_done_func_t arc_bcopy_func; -arc_read_done_func_t arc_getbuf_func; - -/* generic arc_prune_func_t wrapper for callbacks */ -struct arc_prune { - arc_prune_func_t *p_pfunc; - void *p_private; - uint64_t p_adjust; - list_node_t p_node; - zfs_refcount_t p_refcnt; -}; - -typedef enum arc_strategy { - ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */ - ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */ -} arc_strategy_t; - -typedef enum arc_flags -{ - /* - * Public flags that can be passed into the ARC by external consumers. - */ - ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ - ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ - ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ - ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ - ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ - ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ - - /* - * Private ARC flags. These flags are private ARC only flags that - * will show up in b_flags in the arc_hdr_buf_t. These flags should - * only be set by ARC code. - */ - ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ - ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */ - /* Indicates that block was read with ASYNC priority. */ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 11, - ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ - /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 15, - - /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 16, - ARC_FLAG_HAS_L2HDR = 1 << 17, - - /* - * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. - * This allows the l2arc to use the blkptr's checksum to verify - * the data without having to store the checksum in the hdr. - */ - ARC_FLAG_COMPRESSED_ARC = 1 << 18, - ARC_FLAG_SHARED_DATA = 1 << 19, - - /* - * The arc buffer's compression mode is stored in the top 7 bits of the - * flags field, so these dummy flags are included so that MDB can - * interpret the enum properly. - */ - ARC_FLAG_COMPRESS_0 = 1 << 24, - ARC_FLAG_COMPRESS_1 = 1 << 25, - ARC_FLAG_COMPRESS_2 = 1 << 26, - ARC_FLAG_COMPRESS_3 = 1 << 27, - ARC_FLAG_COMPRESS_4 = 1 << 28, - ARC_FLAG_COMPRESS_5 = 1 << 29, - ARC_FLAG_COMPRESS_6 = 1 << 30 - -} arc_flags_t; - -typedef enum arc_buf_flags { - ARC_BUF_FLAG_SHARED = 1 << 0, - ARC_BUF_FLAG_COMPRESSED = 1 << 1 -} arc_buf_flags_t; - -struct arc_buf { - arc_buf_hdr_t *b_hdr; - arc_buf_t *b_next; - kmutex_t b_evict_lock; - void *b_data; - arc_buf_flags_t b_flags; -}; - -typedef enum arc_buf_contents { - ARC_BUFC_INVALID, /* invalid type */ - ARC_BUFC_DATA, /* buffer contains data */ - ARC_BUFC_METADATA, /* buffer contains metadata */ - ARC_BUFC_NUMTYPES -} arc_buf_contents_t; - -/* - * The following breakdows of arc_size exist for kstat only. 
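[Illustration] The arc_flags_t comment above notes that the buffer's compression mode is stored in the top 7 bits of the flags word (hence the dummy ARC_FLAG_COMPRESS_* values). The sketch below shows the general trick of packing a small enumerated value into bits 24..30 alongside boolean flags; the macro names are made up and this is not the ARC code:

#include <stdint.h>
#include <stdio.h>

#define TOY_FLAG_WAIT		(1u << 0)
#define TOY_FLAG_L2CACHE	(1u << 4)

#define TOY_COMPRESS_SHIFT	24
#define TOY_COMPRESS_BITS	7
#define TOY_COMPRESS_MASK	(((1u << TOY_COMPRESS_BITS) - 1) << TOY_COMPRESS_SHIFT)

/* Clear the old 7-bit field, then OR in the new value. */
#define TOY_SET_COMPRESS(flags, c) \
	((flags) = ((flags) & ~TOY_COMPRESS_MASK) | \
	    (((uint32_t)(c) << TOY_COMPRESS_SHIFT) & TOY_COMPRESS_MASK))
#define TOY_GET_COMPRESS(flags) \
	(((flags) & TOY_COMPRESS_MASK) >> TOY_COMPRESS_SHIFT)

int
main(void)
{
	uint32_t flags = TOY_FLAG_WAIT | TOY_FLAG_L2CACHE;

	TOY_SET_COMPRESS(flags, 5);	/* index into some compression table */
	printf("compress=%u wait=%d\n", TOY_GET_COMPRESS(flags),
	    (flags & TOY_FLAG_WAIT) != 0);	/* compress=5 wait=1 */
	return (0);
}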
- */ -typedef enum arc_space_type { - ARC_SPACE_DATA, - ARC_SPACE_META, - ARC_SPACE_HDRS, - ARC_SPACE_L2HDRS, - ARC_SPACE_DBUF, - ARC_SPACE_DNODE, - ARC_SPACE_BONUS, - ARC_SPACE_NUMTYPES -} arc_space_type_t; - -typedef enum arc_state_type { - ARC_STATE_ANON, - ARC_STATE_MRU, - ARC_STATE_MRU_GHOST, - ARC_STATE_MFU, - ARC_STATE_MFU_GHOST, - ARC_STATE_L2C_ONLY, - ARC_STATE_NUMTYPES -} arc_state_type_t; - -typedef struct arc_buf_info { - arc_state_type_t abi_state_type; - arc_buf_contents_t abi_state_contents; - uint64_t abi_state_index; - uint32_t abi_flags; - uint32_t abi_bufcnt; - uint64_t abi_size; - uint64_t abi_spa; - uint64_t abi_access; - uint32_t abi_mru_hits; - uint32_t abi_mru_ghost_hits; - uint32_t abi_mfu_hits; - uint32_t abi_mfu_ghost_hits; - uint32_t abi_l2arc_hits; - uint32_t abi_holds; - uint64_t abi_l2arc_dattr; - uint64_t abi_l2arc_asize; - enum zio_compress abi_l2arc_compress; -} arc_buf_info_t; - -void arc_space_consume(uint64_t space, arc_space_type_t type); -void arc_space_return(uint64_t space, arc_space_type_t type); -boolean_t arc_is_metadata(arc_buf_t *buf); -enum zio_compress arc_get_compression(arc_buf_t *buf); -int arc_decompress(arc_buf_t *buf); -arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, - int32_t size); -arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag, - uint64_t psize, uint64_t lsize, enum zio_compress compression_type); -arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); -arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type); -void arc_return_buf(arc_buf_t *buf, void *tag); -void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); -void arc_buf_destroy(arc_buf_t *buf, void *tag); -void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); -int arc_buf_size(arc_buf_t *buf); -int arc_buf_lsize(arc_buf_t *buf); -void arc_buf_access(arc_buf_t *buf); -void arc_release(arc_buf_t *buf, void *tag); -int arc_released(arc_buf_t *buf); -void arc_buf_freeze(arc_buf_t *buf); -void arc_buf_thaw(arc_buf_t *buf); -#ifdef ZFS_DEBUG -int arc_referenced(arc_buf_t *buf); -#endif - -int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - arc_read_done_func_t *done, void *priv, zio_priority_t priority, - int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, - arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, - arc_write_done_func_t *physdone, arc_write_done_func_t *done, - void *priv, zio_priority_t priority, int zio_flags, - const zbookmark_phys_t *zb); - -arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); -void arc_remove_prune_callback(arc_prune_t *p); -void arc_freed(spa_t *spa, const blkptr_t *bp); - -void arc_flush(spa_t *spa, boolean_t retry); -void arc_tempreserve_clear(uint64_t reserve); -int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); - -uint64_t arc_max_bytes(void); -void arc_init(void); -void arc_fini(void); - -/* - * Level 2 ARC - */ - -void l2arc_add_vdev(spa_t *spa, vdev_t *vd); -void l2arc_remove_vdev(vdev_t *vd); -boolean_t l2arc_vdev_present(vdev_t *vd); -void l2arc_init(void); -void l2arc_fini(void); -void l2arc_start(void); -void l2arc_stop(void); - -#ifdef illumos -#ifndef _KERNEL -extern boolean_t arc_watch; -extern int arc_procfd; -#endif -#endif /* illumos */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ARC_H */ Index: 
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/blkptr.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#ifndef _SYS_BLKPTR_H -#define _SYS_BLKPTR_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -void encode_embedded_bp_compressed(blkptr_t *, void *, - enum zio_compress, int, int); -void decode_embedded_bp_compressed(const blkptr_t *, void *); -int decode_embedded_bp(const blkptr_t *, void *, int); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_BLKPTR_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _SYS_BPLIST_H -#define _SYS_BPLIST_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct bplist_entry { - blkptr_t bpe_blk; - list_node_t bpe_node; -} bplist_entry_t; - -typedef struct bplist { - kmutex_t bpl_lock; - list_t bpl_list; -} bplist_t; - -typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); - -void bplist_create(bplist_t *bpl); -void bplist_destroy(bplist_t *bpl); -void bplist_append(bplist_t *bpl, const blkptr_t *bp); -void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, - void *arg, dmu_tx_t *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_BPLIST_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_BPOBJ_H -#define _SYS_BPOBJ_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct bpobj_phys { - /* - * This is the bonus buffer for the dead lists. The object's - * contents is an array of bpo_entries blkptr_t's, representing - * a total of bpo_bytes physical space. - */ - uint64_t bpo_num_blkptrs; - uint64_t bpo_bytes; - uint64_t bpo_comp; - uint64_t bpo_uncomp; - uint64_t bpo_subobjs; - uint64_t bpo_num_subobjs; -} bpobj_phys_t; - -#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) -#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) - -typedef struct bpobj { - kmutex_t bpo_lock; - objset_t *bpo_os; - uint64_t bpo_object; - int bpo_epb; - uint8_t bpo_havecomp; - uint8_t bpo_havesubobj; - bpobj_phys_t *bpo_phys; - dmu_buf_t *bpo_dbuf; - dmu_buf_t *bpo_cached_dbuf; -} bpobj_t; - -typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); - -uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); -uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); -void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); -void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx); - -int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); -void bpobj_close(bpobj_t *bpo); -boolean_t bpobj_is_open(const bpobj_t *bpo); - -int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); -int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); - -void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); -void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); - -int bpobj_space(bpobj_t *bpo, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -boolean_t bpobj_is_empty(bpobj_t *bpo); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_BPOBJ_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - */ - -#ifndef _SYS_BPTREE_H -#define _SYS_BPTREE_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct bptree_phys { - uint64_t bt_begin; - uint64_t bt_end; - uint64_t bt_bytes; - uint64_t bt_comp; - uint64_t bt_uncomp; -} bptree_phys_t; - -typedef struct bptree_entry_phys { - blkptr_t be_bp; - uint64_t be_birth_txg; /* only delete blocks born after this txg */ - zbookmark_phys_t be_zb; /* holds traversal resume point if needed */ -} bptree_entry_phys_t; - -typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); - -uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); -int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); -boolean_t bptree_is_empty(objset_t *os, uint64_t obj); - -void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, - uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); - -int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, - bptree_itor_t func, void *arg, dmu_tx_t *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_BPTREE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014 by Delphix. All rights reserved. - */ - -#ifndef _BQUEUE_H -#define _BQUEUE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -typedef struct bqueue { - list_t bq_list; - kmutex_t bq_lock; - kcondvar_t bq_add_cv; - kcondvar_t bq_pop_cv; - uint64_t bq_size; - uint64_t bq_maxsize; - size_t bq_node_offset; -} bqueue_t; - -typedef struct bqueue_node { - list_node_t bqn_node; - uint64_t bqn_size; -} bqueue_node_t; - - -int bqueue_init(bqueue_t *, uint64_t, size_t); -void bqueue_destroy(bqueue_t *); -void bqueue_enqueue(bqueue_t *, void *, uint64_t); -void *bqueue_dequeue(bqueue_t *); -boolean_t bqueue_empty(bqueue_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _BQUEUE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/cityhash.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2011 Google, Inc. 
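[Illustration] The bqueue interface removed above (bq_lock, bq_add_cv, bq_pop_cv, bq_maxsize) reads like a classic bounded blocking queue. A stripped-down user-space analogue with POSIX threads is sketched below; it stores plain integers and uses invented names, so it only illustrates the pattern, not the kernel implementation:

#include <pthread.h>
#include <stdio.h>

#define TOYQ_CAP	4

struct toy_queue {
	pthread_mutex_t lock;
	pthread_cond_t add_cv;		/* signalled when space appears */
	pthread_cond_t pop_cv;		/* signalled when data appears */
	int items[TOYQ_CAP];
	int head, tail, count;
};

static void
toyq_enqueue(struct toy_queue *q, int v)
{
	pthread_mutex_lock(&q->lock);
	while (q->count == TOYQ_CAP)	/* full: wait for a consumer */
		pthread_cond_wait(&q->add_cv, &q->lock);
	q->items[q->tail] = v;
	q->tail = (q->tail + 1) % TOYQ_CAP;
	q->count++;
	pthread_cond_signal(&q->pop_cv);
	pthread_mutex_unlock(&q->lock);
}

static int
toyq_dequeue(struct toy_queue *q)
{
	int v;

	pthread_mutex_lock(&q->lock);
	while (q->count == 0)		/* empty: wait for a producer */
		pthread_cond_wait(&q->pop_cv, &q->lock);
	v = q->items[q->head];
	q->head = (q->head + 1) % TOYQ_CAP;
	q->count--;
	pthread_cond_signal(&q->add_cv);
	pthread_mutex_unlock(&q->lock);
	return (v);
}

int
main(void)
{
	static struct toy_queue q = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.add_cv = PTHREAD_COND_INITIALIZER,
		.pop_cv = PTHREAD_COND_INITIALIZER,
	};
	int a, b;

	toyq_enqueue(&q, 1);
	toyq_enqueue(&q, 2);
	a = toyq_dequeue(&q);
	b = toyq_dequeue(&q);
	printf("%d %d\n", a, b);	/* 1 2 */
	return (0);
}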
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_CITYHASH_H -#define _SYS_CITYHASH_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_CITYHASH_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ /dev/null @@ -1,417 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - */ - -#ifndef _SYS_DBUF_H -#define _SYS_DBUF_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define IN_DMU_SYNC 2 - -/* - * define flags for dbuf_read - */ - -#define DB_RF_MUST_SUCCEED (1 << 0) -#define DB_RF_CANFAIL (1 << 1) -#define DB_RF_HAVESTRUCT (1 << 2) -#define DB_RF_NOPREFETCH (1 << 3) -#define DB_RF_NEVERWAIT (1 << 4) -#define DB_RF_CACHED (1 << 5) - -/* - * The simplified state transition diagram for dbufs looks like: - * - * +----> READ ----+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * | ^ ^ - * | | | - * +----> FILL ----+ | - * | | - * | | - * +--------> NOFILL -------+ - * - * DB_SEARCH is an invalid state for a dbuf. 
It is used by dbuf_free_range - * to find all dbufs in a range of a dnode and must be less than any other - * dbuf_states_t (see comment on dn_dbufs in dnode.h). - */ -typedef enum dbuf_states { - DB_SEARCH = -1, - DB_UNCACHED, - DB_FILL, - DB_NOFILL, - DB_READ, - DB_CACHED, - DB_EVICTING -} dbuf_states_t; - -typedef enum dbuf_cached_state { - DB_NO_CACHE = -1, - DB_DBUF_CACHE, - DB_DBUF_METADATA_CACHE, - DB_CACHE_MAX -} dbuf_cached_state_t; - -struct dnode; -struct dmu_tx; - -/* - * level = 0 means the user data - * level = 1 means the single indirect block - * etc. - */ - -struct dmu_buf_impl; - -typedef enum override_states { - DR_NOT_OVERRIDDEN, - DR_IN_DMU_SYNC, - DR_OVERRIDDEN -} override_states_t; - -typedef struct dbuf_dirty_record { - /* link on our parents dirty list */ - list_node_t dr_dirty_node; - - /* transaction group this data will sync in */ - uint64_t dr_txg; - - /* zio of outstanding write IO */ - zio_t *dr_zio; - - /* pointer back to our dbuf */ - struct dmu_buf_impl *dr_dbuf; - - /* pointer to next dirty record */ - struct dbuf_dirty_record *dr_next; - - /* pointer to parent dirty record */ - struct dbuf_dirty_record *dr_parent; - - /* How much space was changed to dsl_pool_dirty_space() for this? */ - unsigned int dr_accounted; - - /* A copy of the bp that points to us */ - blkptr_t dr_bp_copy; - - union dirty_types { - struct dirty_indirect { - - /* protect access to list */ - kmutex_t dr_mtx; - - /* Our list of dirty children */ - list_t dr_children; - } di; - struct dirty_leaf { - - /* - * dr_data is set when we dirty the buffer - * so that we can retain the pointer even if it - * gets COW'd in a subsequent transaction group. - */ - arc_buf_t *dr_data; - blkptr_t dr_overridden_by; - override_states_t dr_override_state; - uint8_t dr_copies; - boolean_t dr_nopwrite; - } dl; - } dt; -} dbuf_dirty_record_t; - -typedef struct dmu_buf_impl { - /* - * The following members are immutable, with the exception of - * db.db_data, which is protected by db_mtx. - */ - - /* the publicly visible structure */ - dmu_buf_t db; - - /* the objset we belong to */ - struct objset *db_objset; - - /* - * handle to safely access the dnode we belong to (NULL when evicted) - */ - struct dnode_handle *db_dnode_handle; - - /* - * our parent buffer; if the dnode points to us directly, - * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf - * only accessed by sync thread ??? - * (NULL when evicted) - * May change from NULL to non-NULL under the protection of db_mtx - * (see dbuf_check_blkptr()) - */ - struct dmu_buf_impl *db_parent; - - /* - * link for hash table of all dmu_buf_impl_t's - */ - struct dmu_buf_impl *db_hash_next; - - /* - * Our link on the owner dnodes's dn_dbufs list. - * Protected by its dn_dbufs_mtx. Should be on the same cache line - * as db_level and db_blkid for the best avl_add() performance. - */ - avl_node_t db_link; - - /* our block number */ - uint64_t db_blkid; - - /* - * Pointer to the blkptr_t which points to us. May be NULL if we - * don't have one yet. (NULL when evicted) - */ - blkptr_t *db_blkptr; - - /* - * Our indirection level. Data buffers have db_level==0. - * Indirect buffers which point to data buffers have - * db_level==1. etc. Buffers which contain dnodes have - * db_level==0, since the dnodes are stored in a file. - */ - uint8_t db_level; - - /* db_mtx protects the members below */ - kmutex_t db_mtx; - - /* - * Current state of the buffer - */ - dbuf_states_t db_state; - - /* - * Refcount accessed by dmu_buf_{hold,rele}. 
- * If nonzero, the buffer can't be destroyed. - * Protected by db_mtx. - */ - zfs_refcount_t db_holds; - - /* buffer holding our data */ - arc_buf_t *db_buf; - - kcondvar_t db_changed; - dbuf_dirty_record_t *db_data_pending; - - /* pointer to most recent dirty record for this buffer */ - dbuf_dirty_record_t *db_last_dirty; - - /* Link in dbuf_cache or dbuf_metadata_cache */ - multilist_node_t db_cache_link; - - /* Tells us which dbuf cache this dbuf is in, if any */ - dbuf_cached_state_t db_caching_status; - - /* Data which is unique to data (leaf) blocks: */ - - /* User callback information. */ - dmu_buf_user_t *db_user; - - /* - * Evict user data as soon as the dirty and reference - * counts are equal. - */ - uint8_t db_user_immediate_evict; - - /* - * This block was freed while a read or write was - * active. - */ - uint8_t db_freed_in_flight; - - /* - * dnode_evict_dbufs() or dnode_evict_bonus() tried to - * evict this dbuf, but couldn't due to outstanding - * references. Evict once the refcount drops to 0. - */ - uint8_t db_pending_evict; - - uint8_t db_dirtycnt; -} dmu_buf_impl_t; - -/* Note: the dbuf hash table is exposed only for the mdb module */ -#define DBUF_MUTEXES 256 -#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) -typedef struct dbuf_hash_table { - uint64_t hash_table_mask; - dmu_buf_impl_t **hash_table; - kmutex_t hash_mutexes[DBUF_MUTEXES]; -} dbuf_hash_table_t; - -uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset); - -dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); -void dbuf_create_bonus(struct dnode *dn); -int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); -void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag); - -void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); - -dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); -dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, - void *tag); -int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp); - -void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, - zio_priority_t prio, arc_flags_t aflags); - -void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); -boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, - uint64_t blkid, void *tag); -uint64_t dbuf_refcount(dmu_buf_impl_t *db); - -void dbuf_rele(dmu_buf_impl_t *db, void *tag); -void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting); - -dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, - uint64_t blkid); - -int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); -void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); -void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); -dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); -void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, - bp_embedded_type_t etype, enum zio_compress comp, - int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); - -void dbuf_destroy(dmu_buf_impl_t *db); - -void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dbuf_unoverride(dbuf_dirty_record_t *dr); -void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); -void dbuf_release_bp(dmu_buf_impl_t *db); - 
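[Illustration] DBUF_HASH_MUTEX() above is lock striping: rather than one mutex per dbuf or one global mutex, a fixed power-of-two array of mutexes is indexed with "hash & (N - 1)". A tiny demonstration of that index computation, with invented names (the real hash mixes objset, object, level and blkid):

#include <stdint.h>
#include <stdio.h>

#define TOY_MUTEXES	256	/* must remain a power of two for the mask trick */

static unsigned
toy_stripe(uint64_t hash)
{
	/* Equivalent to hash % TOY_MUTEXES, but a single AND. */
	return ((unsigned)(hash & (TOY_MUTEXES - 1)));
}

int
main(void)
{
	uint64_t hashes[] = { 17, 273, 4096, 0xdeadbeef };

	for (int i = 0; i < 4; i++)
		printf("hash %ju -> mutex %u\n", (uintmax_t)hashes[i],
		    toy_stripe(hashes[i]));
	/* 17 -> 17, 273 -> 17 (a collision shares a mutex), 4096 -> 0, ... */
	return (0);
}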
-boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf); - -void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, - struct dmu_tx *); - -void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); - -void dbuf_stats_init(dbuf_hash_table_t *hash); -void dbuf_stats_destroy(void); - -#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) -#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) -#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) -#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) -#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) - -void dbuf_init(void); -void dbuf_fini(void); - -boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); - -#define DBUF_GET_BUFC_TYPE(_db) \ - (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) - -#define DBUF_IS_CACHEABLE(_db) \ - ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) - -#define DBUF_IS_L2CACHEABLE(_db) \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) - -#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (((_level) > 0 || \ - DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))) - -#ifdef ZFS_DEBUG - -/* - * There should be a ## between the string literal and fmt, to make it - * clear that we're joining two strings together, but gcc does not - * support that preprocessor token. - */ -#define dprintf_dbuf(dbuf, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char __db_buf[32]; \ - uint64_t __db_obj = (dbuf)->db.db_object; \ - if (__db_obj == DMU_META_DNODE_OBJECT) \ - (void) strcpy(__db_buf, "mdn"); \ - else \ - (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ - (u_longlong_t)__db_obj); \ - dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ - "obj=%s lvl=%u blkid=%lld " fmt, \ - __db_buf, (dbuf)->db_level, \ - (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ - } \ -_NOTE(CONSTCOND) } while (0) - -#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ - dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ - kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) - -#define DBUF_VERIFY(db) dbuf_verify(db) - -#else - -#define dprintf_dbuf(db, fmt, ...) -#define dprintf_dbuf_bp(db, bp, fmt, ...) -#define DBUF_VERIFY(db) - -#endif - - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DBUF_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DDT_H -#define _SYS_DDT_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct abd; - -/* - * On-disk DDT formats, in the desired search order (newest version first). - */ -enum ddt_type { - DDT_TYPE_ZAP = 0, - DDT_TYPES -}; - -/* - * DDT classes, in the desired search order (highest replication level first). - */ -enum ddt_class { - DDT_CLASS_DITTO = 0, - DDT_CLASS_DUPLICATE, - DDT_CLASS_UNIQUE, - DDT_CLASSES -}; - -#define DDT_TYPE_CURRENT 0 - -#define DDT_COMPRESS_BYTEORDER_MASK 0x80 -#define DDT_COMPRESS_FUNCTION_MASK 0x7f - -/* - * On-disk ddt entry: key (name) and physical storage (value). - */ -typedef struct ddt_key { - zio_cksum_t ddk_cksum; /* 256-bit block checksum */ - /* - * Encoded with logical & physical size, and compression, as follows: - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | 0 | 0 | 0 | comp | PSIZE | LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - */ - uint64_t ddk_prop; -} ddt_key_t; - -#define DDK_GET_LSIZE(ddk) \ - BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) -#define DDK_SET_LSIZE(ddk, x) \ - BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) - -#define DDK_GET_PSIZE(ddk) \ - BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) -#define DDK_SET_PSIZE(ddk, x) \ - BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) - -#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8) -#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x) - -#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) - -typedef struct ddt_phys { - dva_t ddp_dva[SPA_DVAS_PER_BP]; - uint64_t ddp_refcnt; - uint64_t ddp_phys_birth; -} ddt_phys_t; - -enum ddt_phys_type { - DDT_PHYS_DITTO = 0, - DDT_PHYS_SINGLE = 1, - DDT_PHYS_DOUBLE = 2, - DDT_PHYS_TRIPLE = 3, - DDT_PHYS_TYPES -}; - -/* - * In-core ddt entry - */ -struct ddt_entry { - ddt_key_t dde_key; - ddt_phys_t dde_phys[DDT_PHYS_TYPES]; - zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - struct abd *dde_repair_abd; - enum ddt_type dde_type; - enum ddt_class dde_class; - uint8_t dde_loading; - uint8_t dde_loaded; - kcondvar_t dde_cv; - avl_node_t dde_node; -}; - -/* - * In-core ddt - */ -struct ddt { - kmutex_t ddt_lock; - avl_tree_t ddt_tree; - avl_tree_t ddt_repair_tree; - enum zio_checksum ddt_checksum; - spa_t *ddt_spa; - objset_t *ddt_os; - uint64_t ddt_stat_object; - uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; - ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; - ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; - ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; - avl_node_t ddt_node; -}; - -/* - * In-core and on-disk bookmark for DDT walks - */ -typedef struct ddt_bookmark { - uint64_t ddb_class; - uint64_t ddb_type; - uint64_t ddb_checksum; - uint64_t ddb_cursor; -} ddt_bookmark_t; - -/* - * Ops vector to access a specific DDT object type. 
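[Illustration] The ddk_prop diagram above packs three values into one 64-bit word: LSIZE in bits 0..15, PSIZE in bits 16..31 and the compression function in bits 32..39, with the sizes kept in 512-byte units. A simplified stand-alone version of that packing follows; the real DDK_* macros go through the BF64_* helpers and also apply a bias, which is omitted here:

#include <stdint.h>
#include <stdio.h>

#define TOY_MINBLOCKSHIFT	9	/* 512-byte sectors, as with SPA_MINBLOCKSHIFT */

static uint64_t
toy_ddk_pack(uint64_t lsize, uint64_t psize, uint64_t comp)
{
	return ((lsize >> TOY_MINBLOCKSHIFT) |
	    ((psize >> TOY_MINBLOCKSHIFT) << 16) |
	    (comp << 32));
}

static void
toy_ddk_unpack(uint64_t prop, uint64_t *lsize, uint64_t *psize, uint64_t *comp)
{
	*lsize = (prop & 0xffff) << TOY_MINBLOCKSHIFT;
	*psize = ((prop >> 16) & 0xffff) << TOY_MINBLOCKSHIFT;
	*comp = (prop >> 32) & 0xff;
}

int
main(void)
{
	uint64_t lsize, psize, comp;
	uint64_t prop = toy_ddk_pack(131072, 8192, 15);

	toy_ddk_unpack(prop, &lsize, &psize, &comp);
	printf("lsize=%ju psize=%ju comp=%ju\n",
	    (uintmax_t)lsize, (uintmax_t)psize, (uintmax_t)comp);
	/* lsize=131072 psize=8192 comp=15 */
	return (0);
}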
- */ -typedef struct ddt_ops { - char ddt_op_name[32]; - int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, - boolean_t prehash); - int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); - int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); - void (*ddt_op_prefetch)(objset_t *os, uint64_t object, - ddt_entry_t *dde); - int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, - dmu_tx_t *tx); - int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, - dmu_tx_t *tx); - int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, - uint64_t *walk); - int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); -} ddt_ops_t; - -#define DDT_NAMELEN 80 - -extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, char *name); -extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, uint64_t *walk, ddt_entry_t *dde); -extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, uint64_t *count); -extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, dmu_object_info_t *); -extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls); - -extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, - uint64_t txg); -extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, - const ddt_phys_t *ddp, blkptr_t *bp); - -extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); - -extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); -extern void ddt_phys_clear(ddt_phys_t *ddp); -extern void ddt_phys_addref(ddt_phys_t *ddp); -extern void ddt_phys_decref(ddt_phys_t *ddp); -extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, - uint64_t txg); -extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); -extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); - -extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); - -extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); -extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); -extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); -extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); -extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); -extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); - -extern uint64_t ddt_get_dedup_dspace(spa_t *spa); -extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); - -extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, - ddt_phys_t *ddp_willref); -extern int ddt_ditto_copies_present(ddt_entry_t *dde); - -extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); -extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); - -extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); -extern void ddt_enter(ddt_t *ddt); -extern void ddt_exit(ddt_t *ddt); -extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); -extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); -extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); - -extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, - const blkptr_t *bp); - -extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); -extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); - -extern int ddt_entry_compare(const void *x1, const void *x2); - -extern void 
ddt_create(spa_t *spa); -extern int ddt_load(spa_t *spa); -extern void ddt_unload(spa_t *spa); -extern void ddt_sync(spa_t *spa, uint64_t txg); -extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); -extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, - enum ddt_class cls, ddt_entry_t *dde, dmu_tx_t *tx); - -extern const ddt_ops_t ddt_zap_ops; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DDT_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ /dev/null @@ -1,1028 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright 2013 DEY Storage Systems, Inc. - * Copyright 2014 HybridCluster. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#ifndef _SYS_DMU_H -#define _SYS_DMU_H - -/* - * This file describes the interface that the DMU provides for its - * consumers. - * - * The DMU also interacts with the SPA. That interface is described in - * dmu_spa.h. - */ - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct uio; -struct xuio; -struct page; -struct vnode; -struct spa; -struct zilog; -struct zio; -struct blkptr; -struct zap_cursor; -struct dsl_dataset; -struct dsl_pool; -struct dnode; -struct drr_begin; -struct drr_end; -struct zbookmark_phys; -struct spa; -struct nvlist; -struct arc_buf; -struct zio_prop; -struct sa_handle; -struct file; -struct locked_range; - -typedef struct objset objset_t; -typedef struct dmu_tx dmu_tx_t; -typedef struct dsl_dir dsl_dir_t; -typedef struct dnode dnode_t; - -typedef enum dmu_object_byteswap { - DMU_BSWAP_UINT8, - DMU_BSWAP_UINT16, - DMU_BSWAP_UINT32, - DMU_BSWAP_UINT64, - DMU_BSWAP_ZAP, - DMU_BSWAP_DNODE, - DMU_BSWAP_OBJSET, - DMU_BSWAP_ZNODE, - DMU_BSWAP_OLDACL, - DMU_BSWAP_ACL, - /* - * Allocating a new byteswap type number makes the on-disk format - * incompatible with any other format that uses the same number. - * - * Data can usually be structured to work with one of the - * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. 
- */ - DMU_BSWAP_NUMFUNCS -} dmu_object_byteswap_t; - -#define DMU_OT_NEWTYPE 0x80 -#define DMU_OT_METADATA 0x40 -#define DMU_OT_BYTESWAP_MASK 0x3f - -/* - * Defines a uint8_t object type. Object types specify if the data - * in the object is metadata (boolean) and how to byteswap the data - * (dmu_object_byteswap_t). All of the types created by this method - * are cached in the dbuf metadata cache. - */ -#define DMU_OT(byteswap, metadata) \ - (DMU_OT_NEWTYPE | \ - ((metadata) ? DMU_OT_METADATA : 0) | \ - ((byteswap) & DMU_OT_BYTESWAP_MASK)) - -#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ - (ot) < DMU_OT_NUMTYPES) - -#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_METADATA) : \ - dmu_ot[(ot)].ot_metadata) - -#define DMU_OT_IS_DDT(ot) \ - ((ot) == DMU_OT_DDT_ZAP) - -#define DMU_OT_IS_ZIL(ot) \ - ((ot) == DMU_OT_INTENT_LOG) - -/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ -#define DMU_OT_IS_FILE(ot) \ - ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) - -#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) - -/* - * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't - * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill - * is repurposed for embedded BPs. - */ -#define DMU_OT_HAS_FILL(ot) \ - ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) - -#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_BYTESWAP_MASK) : \ - dmu_ot[(ot)].ot_byteswap) - -typedef enum dmu_object_type { - DMU_OT_NONE, - /* general: */ - DMU_OT_OBJECT_DIRECTORY, /* ZAP */ - DMU_OT_OBJECT_ARRAY, /* UINT64 */ - DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ - DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ - DMU_OT_BPOBJ, /* UINT64 */ - DMU_OT_BPOBJ_HDR, /* UINT64 */ - /* spa: */ - DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ - DMU_OT_SPACE_MAP, /* UINT64 */ - /* zil: */ - DMU_OT_INTENT_LOG, /* UINT64 */ - /* dmu: */ - DMU_OT_DNODE, /* DNODE */ - DMU_OT_OBJSET, /* OBJSET */ - /* dsl: */ - DMU_OT_DSL_DIR, /* UINT64 */ - DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ - DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ - DMU_OT_DSL_PROPS, /* ZAP */ - DMU_OT_DSL_DATASET, /* UINT64 */ - /* zpl: */ - DMU_OT_ZNODE, /* ZNODE */ - DMU_OT_OLDACL, /* Old ACL */ - DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ - DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ - DMU_OT_MASTER_NODE, /* ZAP */ - DMU_OT_UNLINKED_SET, /* ZAP */ - /* zvol: */ - DMU_OT_ZVOL, /* UINT8 */ - DMU_OT_ZVOL_PROP, /* ZAP */ - /* other; for testing only! 
*/ - DMU_OT_PLAIN_OTHER, /* UINT8 */ - DMU_OT_UINT64_OTHER, /* UINT64 */ - DMU_OT_ZAP_OTHER, /* ZAP */ - /* new object types: */ - DMU_OT_ERROR_LOG, /* ZAP */ - DMU_OT_SPA_HISTORY, /* UINT8 */ - DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ - DMU_OT_POOL_PROPS, /* ZAP */ - DMU_OT_DSL_PERMS, /* ZAP */ - DMU_OT_ACL, /* ACL */ - DMU_OT_SYSACL, /* SYSACL */ - DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ - DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ - DMU_OT_NEXT_CLONES, /* ZAP */ - DMU_OT_SCAN_QUEUE, /* ZAP */ - DMU_OT_USERGROUP_USED, /* ZAP */ - DMU_OT_USERGROUP_QUOTA, /* ZAP */ - DMU_OT_USERREFS, /* ZAP */ - DMU_OT_DDT_ZAP, /* ZAP */ - DMU_OT_DDT_STATS, /* ZAP */ - DMU_OT_SA, /* System attr */ - DMU_OT_SA_MASTER_NODE, /* ZAP */ - DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ - DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ - DMU_OT_SCAN_XLATE, /* ZAP */ - DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ - DMU_OT_DEADLIST, /* ZAP */ - DMU_OT_DEADLIST_HDR, /* UINT64 */ - DMU_OT_DSL_CLONES, /* ZAP */ - DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ - /* - * Do not allocate new object types here. Doing so makes the on-disk - * format incompatible with any other format that uses the same object - * type number. - * - * When creating an object which does not have one of the above types - * use the DMU_OTN_* type with the correct byteswap and metadata - * values. - * - * The DMU_OTN_* types do not have entries in the dmu_ot table, - * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead - * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead - * of indexing into dmu_ot directly (this works for both DMU_OT_* types - * and DMU_OTN_* types). - */ - DMU_OT_NUMTYPES, - - /* - * Names for valid types declared with DMU_OT(). - */ - DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), - DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), - DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), - DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), - DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), - DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), - DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), - DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), - DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), - DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), -} dmu_object_type_t; - -/* - * These flags are intended to be used to specify the "txg_how" - * parameter when calling the dmu_tx_assign() function. See the comment - * above dmu_tx_assign() for more details on the meaning of these flags. - */ -#define TXG_NOWAIT (0ULL) -#define TXG_WAIT (1ULL<<0) -#define TXG_NOTHROTTLE (1ULL<<1) - -void byteswap_uint64_array(void *buf, size_t size); -void byteswap_uint32_array(void *buf, size_t size); -void byteswap_uint16_array(void *buf, size_t size); -void byteswap_uint8_array(void *buf, size_t size); -void zap_byteswap(void *buf, size_t size); -void zfs_oldacl_byteswap(void *buf, size_t size); -void zfs_acl_byteswap(void *buf, size_t size); -void zfs_znode_byteswap(void *buf, size_t size); - -#define DS_FIND_SNAPSHOTS (1<<0) -#define DS_FIND_CHILDREN (1<<1) -#define DS_FIND_SERIALIZE (1<<2) - -/* - * The maximum number of bytes that can be accessed as part of one - * operation, including metadata. 
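[Illustration] The DMU_OT() encoding shown above makes a "new style" object type out of the 0x80 flag, an optional 0x40 metadata bit and a 6-bit byteswap index, so no entry in the central dmu_ot table is needed. The constants below are copied from the hunk; the check program and its TOY_* names are only illustrative, and the simplified TOY_OT_BYTESWAP() skips the table lookup the real macro does for old types:

#include <stdio.h>

#define TOY_OT_NEWTYPE		0x80
#define TOY_OT_METADATA		0x40
#define TOY_OT_BYTESWAP_MASK	0x3f

#define TOY_OT(byteswap, metadata) \
	(TOY_OT_NEWTYPE | ((metadata) ? TOY_OT_METADATA : 0) | \
	    ((byteswap) & TOY_OT_BYTESWAP_MASK))

#define TOY_OT_IS_METADATA(ot)	(((ot) & TOY_OT_NEWTYPE) ? \
	((ot) & TOY_OT_METADATA) : 0)
#define TOY_OT_BYTESWAP(ot)	((ot) & TOY_OT_BYTESWAP_MASK)

enum { TOY_BSWAP_UINT8, TOY_BSWAP_UINT16, TOY_BSWAP_UINT32, TOY_BSWAP_UINT64 };

int
main(void)
{
	int ot = TOY_OT(TOY_BSWAP_UINT64, 1);	/* like DMU_OTN_UINT64_METADATA */

	printf("ot=0x%02x metadata=%d byteswap=%d\n",
	    ot, TOY_OT_IS_METADATA(ot) != 0, TOY_OT_BYTESWAP(ot));
	/* ot=0xc3 metadata=1 byteswap=3 */
	return (0);
}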
- */ -#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */ -#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ - -#define DMU_USERUSED_OBJECT (-1ULL) -#define DMU_GROUPUSED_OBJECT (-2ULL) - -/* - * artificial blkids for bonus buffer and spill blocks - */ -#define DMU_BONUS_BLKID (-1ULL) -#define DMU_SPILL_BLKID (-2ULL) -/* - * Public routines to create, destroy, open, and close objsets. - */ -int dmu_objset_hold(const char *name, void *tag, objset_t **osp); -int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp); -void dmu_objset_rele(objset_t *os, void *tag); -void dmu_objset_disown(objset_t *os, void *tag); -int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); - -void dmu_objset_evict_dbufs(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, - struct nvlist *snaps); -int dmu_objset_clone(const char *name, const char *origin); -int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, - struct nvlist *errlist); -int dmu_objset_snapshot_one(const char *fsname, const char *snapname); -int dmu_objset_snapshot_tmp(const char *, const char *, int); -int dmu_objset_find(char *name, int func(const char *, void *), void *arg, - int flags); -void dmu_objset_byteswap(void *buf, size_t size); -int dsl_dataset_rename_snapshot(const char *fsname, - const char *oldsnapname, const char *newsnapname, boolean_t recursive); -int dmu_objset_remap_indirects(const char *fsname); - -typedef struct dmu_buf { - uint64_t db_object; /* object that this buffer is part of */ - uint64_t db_offset; /* byte offset in this object */ - uint64_t db_size; /* size of buffer in bytes */ - void *db_data; /* data in buffer */ -} dmu_buf_t; - -/* - * The names of zap entries in the DIRECTORY_OBJECT of the MOS. - */ -#define DMU_POOL_DIRECTORY_OBJECT 1 -#define DMU_POOL_CONFIG "config" -#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" -#define DMU_POOL_FEATURES_FOR_READ "features_for_read" -#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" -#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" -#define DMU_POOL_ROOT_DATASET "root_dataset" -#define DMU_POOL_SYNC_BPOBJ "sync_bplist" -#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" -#define DMU_POOL_ERRLOG_LAST "errlog_last" -#define DMU_POOL_SPARES "spares" -#define DMU_POOL_DEFLATE "deflate" -#define DMU_POOL_HISTORY "history" -#define DMU_POOL_PROPS "pool_props" -#define DMU_POOL_L2CACHE "l2cache" -#define DMU_POOL_TMP_USERREFS "tmp_userrefs" -#define DMU_POOL_DDT "DDT-%s-%s-%s" -#define DMU_POOL_DDT_STATS "DDT-statistics" -#define DMU_POOL_CREATION_VERSION "creation_version" -#define DMU_POOL_SCAN "scan" -#define DMU_POOL_FREE_BPOBJ "free_bpobj" -#define DMU_POOL_BPTREE_OBJ "bptree_obj" -#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" -#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" -#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" -#define DMU_POOL_REMOVING "com.delphix:removing" -#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" -#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" -#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" - -/* - * Allocate an object from this objset. The range of object numbers - * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. - * - * The transaction must be assigned to a txg. 
The newly allocated - * object will be "held" in the transaction (ie. you can modify the - * newly allocated object in this transaction). - * - * dmu_object_alloc() chooses an object and returns it in *objectp. - * - * dmu_object_claim() allocates a specific object number. If that - * number is already allocated, it fails and returns EEXIST. - * - * Return 0 on success, or ENOSPC or EEXIST as specified above. - */ -uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); -uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, - int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, - int dnodesize, dmu_tx_t *tx); -int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, - int dnodesize, dmu_tx_t *tx); -int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, - dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, - int bonuslen, int dnodesize, dmu_tx_t *txp); -int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); -int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); - -/* - * Free an object from this objset. - * - * The object's data will be freed as well (ie. you don't need to call - * dmu_free(object, 0, -1, tx)). - * - * The object need not be held in the transaction. - * - * If there are any holds on this object's buffers (via dmu_buf_hold()), - * or tx holds on the object (via dmu_tx_hold_object()), you can not - * free it; it fails and returns EBUSY. - * - * If the object is not allocated, it fails and returns ENOENT. - * - * Return 0 on success, or EBUSY or ENOENT as specified above. - */ -int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); - -/* - * Find the next allocated or free object. - * - * The objectp parameter is in-out. It will be updated to be the next - * object which is allocated. Ignore objects which have not been - * modified since txg. - * - * XXX Can only be called on a objset with no dirty data. - * - * Returns 0 on success, or ENOENT if there are no more objects. - */ -int dmu_object_next(objset_t *os, uint64_t *objectp, - boolean_t hole, uint64_t txg); - -/* - * Set the data blocksize for an object. - * - * The object cannot have any blocks allcated beyond the first. If - * the first block is allocated already, the new size must be greater - * than the current block size. If these conditions are not met, - * ENOTSUP will be returned. - * - * Returns 0 on success, or EBUSY if there are any holds on the object - * contents, or ENOTSUP as described above. - */ -int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, - int ibs, dmu_tx_t *tx); - -/* - * Set the checksum property on a dnode. The new checksum algorithm will - * apply to all newly written blocks; existing blocks will not be affected. - */ -void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, - dmu_tx_t *tx); - -/* - * Set the compress property on a dnode. The new compression algorithm will - * apply to all newly written blocks; existing blocks will not be affected. 
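The allocation routines above follow the transaction rules spelled out later in this header: the tx must be created and assigned before dmu_object_alloc() is called. The following is a minimal sketch, not part of this change, of allocating a plain object with no bonus data; it assumes a kernel or libzpool context where this header is included, the DMU_OT_* object-type constants declared earlier in the header, and TXG_WAIT from the txg code. DMU_NEW_OBJECT is defined further down in this file.

/*
 * Illustrative only: allocate a new, empty object in objset 'os'.
 */
static int
example_alloc_object(objset_t *os, uint64_t *objp)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	/* Reserve room in this tx for a brand-new object. */
	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	/* blocksize 0 lets the DMU choose; no bonus buffer is requested. */
	*objp = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
	    DMU_OT_NONE, 0, tx);
	dmu_tx_commit(tx);
	return (0);
}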
- */ -void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, - dmu_tx_t *tx); - -int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg); - -void -dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, - void *data, uint8_t etype, uint8_t comp, int uncompressed_size, - int compressed_size, int byteorder, dmu_tx_t *tx); - -/* - * Decide how to write a block: checksum, compression, number of copies, etc. - */ -#define WP_NOFILL 0x1 -#define WP_DMU_SYNC 0x2 -#define WP_SPILL 0x4 - -void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, - struct zio_prop *zp); -/* - * The bonus data is accessed more or less like a regular buffer. - * You must dmu_bonus_hold() to get the buffer, which will give you a - * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus - * data. As with any normal buffer, you must call dmu_buf_will_dirty() - * before modifying it, and the - * object must be held in an assigned transaction before calling - * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus - * buffer as well. You must release your hold with dmu_buf_rele(). - * - * Returns ENOENT, EIO, or 0. - */ -int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); -int dmu_bonus_max(void); -int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); -int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); -dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); -int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); - -/* - * Special spill buffer support used by "SA" framework - */ - -int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); -int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, - void *tag, dmu_buf_t **dbp); -int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); - -/* - * Obtain the DMU buffer from the specified object which contains the - * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so - * that it will remain in memory. You must release the hold with - * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your - * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. - * - * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill - * on the returned buffer before reading or writing the buffer's - * db_data. The comments for those routines describe what particular - * operations are valid after calling them. - * - * The object number must be a valid, allocated object number. - */ -int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **, int flags); -int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags); - -/* - * Add a reference to a dmu buffer that has already been held via - * dmu_buf_hold() in the current context. - */ -void dmu_buf_add_ref(dmu_buf_t *db, void* tag); - -/* - * Attempt to add a reference to a dmu buffer that is in an unknown state, - * using a pointer that may have been invalidated by eviction processing. - * The request will succeed if the passed in dbuf still represents the - * same os/object/blkid, is ineligible for eviction, and has at least - * one hold by a user other than the syncer. - */ -boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, - uint64_t blkid, void *tag); - -void dmu_buf_rele(dmu_buf_t *db, void *tag); -uint64_t dmu_buf_refcount(dmu_buf_t *db); - -/* - * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a - * range of an object. 
A pointer to an array of dmu_buf_t*'s is - * returned (in *dbpp). - * - * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and - * frees the array. The hold on the array of buffers MUST be released - * with dmu_buf_rele_array. You can NOT release the hold on each buffer - * individually with dmu_buf_rele. - */ -int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, - uint64_t length, boolean_t read, void *tag, - int *numbufsp, dmu_buf_t ***dbpp); -int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, - uint32_t flags); -void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); - -typedef void dmu_buf_evict_func_t(void *user_ptr); - -/* - * A DMU buffer user object may be associated with a dbuf for the - * duration of its lifetime. This allows the user of a dbuf (client) - * to attach private data to a dbuf (e.g. in-core only data such as a - * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified - * when that dbuf has been evicted. Clients typically respond to the - * eviction notification by freeing their private data, thus ensuring - * the same lifetime for both dbuf and private data. - * - * The mapping from a dmu_buf_user_t to any client private data is the - * client's responsibility. All current consumers of the API with private - * data embed a dmu_buf_user_t as the first member of the structure for - * their private data. This allows conversions between the two types - * with a simple cast. Since the DMU buf user API never needs access - * to the private data, other strategies can be employed if necessary - * or convenient for the client (e.g. using container_of() to do the - * conversion for private data that cannot have the dmu_buf_user_t as - * its first member). - * - * Eviction callbacks are executed without the dbuf mutex held or any - * other type of mechanism to guarantee that the dbuf is still available. - * For this reason, users must assume the dbuf has already been freed - * and not reference the dbuf from the callback context. - * - * Users requesting "immediate eviction" are notified as soon as the dbuf - * is only referenced by dirty records (dirties == holds). Otherwise the - * notification occurs after eviction processing for the dbuf begins. - */ -typedef struct dmu_buf_user { - /* - * Asynchronous user eviction callback state. - */ - taskq_ent_t dbu_tqent; - - /* - * This instance's eviction function pointers. - * - * dbu_evict_func_sync is called synchronously and then - * dbu_evict_func_async is executed asynchronously on a taskq. - */ - dmu_buf_evict_func_t *dbu_evict_func_sync; - dmu_buf_evict_func_t *dbu_evict_func_async; -#ifdef ZFS_DEBUG - /* - * Pointer to user's dbuf pointer. NULL for clients that do - * not associate a dbuf with their user data. - * - * The dbuf pointer is cleared upon eviction so as to catch - * use-after-evict bugs in clients. - */ - dmu_buf_t **dbu_clear_on_evict_dbufp; -#endif -} dmu_buf_user_t; - -/* - * Initialize the given dmu_buf_user_t instance with the eviction function - * evict_func, to be called when the user is evicted. - * - * NOTE: This function should only be called once on a given dmu_buf_user_t. - * To allow enforcement of this, dbu must already be zeroed on entry. 
- */ -/*ARGSUSED*/ -inline void -dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, - dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp) -{ - ASSERT(dbu->dbu_evict_func_sync == NULL); - ASSERT(dbu->dbu_evict_func_async == NULL); - - /* must have at least one evict func */ - IMPLY(evict_func_sync == NULL, evict_func_async != NULL); - dbu->dbu_evict_func_sync = evict_func_sync; - dbu->dbu_evict_func_async = evict_func_async; -#ifdef ZFS_DEBUG - dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; -#endif -} - -/* - * Attach user data to a dbuf and mark it for normal (when the dbuf's - * data is cleared or its reference count goes to zero) eviction processing. - * - * Returns NULL on success, or the existing user if another user currently - * owns the buffer. - */ -void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); - -/* - * Attach user data to a dbuf and mark it for immediate (its dirty and - * reference counts are equal) eviction processing. - * - * Returns NULL on success, or the existing user if another user currently - * owns the buffer. - */ -void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); - -/* - * Replace the current user of a dbuf. - * - * If given the current user of a dbuf, replaces the dbuf's user with - * "new_user" and returns the user data pointer that was replaced. - * Otherwise returns the current, and unmodified, dbuf user pointer. - */ -void *dmu_buf_replace_user(dmu_buf_t *db, - dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); - -/* - * Remove the specified user data for a DMU buffer. - * - * Returns the user that was removed on success, or the current user if - * another user currently owns the buffer. - */ -void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); - -/* - * Returns the user data (dmu_buf_user_t *) associated with this dbuf. - */ -void *dmu_buf_get_user(dmu_buf_t *db); - -objset_t *dmu_buf_get_objset(dmu_buf_t *db); -dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); -void dmu_buf_dnode_exit(dmu_buf_t *db); - -/* Block until any in-progress dmu buf user evictions complete. */ -void dmu_buf_user_evict_wait(void); - -/* - * Returns the blkptr associated with this dbuf, or NULL if not set. - */ -struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); - -/* - * Indicate that you are going to modify the buffer's data (db_data). - * - * The transaction (tx) must be assigned to a txg (ie. you've called - * dmu_tx_assign()). The buffer's object must be held in the tx - * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). - */ -void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); - -/* - * You must create a transaction, then hold the objects which you will - * (or might) modify as part of this transaction. Then you must assign - * the transaction to a transaction group. Once the transaction has - * been assigned, you can modify buffers which belong to held objects as - * part of this transaction. You can't modify buffers before the - * transaction has been assigned; you can't modify buffers which don't - * belong to objects which this transaction holds; you can't hold - * objects once the transaction has been assigned. You may hold an - * object which you are going to free (with dmu_object_free()), but you - * don't have to. - * - * You can abort the transaction before it has been assigned. - * - * Note that you may hold buffers (with dmu_buf_hold) at any time, - * regardless of transaction state. 
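The dmu_buf_user_t machinery above is easiest to see with a concrete consumer. Below is a minimal sketch, not part of this change, of a client that embeds a dmu_buf_user_t as the first member of its private state (as the comment recommends) and registers only a synchronous eviction callback. The names my_state_t, my_tag, my_attach() and my_evict_sync() are illustrative; the sketch assumes a kernel-style context providing kmem_zalloc()/kmem_free().

/* Hypothetical client state; the dmu_buf_user_t comes first so the
 * eviction callback can recover it with a simple cast. */
typedef struct my_state {
	dmu_buf_user_t	ms_dbu;
	dmu_buf_t	*ms_db;
	uint64_t	ms_cached;
} my_state_t;

static char my_tag;

static void
my_evict_sync(void *arg)
{
	my_state_t *ms = arg;

	/* Per the comment above, the dbuf may already be gone here. */
	kmem_free(ms, sizeof (*ms));
}

static int
my_attach(objset_t *os, uint64_t object, my_state_t **msp)
{
	my_state_t *ms = kmem_zalloc(sizeof (*ms), KM_SLEEP);
	int err;

	err = dmu_buf_hold(os, object, 0, &my_tag, &ms->ms_db, 0);
	if (err != 0) {
		kmem_free(ms, sizeof (*ms));
		return (err);
	}
	/* dbu is zeroed (kmem_zalloc), as dmu_buf_init_user() requires. */
	dmu_buf_init_user(&ms->ms_dbu, my_evict_sync, NULL, &ms->ms_db);
	if (dmu_buf_set_user(ms->ms_db, &ms->ms_dbu) != NULL) {
		/* Lost the race: another user already owns this dbuf. */
		dmu_buf_rele(ms->ms_db, &my_tag);
		kmem_free(ms, sizeof (*ms));
		return (EEXIST);
	}
	*msp = ms;	/* caller later drops the hold with dmu_buf_rele() */
	return (0);
}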
- */ - -#define DMU_NEW_OBJECT (-1ULL) -#define DMU_OBJECT_END (-1ULL) - -dmu_tx_t *dmu_tx_create(objset_t *os); -void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); -void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, - int len); -void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, - uint64_t len); -void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, - uint64_t len); -void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object); -void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); -void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, - const char *name); -void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); -void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); -void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); -void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); -void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); -void dmu_tx_abort(dmu_tx_t *tx); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -void dmu_tx_wait(dmu_tx_t *tx); -void dmu_tx_commit(dmu_tx_t *tx); -void dmu_tx_mark_netfree(dmu_tx_t *tx); - -/* - * To register a commit callback, dmu_tx_callback_register() must be called. - * - * dcb_data is a pointer to caller private data that is passed on as a - * callback parameter. The caller is responsible for properly allocating and - * freeing it. - * - * When registering a callback, the transaction must be already created, but - * it cannot be committed or aborted. It can be assigned to a txg or not. - * - * The callback will be called after the transaction has been safely written - * to stable storage and will also be called if the dmu_tx is aborted. - * If there is any error which prevents the transaction from being committed to - * disk, the callback will be called with a value of error != 0. - */ -typedef void dmu_tx_callback_func_t(void *dcb_data, int error); - -void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, - void *dcb_data); - -/* - * Free up the data blocks for a defined range of a file. If size is - * -1, the range from offset to end-of-file is freed. - */ -int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, dmu_tx_t *tx); -int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size); -int dmu_free_long_object(objset_t *os, uint64_t object); - -/* - * Convenience functions. - * - * Canfail routines will return 0 on success, or an errno if there is a - * nonrecoverable I/O error. 
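The transaction rules above (create, hold, assign, modify, commit) reduce to a short, common call sequence. The sketch below is illustrative, not part of this change; it assumes TXG_WAIT from the txg code, uses dmu_write() as declared just below in this header, and example_put()/example_write_done() are hypothetical names.

/* Runs once the txg holding this change is on stable storage,
 * or with error != 0 if the tx could not be committed. */
static void
example_write_done(void *arg, int error)
{
	kmem_free(arg, sizeof (uint64_t));
}

static int
example_put(objset_t *os, uint64_t object, uint64_t off, int len,
    const void *buf)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	uint64_t *txgp;
	int err;

	/* Declare up front which range this tx intends to dirty. */
	dmu_tx_hold_write(tx, object, off, len);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);	/* not committed, so abort */
		return (err);
	}

	dmu_write(os, object, off, len, buf, tx);

	/* Optional: learn when the change actually reaches disk. */
	txgp = kmem_zalloc(sizeof (*txgp), KM_SLEEP);
	*txgp = dmu_tx_get_txg(tx);
	dmu_tx_callback_register(tx, example_write_done, txgp);

	dmu_tx_commit(tx);
	return (0);
}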
- */ -#define DMU_READ_PREFETCH 0 /* prefetch */ -#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ -int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags); -int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, - uint32_t flags); -void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); -void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); -void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx); -int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); -int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); -int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); -int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, - dmu_tx_t *tx); -int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, - dmu_tx_t *tx); -int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, - dmu_tx_t *tx); -#ifdef _KERNEL -#ifdef illumos -int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, struct page *pp, dmu_tx_t *tx); -#else -int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); -int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, - int *rbehind, int *rahead, int last_size); -#endif -#endif -struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); -void dmu_return_arcbuf(struct arc_buf *buf); -void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset, - struct arc_buf *buf, dmu_tx_t *tx); -void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, - dmu_tx_t *tx); -int dmu_xuio_init(struct xuio *uio, int niov); -void dmu_xuio_fini(struct xuio *uio); -int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, - size_t n); -int dmu_xuio_cnt(struct xuio *uio); -struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); -void dmu_xuio_clear(struct xuio *uio, int i); -void xuio_stat_wbuf_copied(void); -void xuio_stat_wbuf_nocopy(void); - -extern boolean_t zfs_prefetch_disable; -extern int zfs_max_recordsize; - -/* - * Asynchronously try to read in the data. - */ -void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, - uint64_t len, enum zio_priority pri); - -typedef struct dmu_object_info { - /* All sizes are in bytes unless otherwise indicated. 
*/ - uint32_t doi_data_block_size; - uint32_t doi_metadata_block_size; - dmu_object_type_t doi_type; - dmu_object_type_t doi_bonus_type; - uint64_t doi_bonus_size; - uint8_t doi_indirection; /* 2 = dnode->indirect->data */ - uint8_t doi_checksum; - uint8_t doi_compress; - uint8_t doi_nblkptr; - int8_t doi_pad[4]; - uint64_t doi_dnodesize; - uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ - uint64_t doi_max_offset; - uint64_t doi_fill_count; /* number of non-empty blocks */ -} dmu_object_info_t; - -typedef void arc_byteswap_func_t(void *buf, size_t size); - -typedef struct dmu_object_type_info { - dmu_object_byteswap_t ot_byteswap; - boolean_t ot_metadata; - boolean_t ot_dbuf_metadata_cache; - char *ot_name; -} dmu_object_type_info_t; - -typedef struct dmu_object_byteswap_info { - arc_byteswap_func_t *ob_func; - char *ob_name; -} dmu_object_byteswap_info_t; - -extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; -extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; - -/* - * Get information on a DMU object. - * - * Return 0 on success or ENOENT if object is not allocated. - * - * If doi is NULL, just indicates whether the object exists. - */ -int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); -void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); -/* Like dmu_object_info, but faster if you have a held dnode in hand. */ -void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); -/* Like dmu_object_info, but faster if you have a held dbuf in hand. */ -void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); -/* - * Like dmu_object_info_from_db, but faster still when you only care about - * the size. This is specifically optimized for zfs_getattr(). - */ -void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, - u_longlong_t *nblk512); - -void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); - -typedef struct dmu_objset_stats { - uint64_t dds_num_clones; /* number of clones of this */ - uint64_t dds_creation_txg; - uint64_t dds_guid; - dmu_objset_type_t dds_type; - uint8_t dds_is_snapshot; - uint8_t dds_inconsistent; - char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; -} dmu_objset_stats_t; - -/* - * Get stats on a dataset. - */ -void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); - -/* - * Add entries to the nvlist for all the objset's properties. See - * zfs_prop_table[] and zfs(1m) for details on the properties. - */ -void dmu_objset_stats(objset_t *os, struct nvlist *nv); - -/* - * Get the space usage statistics for statvfs(). - * - * refdbytes is the amount of space "referenced" by this objset. - * availbytes is the amount of space available to this objset, taking - * into account quotas & reservations, assuming that no other objsets - * use the space first. These values correspond to the 'referenced' and - * 'available' properties, described in the zfs(1m) manpage. - * - * usedobjs and availobjs are the number of objects currently allocated, - * and available. - */ -void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp); - -/* - * The fsid_guid is a 56-bit ID that can change to avoid collisions. - * (Contrast with the ds_guid which is a 64-bit ID that will never - * change, so there is a small probability that it will collide.) 
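A short usage sketch for dmu_object_info() and the dmu_object_info_t fields above (illustrative, not part of this change; printf() stands in for whatever reporting the caller actually does).

static int
example_describe_object(objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err != 0)
		return (err);	/* ENOENT if the object is not allocated */

	printf("type %d: %u-byte data blocks, %llu bonus bytes, "
	    "%llu filled blocks\n", (int)doi.doi_type,
	    doi.doi_data_block_size, (u_longlong_t)doi.doi_bonus_size,
	    (u_longlong_t)doi.doi_fill_count);
	return (0);
}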
- */ -uint64_t dmu_objset_fsid_guid(objset_t *os); - -/* - * Get the [cm]time for an objset's snapshot dir - */ -timestruc_t dmu_objset_snap_cmtime(objset_t *os); - -int dmu_objset_is_snapshot(objset_t *os); - -extern struct spa *dmu_objset_spa(objset_t *os); -extern struct zilog *dmu_objset_zil(objset_t *os); -extern struct dsl_pool *dmu_objset_pool(objset_t *os); -extern struct dsl_dataset *dmu_objset_ds(objset_t *os); -extern void dmu_objset_name(objset_t *os, char *buf); -extern dmu_objset_type_t dmu_objset_type(objset_t *os); -extern uint64_t dmu_objset_id(objset_t *os); -extern uint64_t dmu_objset_dnodesize(objset_t *os); -extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); -extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); -extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - uint64_t *id, uint64_t *offp, boolean_t *case_conflict); -extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, - int maxlen, boolean_t *conflict); -extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, - uint64_t *idp, uint64_t *offp); - -typedef int objset_used_cb_t(dmu_object_type_t bonustype, - void *bonus, uint64_t *userp, uint64_t *groupp); -extern void dmu_objset_register_type(dmu_objset_type_t ost, - objset_used_cb_t *cb); -extern void dmu_objset_set_user(objset_t *os, void *user_ptr); -extern void *dmu_objset_get_user(objset_t *os); - -/* - * Return the txg number for the given assigned transaction. - */ -uint64_t dmu_tx_get_txg(dmu_tx_t *tx); - -/* - * Synchronous write. - * If a parent zio is provided this function initiates a write on the - * provided buffer as a child of the parent zio. - * In the absence of a parent zio, the write is completed synchronously. - * At write completion, blk is filled with the bp of the written block. - * Note that while the data covered by this function will be on stable - * storage when the write completes this new data does not become a - * permanent part of the file until the associated transaction commits. - */ - -/* - * {zfs,zvol,ztest}_get_done() args - */ -typedef struct zgd { - struct lwb *zgd_lwb; - struct blkptr *zgd_bp; - dmu_buf_t *zgd_db; - struct locked_range *zgd_lr; - void *zgd_private; -} zgd_t; - -typedef void dmu_sync_cb_t(zgd_t *arg, int error); -int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); - -/* - * Find the next hole or data block in file starting at *off - * Return found offset in *off. Return ESRCH for end of file. - */ -int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, - uint64_t *off); - -/* - * Check if a DMU object has any dirty blocks. If so, sync out - * all pending transaction groups. Otherwise, this function - * does not alter DMU state. This could be improved to only sync - * out the necessary transaction groups for this particular - * object. - */ -int dmu_object_wait_synced(objset_t *os, uint64_t object); - -/* - * Initial setup and final teardown. 
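dmu_offset_next() above is the primitive behind SEEK_HOLE/SEEK_DATA-style lookups. The sketch below, illustrative and not part of this change, walks the data extents of an object by alternating data and hole searches; per the comment above, ESRCH means nothing further was found before end of file.

static int
example_dump_extents(objset_t *os, uint64_t object)
{
	uint64_t off = 0;
	int err;

	for (;;) {
		/* Next region that actually contains data. */
		err = dmu_offset_next(os, object, B_FALSE, &off);
		if (err == ESRCH)
			return (0);		/* no more data before EOF */
		if (err != 0)
			return (err);
		printf("data starts at %llu\n", (u_longlong_t)off);

		/* The hole that terminates this data region. */
		err = dmu_offset_next(os, object, B_TRUE, &off);
		if (err == ESRCH)
			return (0);		/* data runs to EOF */
		if (err != 0)
			return (err);
		printf("  next hole at %llu\n", (u_longlong_t)off);
	}
}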
- */ -extern void dmu_init(void); -extern void dmu_fini(void); - -typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, - uint64_t object, uint64_t offset, int len); -void dmu_traverse_objset(objset_t *os, uint64_t txg_start, - dmu_traverse_cb_t cb, void *arg); -int dmu_diff(const char *tosnap_name, const char *fromsnap_name, - struct file *fp, offset_t *offp); - -/* CRC64 table */ -#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ -extern uint64_t zfs_crc64_table[256]; - -extern int zfs_mdcomp_disable; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ /dev/null @@ -1,315 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2012, Martin Matuska . All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DMU_IMPL_H -#define _SYS_DMU_IMPL_H - -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This is the locking strategy for the DMU. Numbers in parenthesis are - * cases that use that lock order, referenced below: - * - * ARC is self-contained - * bplist is self-contained - * refcount is self-contained - * txg is self-contained (hopefully!) - * zst_lock - * zf_rwlock - * - * XXX try to improve evicting path? - * - * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > - * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs - * - * dp_config_rwlock - * must be held before: everything - * protects dd namespace changes - * protects property changes globally - * held from: - * dsl_dir_open/r: - * dsl_dir_create_sync/w: - * dsl_dir_sync_destroy/w: - * dsl_dir_rename_sync/w: - * dsl_prop_changed_notify/r: - * - * os_obj_lock - * must be held before: - * everything except dp_config_rwlock - * protects os_obj_next - * held from: - * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock - * - * dn_struct_rwlock - * must be held before: - * everything except dp_config_rwlock and os_obj_lock - * protects structure of dnode (eg. nlevels) - * db_blkptr can change when syncing out change to nlevels - * dn_maxblkid - * dn_nlevels - * dn_*blksz* - * phys nlevels, maxblkid, physical blkptr_t's (?) 
- * held from: - * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch - * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) - * dbuf_read_impl: db_mtx, dmu_zfetch() - * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() - * dbuf_new_size: db_mtx - * dbuf_dirty: db_mtx - * dbuf_findbp: (callers, phys? - the real need) - * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) - * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx - * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() - * dnode_sync/w (increase_indirection): db_mtx (phys) - * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) - * dnode_new_blkid/w: (dn_maxblkid) - * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) - * dnode_next_offset: (phys) - * - * dn_dbufs_mtx - * must be held before: - * db_mtx, hash_mutexes - * protects: - * dn_dbufs - * dn_evicted - * held from: - * dmu_evict_user: db_mtx (dn_dbufs) - * dbuf_free_range: db_mtx (dn_dbufs) - * dbuf_remove_ref: db_mtx, callees: - * dbuf_hash_remove: hash_mutexes, db_mtx - * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) - * dnode_set_blksz: (dn_dbufs) - * - * hash_mutexes (global) - * must be held before: - * db_mtx - * protects dbuf_hash_table (global) and db_hash_next - * held from: - * dbuf_find: db_mtx - * dbuf_hash_insert: db_mtx - * dbuf_hash_remove: db_mtx - * - * db_mtx (meta-leaf) - * must be held before: - * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) - * protects: - * db_state - * db_holds - * db_buf - * db_changed - * db_data_pending - * db_dirtied - * db_link - * db_dirty_node (??) - * db_dirtycnt - * db_d.* - * db.* - * held from: - * dbuf_dirty: dn_mtx, dn_dirty_mtx - * dbuf_dirty->dsl_dir_willuse_space: dd_lock - * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock - * dbuf_undirty: dn_dirty_mtx (db_d) - * dbuf_write_done: dn_dirty_mtx (db_state) - * dbuf_* - * dmu_buf_update_user: none (db_d) - * dmu_evict_user: none (db_d) (maybe can eliminate) - * dbuf_find: none (db_holds) - * dbuf_hash_insert: none (db_holds) - * dmu_buf_read_array_impl: none (db_state, db_changed) - * dmu_sync: none (db_dirty_node, db_d) - * dnode_reallocate: none (db) - * - * dn_mtx (leaf) - * protects: - * dn_dirty_dbufs - * dn_ranges - * phys accounting - * dn_allocated_txg - * dn_free_txg - * dn_assigned_txg - * dn_dirty_txg - * dn_notxholds - * dn_dirtyctx - * dn_dirtyctx_firstset - * (dn_phys copy fields?) - * (dn_phys contents?) - * held from: - * dnode_* - * dbuf_dirty: none - * dbuf_sync: none (phys accounting) - * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) - * dbuf_write_done: none (phys accounting) - * dmu_object_info_from_dnode: none (accounting) - * dmu_tx_commit: none - * dmu_tx_hold_object_impl: none - * dmu_tx_try_assign: dn_notxholds(cv) - * dmu_tx_unassign: none - * - * dd_lock - * must be held before: - * ds_lock - * ancestors' dd_lock - * protects: - * dd_prop_cbs - * dd_sync_* - * dd_used_bytes - * dd_tempreserved - * dd_space_towrite - * dd_myname - * dd_phys accounting? 
- * held from: - * dsl_dir_* - * dsl_prop_changed_notify: none (dd_prop_cbs) - * dsl_prop_register: none (dd_prop_cbs) - * dsl_prop_unregister: none (dd_prop_cbs) - * - * os_lock (leaf) - * protects: - * os_dirty_dnodes - * os_free_dnodes - * os_dnodes - * os_downgraded_dbufs - * dn_dirtyblksz - * dn_dirty_link - * held from: - * dnode_create: none (os_dnodes) - * dnode_destroy: none (os_dnodes) - * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) - * dnode_free: none (dn_dirtyblksz, os_*_dnodes) - * - * ds_lock - * protects: - * ds_objset - * ds_open_refcount - * ds_snapname - * ds_phys accounting - * ds_phys userrefs zapobj - * ds_reserved - * held from: - * dsl_dataset_* - * - * dr_mtx (leaf) - * protects: - * dr_children - * held from: - * dbuf_dirty - * dbuf_undirty - * dbuf_sync_indirect - * dnode_new_blkid - */ - -struct objset; -struct dmu_pool; - -typedef struct dmu_xuio { - int next; - int cnt; - struct arc_buf **bufs; - iovec_t *iovp; -} dmu_xuio_t; - -typedef struct xuio_stats { - /* loaned yet not returned arc_buf */ - kstat_named_t xuiostat_onloan_rbuf; - kstat_named_t xuiostat_onloan_wbuf; - /* whether a copy is made when loaning out a read buffer */ - kstat_named_t xuiostat_rbuf_copied; - kstat_named_t xuiostat_rbuf_nocopy; - /* whether a copy is made when assigning a write buffer */ - kstat_named_t xuiostat_wbuf_copied; - kstat_named_t xuiostat_wbuf_nocopy; -} xuio_stats_t; - -static xuio_stats_t xuio_stats = { - { "onloan_read_buf", KSTAT_DATA_UINT64 }, - { "onloan_write_buf", KSTAT_DATA_UINT64 }, - { "read_buf_copied", KSTAT_DATA_UINT64 }, - { "read_buf_nocopy", KSTAT_DATA_UINT64 }, - { "write_buf_copied", KSTAT_DATA_UINT64 }, - { "write_buf_nocopy", KSTAT_DATA_UINT64 } -}; - -#define XUIOSTAT_INCR(stat, val) \ - atomic_add_64(&xuio_stats.stat.value.ui64, (val)) -#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) - -/* - * The list of data whose inclusion in a send stream can be pending from - * one call to backup_cb to another. Multiple calls to dump_free() and - * dump_freeobjects() can be aggregated into a single DRR_FREE or - * DRR_FREEOBJECTS replay record. - */ -typedef enum { - PENDING_NONE, - PENDING_FREE, - PENDING_FREEOBJECTS -} dmu_pendop_t; - -typedef struct dmu_sendarg { - list_node_t dsa_link; - dmu_replay_record_t *dsa_drr; - kthread_t *dsa_td; - struct file *dsa_fp; - int dsa_outfd; - struct proc *dsa_proc; - offset_t *dsa_off; - objset_t *dsa_os; - zio_cksum_t dsa_zc; - uint64_t dsa_toguid; - int dsa_err; - dmu_pendop_t dsa_pending_op; - uint64_t dsa_featureflags; - uint64_t dsa_last_data_object; - uint64_t dsa_last_data_offset; - uint64_t dsa_resume_object; - uint64_t dsa_resume_offset; - boolean_t dsa_sent_begin; - boolean_t dsa_sent_end; -} dmu_sendarg_t; - -void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); -void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); -int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, - void *, dmu_buf_t **); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h +++ /dev/null @@ -1,221 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#ifndef _SYS_DMU_OBJSET_H -#define _SYS_DMU_OBJSET_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern krwlock_t os_lock; - -struct dsl_pool; -struct dsl_dataset; -struct dmu_tx; - -#define OBJSET_PHYS_SIZE 2048 -#define OBJSET_OLD_PHYS_SIZE 1024 - -#define OBJSET_BUF_HAS_USERUSED(buf) \ - (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) - -#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) - -typedef struct objset_phys { - dnode_phys_t os_meta_dnode; - zil_header_t os_zil_header; - uint64_t os_type; - uint64_t os_flags; - char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - - sizeof (zil_header_t) - sizeof (uint64_t)*2]; - dnode_phys_t os_userused_dnode; - dnode_phys_t os_groupused_dnode; -} objset_phys_t; - -#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1) -struct objset { - /* Immutable: */ - struct dsl_dataset *os_dsl_dataset; - spa_t *os_spa; - arc_buf_t *os_phys_buf; - objset_phys_t *os_phys; - /* - * The following "special" dnodes have no parent, are exempt - * from dnode_move(), and are not recorded in os_dnodes, but they - * root their descendents in this objset using handles anyway, so - * that all access to dnodes from dbufs consistently uses handles. - */ - dnode_handle_t os_meta_dnode; - dnode_handle_t os_userused_dnode; - dnode_handle_t os_groupused_dnode; - zilog_t *os_zil; - - list_node_t os_evicting_node; - - /* can change, under dsl_dir's locks: */ - uint64_t os_dnodesize; /* default dnode size for new objects */ - enum zio_checksum os_checksum; - enum zio_compress os_compress; - uint8_t os_copies; - enum zio_checksum os_dedup_checksum; - boolean_t os_dedup_verify; - zfs_logbias_op_t os_logbias; - zfs_cache_type_t os_primary_cache; - zfs_cache_type_t os_secondary_cache; - zfs_sync_type_t os_sync; - zfs_redundant_metadata_type_t os_redundant_metadata; - int os_recordsize; - /* - * The next four values are used as a cache of whatever's on disk, and - * are initialized the first time these properties are queried. Before - * being initialized with their real values, their values are - * OBJSET_PROP_UNINITIALIZED. - */ - uint64_t os_version; - uint64_t os_normalization; - uint64_t os_utf8only; - uint64_t os_casesensitivity; - /* - * The largest zpl file block allowed in special class. - * cached here instead of zfsvfs for easier access. 
- */ - int os_zpl_special_smallblock; - - /* - * Pointer is constant; the blkptr it points to is protected by - * os_dsl_dataset->ds_bp_rwlock - */ - blkptr_t *os_rootbp; - - /* no lock needed: */ - struct dmu_tx *os_synctx; /* XXX sketchy */ - zil_header_t os_zil_header; - multilist_t *os_synced_dnodes; - uint64_t os_flags; - uint64_t os_freed_dnodes; - boolean_t os_rescan_dnodes; - - /* Protected by os_obj_lock */ - kmutex_t os_obj_lock; - uint64_t os_obj_next_chunk; - - /* Per-CPU next object to allocate, protected by atomic ops. */ - uint64_t *os_obj_next_percpu; - int os_obj_next_percpu_len; - - /* Protected by os_lock */ - kmutex_t os_lock; - multilist_t *os_dirty_dnodes[TXG_SIZE]; - list_t os_dnodes; - list_t os_downgraded_dbufs; - - /* Protects changes to DMU_{USER,GROUP}USED_OBJECT */ - kmutex_t os_userused_lock; - - /* stuff we store for the user */ - kmutex_t os_user_ptr_lock; - void *os_user_ptr; - sa_os_t *os_sa; -}; - -#define DMU_META_OBJSET 0 -#define DMU_META_DNODE_OBJECT 0 -#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) -#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) -#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) -#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) - -#define DMU_OS_IS_L2CACHEABLE(os) \ - ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ - (os)->os_secondary_cache == ZFS_CACHE_METADATA) - -#define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE) - -/* called from zpl */ -int dmu_objset_hold(const char *name, void *tag, objset_t **osp); -int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp); -int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj, - dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); -void dmu_objset_refresh_ownership(struct dsl_dataset *ds, - struct dsl_dataset **newds, void *tag); -void dmu_objset_rele(objset_t *os, void *tag); -void dmu_objset_disown(objset_t *os, void *tag); -int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); - -void dmu_objset_stats(objset_t *os, nvlist_t *nv); -void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); -void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp); -uint64_t dmu_objset_fsid_guid(objset_t *os); -int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, - int func(struct dsl_pool *, struct dsl_dataset *, void *), - void *arg, int flags); -int dmu_objset_prefetch(const char *name, void *arg); -void dmu_objset_evict_dbufs(objset_t *os); -timestruc_t dmu_objset_snap_cmtime(objset_t *os); - -/* called from dsl */ -void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); -boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); -objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, - blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); -int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, - objset_t **osp); -void dmu_objset_evict(objset_t *os); -void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); -void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); -boolean_t dmu_objset_userused_enabled(objset_t *os); -int dmu_objset_userspace_upgrade(objset_t *os); -boolean_t dmu_objset_userspace_present(objset_t *os); -int dmu_fsname(const char *snapname, char *buf); - -void dmu_objset_evict_done(objset_t *os); -void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx); - 
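The hold/rele pairing required by dmu_objset_hold() above, combined with dmu_objset_space(), in a short sketch (illustrative, not part of this change). The tag only needs to be a stable pointer identifying the holder; printf() is a stand-in for real reporting.

static int
example_report_space(const char *dsname)
{
	static char tag;
	objset_t *os;
	uint64_t refd, avail, usedobjs, availobjs;
	int err;

	err = dmu_objset_hold(dsname, &tag, &os);
	if (err != 0)
		return (err);

	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
	printf("%s: %llu bytes referenced, %llu available, %llu objects\n",
	    dsname, (u_longlong_t)refd, (u_longlong_t)avail,
	    (u_longlong_t)usedobjs);

	dmu_objset_rele(os, &tag);
	return (0);
}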
-void dmu_objset_init(void); -void dmu_objset_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_OBJSET_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#ifndef _DMU_SEND_H -#define _DMU_SEND_H - -#include - -struct vnode; -struct dsl_dataset; -struct drr_begin; -struct avl_tree; -struct dmu_replay_record; - -extern const char *recv_clone_name; - -int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, boolean_t compressok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, -#ifdef illumos - struct vnode *vp, offset_t *off); -#else - struct file *fp, offset_t *off); -#endif -int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, - boolean_t stream_compressed, uint64_t *sizep); -int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg, - boolean_t stream_compressed, uint64_t *sizep); -int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, - boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, -#ifdef illumos - int outfd, struct vnode *vp, offset_t *off); -#else - int outfd, struct file *fp, offset_t *off); -#endif - -typedef struct dmu_recv_cookie { - struct dsl_dataset *drc_ds; - struct dmu_replay_record *drc_drr_begin; - struct drr_begin *drc_drrb; - const char *drc_tofs; - const char *drc_tosnap; - boolean_t drc_newfs; - boolean_t drc_byteswap; - boolean_t drc_force; - boolean_t drc_resumable; - boolean_t drc_clone; - struct avl_tree *drc_guid_to_ds_map; - zio_cksum_t drc_cksum; - uint64_t drc_newsnapobj; - void *drc_owner; - cred_t *drc_cred; -} dmu_recv_cookie_t; - -int dmu_recv_begin(char *tofs, char *tosnap, - struct dmu_replay_record *drr_begin, - boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc); -#ifdef illumos -int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, -#else -int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, -#endif - int cleanup_fd, uint64_t *action_handlep); -int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); -boolean_t dmu_objset_is_receiving(objset_t *os); - -#endif /* _DMU_SEND_H */ Index: 
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DMU_TRAVERSE_H -#define _SYS_DMU_TRAVERSE_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dnode_phys; -struct dsl_dataset; -struct zilog; -struct arc_buf; - -typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg); - -#define TRAVERSE_PRE (1<<0) -#define TRAVERSE_POST (1<<1) -#define TRAVERSE_PREFETCH_METADATA (1<<2) -#define TRAVERSE_PREFETCH_DATA (1<<3) -#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) -#define TRAVERSE_HARD (1<<4) - -/* Special traverse error return value to indicate skipping of children */ -#define TRAVERSE_VISIT_NO_CHILDREN -1 - -int traverse_dataset(struct dsl_dataset *ds, - uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); -int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start, - zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); -int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, - uint64_t txg_start, zbookmark_phys_t *resume, int flags, - blkptr_cb_t func, void *arg); -int traverse_pool(spa_t *spa, - uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_TRAVERSE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DMU_TX_H -#define _SYS_DMU_TX_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dmu_buf_impl; -struct dmu_tx_hold; -struct dnode_link; -struct dsl_pool; -struct dnode; -struct dsl_dir; - -struct dmu_tx { - /* - * No synchronization is needed because a tx can only be handled - * by one thread. - */ - list_t tx_holds; /* list of dmu_tx_hold_t */ - objset_t *tx_objset; - struct dsl_dir *tx_dir; - struct dsl_pool *tx_pool; - uint64_t tx_txg; - uint64_t tx_lastsnap_txg; - uint64_t tx_lasttried_txg; - txg_handle_t tx_txgh; - void *tx_tempreserve_cookie; - struct dmu_tx_hold *tx_needassign_txh; - - /* list of dmu_tx_callback_t on this dmu_tx */ - list_t tx_callbacks; - - /* placeholder for syncing context, doesn't need specific holds */ - boolean_t tx_anyobj; - - /* transaction is marked as being a "net free" of space */ - boolean_t tx_netfree; - - /* time this transaction was created */ - hrtime_t tx_start; - - /* need to wait for sufficient dirty space */ - boolean_t tx_wait_dirty; - - /* has this transaction already been delayed? */ - boolean_t tx_dirty_delayed; - - int tx_err; -}; - -enum dmu_tx_hold_type { - THT_NEWOBJECT, - THT_WRITE, - THT_BONUS, - THT_FREE, - THT_ZAP, - THT_SPACE, - THT_SPILL, - THT_NUMTYPES -}; - -typedef struct dmu_tx_hold { - dmu_tx_t *txh_tx; - list_node_t txh_node; - struct dnode *txh_dnode; - zfs_refcount_t txh_space_towrite; - zfs_refcount_t txh_memory_tohold; - enum dmu_tx_hold_type txh_type; - uint64_t txh_arg1; - uint64_t txh_arg2; -} dmu_tx_hold_t; - -typedef struct dmu_tx_callback { - list_node_t dcb_node; /* linked to tx_callbacks list */ - dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ - void *dcb_data; /* caller private data */ -} dmu_tx_callback_t; - -/* - * These routines are defined in dmu.h, and are called by the user. - */ -dmu_tx_t *dmu_tx_create(objset_t *dd); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -void dmu_tx_commit(dmu_tx_t *tx); -void dmu_tx_abort(dmu_tx_t *tx); -uint64_t dmu_tx_get_txg(dmu_tx_t *tx); -struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx); -void dmu_tx_wait(dmu_tx_t *tx); - -void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, - void *dcb_data); -void dmu_tx_do_callbacks(list_t *cb_list, int error); - -/* - * These routines are defined in dmu_spa.h, and are called by the SPA. - */ -extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); - -/* - * These routines are only called by the DMU. 
- */ -dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); -int dmu_tx_is_syncing(dmu_tx_t *tx); -int dmu_tx_private_ok(dmu_tx_t *tx); -void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn); -void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); -void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); - -#ifdef ZFS_DEBUG -#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) -#else -#define DMU_TX_DIRTY_BUF(tx, db) -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DMU_TX_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2014 by Delphix. All rights reserved. - */ - -#ifndef _DMU_ZFETCH_H -#define _DMU_ZFETCH_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern uint64_t zfetch_array_rd_sz; - -struct dnode; /* so we can reference dnode */ - -typedef struct zstream { - uint64_t zs_blkid; /* expect next access at this blkid */ - uint64_t zs_pf_blkid; /* next block to prefetch */ - - /* - * We will next prefetch the L1 indirect block of this level-0 - * block id. - */ - uint64_t zs_ipf_blkid; - - kmutex_t zs_lock; /* protects stream */ - hrtime_t zs_atime; /* time last prefetch issued */ - list_node_t zs_node; /* link for zf_stream */ -} zstream_t; - -typedef struct zfetch { - krwlock_t zf_rwlock; /* protects zfetch structure */ - list_t zf_stream; /* list of zstream_t's */ - struct dnode *zf_dnode; /* dnode that owns this zfetch */ -} zfetch_t; - -void zfetch_init(void); -void zfetch_fini(void); - -void dmu_zfetch_init(zfetch_t *, struct dnode *); -void dmu_zfetch_fini(zfetch_t *); -void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t); - - -#ifdef __cplusplus -} -#endif - -#endif /* _DMU_ZFETCH_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ /dev/null @@ -1,599 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - */ - -#ifndef _SYS_DNODE_H -#define _SYS_DNODE_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * dnode_hold() flags. - */ -#define DNODE_MUST_BE_ALLOCATED 1 -#define DNODE_MUST_BE_FREE 2 - -/* - * dnode_next_offset() flags. - */ -#define DNODE_FIND_HOLE 1 -#define DNODE_FIND_BACKWARDS 2 -#define DNODE_FIND_HAVELOCK 4 - -/* - * Fixed constants. - */ -#define DNODE_SHIFT 9 /* 512 bytes */ -#define DN_MIN_INDBLKSHIFT 12 /* 4k */ -/* - * If we ever increase this value beyond 20, we need to revisit all logic that - * does x << level * ebps to handle overflow. With a 1M indirect block size, - * 4 levels of indirect blocks would not be able to guarantee addressing an - * entire object, so 5 levels will be used, but 5 * (20 - 7) = 65. - */ -#define DN_MAX_INDBLKSHIFT 17 /* 128k */ -#define DNODE_BLOCK_SHIFT 14 /* 16k */ -#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ -#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ -#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ - -/* - * dnode id flags - * - * Note: a file will never ever have its - * ids moved from bonus->spill - * and only in a crypto environment would it be on spill - */ -#define DN_ID_CHKED_BONUS 0x1 -#define DN_ID_CHKED_SPILL 0x2 -#define DN_ID_OLD_EXIST 0x4 -#define DN_ID_NEW_EXIST 0x8 - -/* - * Derived constants. - */ -#define DNODE_MIN_SIZE (1 << DNODE_SHIFT) -#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT) -#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT) -#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT) -#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT) -#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \ - (1 << SPA_BLKPTRSHIFT)) -#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT) -#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE)) -#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) -#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) -#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1) -#define DN_KILL_SPILLBLK (1) - -#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */ -#define DN_SLOT_FREE ((void *)1UL) /* Free slot */ -#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */ -#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */ -#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR) -#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL) - -#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) -#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) - -/* - * This is inaccurate if the indblkshift of the particular object is not the - * max. But it's only used by userland to calculate the zvol reservation. 
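For concreteness, plugging numbers into the derived constants above: with DNODE_SHIFT = 9 and DNODE_CORE_SIZE = 64, and assuming SPA_BLKPTRSHIFT is 7 (a 128-byte blkptr_t; it is defined in the SPA headers, not here), a legacy 512-byte dnode has DN_OLD_MAX_BONUSLEN = DN_BONUS_SIZE(512) = 512 - 64 - 128 = 320 bytes of bonus space, while a two-slot 1K dnode gets DN_SLOTS_TO_BONUSLEN(2) = 1024 - 64 - 128 = 832 bytes.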
- */ -#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) -#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) - -/* The +2 here is a cheesy way to round up */ -#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ - (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) - -#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ - (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) -#define DN_MAX_BONUS_LEN(dnp) \ - ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \ - (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \ - (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp)) - -#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ - (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) - -#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) - -struct dmu_buf_impl; -struct objset; -struct zio; - -enum dnode_dirtycontext { - DN_UNDIRTIED, - DN_DIRTY_OPEN, - DN_DIRTY_SYNC -}; - -/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ -#define DNODE_FLAG_USED_BYTES (1<<0) -#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) - -/* Does dnode have a SA spill blkptr in bonus? */ -#define DNODE_FLAG_SPILL_BLKPTR (1<<2) - -/* - * VARIABLE-LENGTH (LARGE) DNODES - * - * The motivation for variable-length dnodes is to eliminate the overhead - * associated with using spill blocks. Spill blocks are used to store - * system attribute data (i.e. file metadata) that does not fit in the - * dnode's bonus buffer. By allowing a larger bonus buffer area the use of - * a spill block can be avoided. Spill blocks potentially incur an - * additional read I/O for every dnode in a dnode block. As a worst case - * example, reading 32 dnodes from a 16k dnode block and all of the spill - * blocks could issue 33 separate reads. Now suppose those dnodes have size - * 1024 and therefore don't need spill blocks. Then the worst case number - * of blocks read is reduced to from 33 to two--one per dnode block. - * - * ZFS-on-Linux systems that make heavy use of extended attributes benefit - * from this feature. In particular, ZFS-on-Linux supports the xattr=sa - * dataset property which allows file extended attribute data to be stored - * in the dnode bonus buffer as an alternative to the traditional - * directory-based format. Workloads such as SELinux and the Lustre - * distributed filesystem often store enough xattr data to force spill - * blocks when xattr=sa is in effect. Large dnodes may therefore provide a - * performance benefit to such systems. Other use cases that benefit from - * this feature include files with large ACLs and symbolic links with long - * target names. - * - * The size of a dnode may be a multiple of 512 bytes up to the size of a - * dnode block (currently 16384 bytes). The dn_extra_slots field of the - * on-disk dnode_phys_t structure describes the size of the physical dnode - * on disk. The field represents how many "extra" dnode_phys_t slots a - * dnode consumes in its dnode block. This convention results in a value of - * 0 for 512 byte dnodes which preserves on-disk format compatibility with - * older software which doesn't support large dnodes. - * - * Similarly, the in-memory dnode_t structure has a dn_num_slots field - * to represent the total number of dnode_phys_t slots consumed on disk. - * Thus dn->dn_num_slots is 1 greater than the corresponding - * dnp->dn_extra_slots. 
This difference in convention was adopted - * because, unlike on-disk structures, backward compatibility is not a - * concern for in-memory objects, so we used a more natural way to - * represent size for a dnode_t. - * - * The default size for newly created dnodes is determined by the value of - * the "dnodesize" dataset property. By default the property is set to - * "legacy" which is compatible with older software. Setting the property - * to "auto" will allow the filesystem to choose the most suitable dnode - * size. Currently this just sets the default dnode size to 1k, but future - * code improvements could dynamically choose a size based on observed - * workload patterns. Dnodes of varying sizes can coexist within the same - * dataset and even within the same dnode block. - */ - -typedef struct dnode_phys { - uint8_t dn_type; /* dmu_object_type_t */ - uint8_t dn_indblkshift; /* ln2(indirect block size) */ - uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ - uint8_t dn_nblkptr; /* length of dn_blkptr */ - uint8_t dn_bonustype; /* type of data in bonus buffer */ - uint8_t dn_checksum; /* ZIO_CHECKSUM type */ - uint8_t dn_compress; /* ZIO_COMPRESS type */ - uint8_t dn_flags; /* DNODE_FLAG_* */ - uint16_t dn_datablkszsec; /* data block size in 512b sectors */ - uint16_t dn_bonuslen; /* length of dn_bonus */ - uint8_t dn_extra_slots; /* # of subsequent slots consumed */ - uint8_t dn_pad2[3]; - - /* accounting is protected by dn_dirty_mtx */ - uint64_t dn_maxblkid; /* largest allocated block ID */ - uint64_t dn_used; /* bytes (or sectors) of disk space */ - - /* - * Both dn_pad2 and dn_pad3 are protected by the block's MAC. This - * allows us to protect any fields that might be added here in the - * future. In either case, developers will want to check - * zio_crypt_init_uios_dnode() to ensure the new field is being - * protected properly. - */ - uint64_t dn_pad3[4]; - union { - blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)]; - struct { - blkptr_t __dn_ignore1; - uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN]; - }; - struct { - blkptr_t __dn_ignore2; - uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN - - sizeof (blkptr_t)]; - blkptr_t dn_spill; - }; - }; -} dnode_phys_t; - -#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \ - (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT)) - -struct dnode { - /* - * Protects the structure of the dnode, including the number of levels - * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* - */ - krwlock_t dn_struct_rwlock; - - /* Our link on dn_objset->os_dnodes list; protected by os_lock. */ - list_node_t dn_link; - - /* immutable: */ - struct objset *dn_objset; - uint64_t dn_object; - struct dmu_buf_impl *dn_dbuf; - struct dnode_handle *dn_handle; - dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ - - /* - * Copies of stuff in dn_phys. They're valid in the open - * context (eg. even before the dnode is first synced). - * Where necessary, these are protected by dn_struct_rwlock. - */ - dmu_object_type_t dn_type; /* object type */ - uint16_t dn_bonuslen; /* bonus length */ - uint8_t dn_bonustype; /* bonus type */ - uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ - uint8_t dn_checksum; /* ZIO_CHECKSUM type */ - uint8_t dn_compress; /* ZIO_COMPRESS type */ - uint8_t dn_nlevels; - uint8_t dn_indblkshift; - uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ - uint8_t dn_moved; /* Has this dnode been moved? 
*/ - uint16_t dn_datablkszsec; /* in 512b sectors */ - uint32_t dn_datablksz; /* in bytes */ - uint64_t dn_maxblkid; - uint8_t dn_next_type[TXG_SIZE]; - uint8_t dn_num_slots; /* metadnode slots consumed on disk */ - uint8_t dn_next_nblkptr[TXG_SIZE]; - uint8_t dn_next_nlevels[TXG_SIZE]; - uint8_t dn_next_indblkshift[TXG_SIZE]; - uint8_t dn_next_bonustype[TXG_SIZE]; - uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ - uint16_t dn_next_bonuslen[TXG_SIZE]; - uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ - - /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ - uint32_t dn_dbufs_count; /* count of dn_dbufs */ - - /* protected by os_lock: */ - multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ - - /* protected by dn_mtx: */ - kmutex_t dn_mtx; - list_t dn_dirty_records[TXG_SIZE]; - struct range_tree *dn_free_ranges[TXG_SIZE]; - uint64_t dn_allocated_txg; - uint64_t dn_free_txg; - uint64_t dn_assigned_txg; - uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ - kcondvar_t dn_notxholds; - enum dnode_dirtycontext dn_dirtyctx; - uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ - - /* protected by own devices */ - zfs_refcount_t dn_tx_holds; - zfs_refcount_t dn_holds; - - kmutex_t dn_dbufs_mtx; - /* - * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs - * can contain multiple dbufs of the same (level, blkid) when a - * dbuf is marked DB_EVICTING without being removed from - * dn_dbufs. To maintain the avl invariant that there cannot be - * duplicate entries, we order the dbufs by an arbitrary value - - * their address in memory. This means that dn_dbufs cannot be used to - * directly look up a dbuf. Instead, callers must use avl_walk, have - * a reference to the dbuf, or look up a non-existant node with - * db_state = DB_SEARCH (see dbuf_free_range for an example). - */ - avl_tree_t dn_dbufs; - - /* protected by dn_struct_rwlock */ - struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ - - boolean_t dn_have_spill; /* have spill or are spilling */ - - /* parent IO for current sync write */ - zio_t *dn_zio; - - /* used in syncing context */ - uint64_t dn_oldused; /* old phys used bytes */ - uint64_t dn_oldflags; /* old phys dn_flags */ - uint64_t dn_olduid, dn_oldgid; - uint64_t dn_newuid, dn_newgid; - int dn_id_flags; - - /* holds prefetch structure */ - struct zfetch dn_zfetch; -}; - -/* - * Since AVL already has embedded element counter, use dn_dbufs_count - * only for dbufs not counted there (bonus buffers) and just add them. - */ -#define DN_DBUFS_COUNT(dn) ((dn)->dn_dbufs_count + \ - avl_numnodes(&(dn)->dn_dbufs)) - -/* - * Adds a level of indirection between the dbuf and the dnode to avoid - * iterating descendent dbufs in dnode_move(). Handles are not allocated - * individually, but as an array of child dnodes in dnode_hold_impl(). - */ -typedef struct dnode_handle { - /* Protects dnh_dnode from modification by dnode_move(). 
*/ - zrlock_t dnh_zrlock; - dnode_t *dnh_dnode; -} dnode_handle_t; - -typedef struct dnode_children { - dmu_buf_user_t dnc_dbu; /* User evict data */ - size_t dnc_count; /* number of children */ - dnode_handle_t dnc_children[]; /* sized dynamically */ -} dnode_children_t; - -typedef struct free_range { - avl_node_t fr_node; - uint64_t fr_blkid; - uint64_t fr_nblks; -} free_range_t; - -void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, - uint64_t object, dnode_handle_t *dnh); -void dnode_special_close(dnode_handle_t *dnh); - -void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); -void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); -void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); - -int dnode_hold(struct objset *dd, uint64_t object, - void *ref, dnode_t **dnp); -int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots, - void *ref, dnode_t **dnp); -boolean_t dnode_add_ref(dnode_t *dn, void *ref); -void dnode_rele(dnode_t *dn, void *ref); -void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); -void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); -void dnode_sync(dnode_t *dn, dmu_tx_t *tx); -void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); -void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); -void dnode_free(dnode_t *dn, dmu_tx_t *tx); -void dnode_byteswap(dnode_phys_t *dnp); -void dnode_buf_byteswap(void *buf, size_t size); -void dnode_verify(dnode_t *dn); -int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); -void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); -void dnode_diduse_space(dnode_t *dn, int64_t space); -void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); -uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); -void dnode_init(void); -void dnode_fini(void); -int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, - int minlvl, uint64_t blkfill, uint64_t txg); -void dnode_evict_dbufs(dnode_t *dn); -void dnode_evict_bonus(dnode_t *dn); -void dnode_free_interior_slots(dnode_t *dn); -boolean_t dnode_needs_remap(const dnode_t *dn); - -#define DNODE_IS_DIRTY(_dn) \ - ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) - -#define DNODE_IS_CACHEABLE(_dn) \ - ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (DMU_OT_IS_METADATA((_dn)->dn_type) && \ - (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)) - -#define DNODE_META_IS_CACHEABLE(_dn) \ - ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA) - -/* - * Used for dnodestats kstat. - */ -typedef struct dnode_stats { - /* - * Number of failed attempts to hold a meta dnode dbuf. - */ - kstat_named_t dnode_hold_dbuf_hold; - /* - * Number of failed attempts to read a meta dnode dbuf. - */ - kstat_named_t dnode_hold_dbuf_read; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able - * to hold the requested object number which was allocated. This is - * the common case when looking up any allocated object number. - */ - kstat_named_t dnode_hold_alloc_hits; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not - * able to hold the request object number because it was not allocated. 
- */ - kstat_named_t dnode_hold_alloc_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not - * able to hold the request object number because the object number - * refers to an interior large dnode slot. - */ - kstat_named_t dnode_hold_alloc_interior; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed - * to retry acquiring slot zrl locks due to contention. - */ - kstat_named_t dnode_hold_alloc_lock_retry; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not - * need to create the dnode because another thread did so after - * dropping the read lock but before acquiring the write lock. - */ - kstat_named_t dnode_hold_alloc_lock_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found - * a free dnode instantiated by dnode_create() but not yet allocated - * by dnode_allocate(). - */ - kstat_named_t dnode_hold_alloc_type_none; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able - * to hold the requested range of free dnode slots. - */ - kstat_named_t dnode_hold_free_hits; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not - * able to hold the requested range of free dnode slots because - * at least one slot was allocated. - */ - kstat_named_t dnode_hold_free_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not - * able to hold the requested range of free dnode slots because - * after acquiring the zrl lock at least one slot was allocated. - */ - kstat_named_t dnode_hold_free_lock_misses; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed - * to retry acquiring slot zrl locks due to contention. - */ - kstat_named_t dnode_hold_free_lock_retry; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested - * a range of dnode slots which were held by another thread. - */ - kstat_named_t dnode_hold_free_refcount; - /* - * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested - * a range of dnode slots which would overflow the dnode_phys_t. - */ - kstat_named_t dnode_hold_free_overflow; - /* - * Number of times a dnode_hold(...) was attempted on a dnode - * which had already been unlinked in an earlier txg. - */ - kstat_named_t dnode_hold_free_txg; - /* - * Number of times dnode_free_interior_slots() needed to retry - * acquiring a slot zrl lock due to contention. - */ - kstat_named_t dnode_free_interior_lock_retry; - /* - * Number of new dnodes allocated by dnode_allocate(). - */ - kstat_named_t dnode_allocate; - /* - * Number of dnodes re-allocated by dnode_reallocate(). - */ - kstat_named_t dnode_reallocate; - /* - * Number of meta dnode dbufs evicted. - */ - kstat_named_t dnode_buf_evict; - /* - * Number of times dmu_object_alloc*() reached the end of the existing - * object ID chunk and advanced to a new one. - */ - kstat_named_t dnode_alloc_next_chunk; - /* - * Number of times multiple threads attempted to allocate a dnode - * from the same block of free dnodes. - */ - kstat_named_t dnode_alloc_race; - /* - * Number of times dmu_object_alloc*() was forced to advance to the - * next meta dnode dbuf due to an error from dmu_object_next(). - */ - kstat_named_t dnode_alloc_next_block; - /* - * Statistics for tracking dnodes which have been moved. 
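Each counter above is bumped with DNODE_STAT_BUMP() (defined further down), which wraps an atomic 64-bit increment. The following is a standalone userland analogue of that pattern, using C11 atomics in place of the kernel's atomic_add_64() and hypothetical counter names; it is a sketch of the bump-on-event idiom, not code from dnode.c.

    #include <inttypes.h>
    #include <stdatomic.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for two of the kstat counters described above. */
    static _Atomic uint64_t hold_alloc_hits;
    static _Atomic uint64_t hold_alloc_misses;

    #define STAT_BUMP(ctr) \
            atomic_fetch_add_explicit(&(ctr), 1, memory_order_relaxed)

    static void
    record_hold(int allocated)
    {
            if (allocated)
                    STAT_BUMP(hold_alloc_hits);
            else
                    STAT_BUMP(hold_alloc_misses);
    }

    int
    main(void)
    {
            record_hold(1);
            record_hold(0);
            record_hold(1);
            printf("hits=%" PRIu64 " misses=%" PRIu64 "\n",
                atomic_load(&hold_alloc_hits), atomic_load(&hold_alloc_misses));
            return (0);
    }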
- */ - kstat_named_t dnode_move_invalid; - kstat_named_t dnode_move_recheck1; - kstat_named_t dnode_move_recheck2; - kstat_named_t dnode_move_special; - kstat_named_t dnode_move_handle; - kstat_named_t dnode_move_rwlock; - kstat_named_t dnode_move_active; -} dnode_stats_t; - -extern dnode_stats_t dnode_stats; - -#define DNODE_STAT_INCR(stat, val) \ - atomic_add_64(&dnode_stats.stat.value.ui64, (val)); -#define DNODE_STAT_BUMP(stat) \ - DNODE_STAT_INCR(stat, 1); - -#ifdef ZFS_DEBUG - -/* - * There should be a ## between the string literal and fmt, to make it - * clear that we're joining two strings together, but that piece of shit - * gcc doesn't support that preprocessor token. - */ -#define dprintf_dnode(dn, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char __db_buf[32]; \ - uint64_t __db_obj = (dn)->dn_object; \ - if (__db_obj == DMU_META_DNODE_OBJECT) \ - (void) strcpy(__db_buf, "mdn"); \ - else \ - (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ - (u_longlong_t)__db_obj);\ - dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ - __db_buf, __VA_ARGS__); \ - } \ -_NOTE(CONSTCOND) } while (0) - -#define DNODE_VERIFY(dn) dnode_verify(dn) -#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) - -#else - -#define dprintf_dnode(db, fmt, ...) -#define DNODE_VERIFY(dn) -#define FREE_VERIFY(db, start, end, tx) - -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DNODE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_bookmark.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DSL_BOOKMARK_H -#define _SYS_DSL_BOOKMARK_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_pool; -struct dsl_dataset; - -/* - * On disk zap object. - */ -typedef struct zfs_bookmark_phys { - uint64_t zbm_guid; /* guid of bookmarked dataset */ - uint64_t zbm_creation_txg; /* birth transaction group */ - uint64_t zbm_creation_time; /* bookmark creation time */ -} zfs_bookmark_phys_t; - -int dsl_bookmark_create(nvlist_t *, nvlist_t *); -int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *); -int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *); -int dsl_bookmark_destroy(nvlist_t *, nvlist_t *); -int dsl_bookmark_rename(const char *fs, const char *from, const char *to); -int dsl_bookmark_lookup(struct dsl_pool *, const char *, - struct dsl_dataset *, zfs_bookmark_phys_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_BOOKMARK_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ /dev/null @@ -1,457 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#ifndef _SYS_DSL_DATASET_H -#define _SYS_DSL_DATASET_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; -struct dsl_dir; -struct dsl_pool; - -#define DS_FLAG_INCONSISTENT (1ULL<<0) -#define DS_IS_INCONSISTENT(ds) \ - (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) - -/* - * Do not allow this dataset to be promoted. - */ -#define DS_FLAG_NOPROMOTE (1ULL<<1) - -/* - * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly - * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, - * refquota/refreservations). - */ -#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) - -/* - * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called - * on a dataset. This allows the dataset to be destroyed using 'zfs release'. - */ -#define DS_FLAG_DEFER_DESTROY (1ULL<<3) -#define DS_IS_DEFER_DESTROY(ds) \ - (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY) - -/* - * DS_FIELD_* are strings that are used in the "extensified" dataset zap object. - * They should be of the format :. - */ - -/* - * This field's value is the object ID of a zap object which contains the - * bookmarks of this dataset. If it is present, then this dataset is counted - * in the refcount of the SPA_FEATURES_BOOKMARKS feature. - */ -#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks" - -/* - * This field is present (with value=0) if this dataset may contain large - * dnodes (>512B). If it is present, then this dataset is counted in the - * refcount of the SPA_FEATURE_LARGE_DNODE feature. - */ -#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode" - -/* - * These fields are set on datasets that are in the middle of a resumable - * receive, and allow the sender to resume the send if it is interrupted. 
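Each of these fields lives in the dataset's extensified zap object in the MOS. A hypothetical lookup of one of them, sketched with zap_lookup() and assuming the dataset has already been zapified (the field names themselves are defined just below); this fragment is illustrative only and omits error handling.

    /* Hypothetical sketch; not taken from dsl_dataset.c. */
    static uint64_t
    example_resume_toguid(dsl_dataset_t *ds)
    {
            objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
            uint64_t guid = 0;

            (void) zap_lookup(mos, ds->ds_object, DS_FIELD_RESUME_TOGUID,
                sizeof (guid), 1, &guid);
            return (guid);
    }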
- */ -#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid" -#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname" -#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid" -#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object" -#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset" -#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes" -#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok" -#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" -#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok" - -/* - * This field is set to the object number of the remap deadlist if one exists. - */ -#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist" - -/* - * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose - * name lookups should be performed case-insensitively. - */ -#define DS_FLAG_CI_DATASET (1ULL<<16) - -#define DS_CREATE_FLAG_NODIRTY (1ULL<<24) - -typedef struct dsl_dataset_phys { - uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ - uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ - uint64_t ds_prev_snap_txg; - uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ - uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ - uint64_t ds_num_children; /* clone/snap children; ==0 for head */ - uint64_t ds_creation_time; /* seconds since 1970 */ - uint64_t ds_creation_txg; - uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ - /* - * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes - * include all blocks referenced by this dataset, including those - * shared with any other datasets. - */ - uint64_t ds_referenced_bytes; - uint64_t ds_compressed_bytes; - uint64_t ds_uncompressed_bytes; - uint64_t ds_unique_bytes; /* only relevant to snapshots */ - /* - * The ds_fsid_guid is a 56-bit ID that can change to avoid - * collisions. The ds_guid is a 64-bit ID that will never - * change, so there is a small probability that it will collide. - */ - uint64_t ds_fsid_guid; - uint64_t ds_guid; - uint64_t ds_flags; /* DS_FLAG_* */ - blkptr_t ds_bp; - uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ - uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ - uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ - uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ -} dsl_dataset_phys_t; - -typedef struct dsl_dataset { - dmu_buf_user_t ds_dbu; - rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */ - - /* Immutable: */ - struct dsl_dir *ds_dir; - dmu_buf_t *ds_dbuf; - uint64_t ds_object; - uint64_t ds_fsid_guid; - boolean_t ds_is_snapshot; - - /* only used in syncing context, only valid for non-snapshots: */ - struct dsl_dataset *ds_prev; - uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */ - - /* has internal locking: */ - dsl_deadlist_t ds_deadlist; - bplist_t ds_pending_deadlist; - - /* - * The remap deadlist contains blocks (DVA's, really) that are - * referenced by the previous snapshot and point to indirect vdevs, - * but in this dataset they have been remapped to point to concrete - * (or at least, less-indirect) vdevs. In other words, the - * physical DVA is referenced by the previous snapshot but not by - * this dataset. Logically, the DVA continues to be referenced, - * but we are using a different (less indirect) physical DVA. - * This deadlist is used to determine when physical DVAs that - * point to indirect vdevs are no longer referenced anywhere, - * and thus should be marked obsolete. - * - * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled. 
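Callers are expected to gate on SPA_FEATURE_OBSOLETE_COUNTS before consulting the remap deadlist. A hypothetical sketch of that check, using only spa_feature_is_enabled() and accessors declared later in this header; locking and error handling are omitted.

    /* Hypothetical sketch; illustrative only. */
    static boolean_t
    example_has_obsolete_remaps(dsl_dataset_t *ds)
    {
            spa_t *spa = dsl_dataset_get_spa(ds);

            if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
                    return (B_FALSE);
            return (dsl_dataset_remap_deadlist_exists(ds));
    }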
- */ - dsl_deadlist_t ds_remap_deadlist; - /* protects creation of the ds_remap_deadlist */ - kmutex_t ds_remap_deadlist_lock; - - /* protected by lock on pool's dp_dirty_datasets list */ - txg_node_t ds_dirty_link; - list_node_t ds_synced_link; - - /* - * ds_phys->ds_ is also protected by ds_lock. - * Protected by ds_lock: - */ - kmutex_t ds_lock; - objset_t *ds_objset; - uint64_t ds_userrefs; - void *ds_owner; - - /* - * Long holds prevent the ds from being destroyed; they allow the - * ds to remain held even after dropping the dp_config_rwlock. - * Owning counts as a long hold. See the comments above - * dsl_pool_hold() for details. - */ - zfs_refcount_t ds_longholds; - - /* no locking; only for making guesses */ - uint64_t ds_trysnap_txg; - - /* for objset_open() */ - kmutex_t ds_opening_lock; - - uint64_t ds_reserved; /* cached refreservation */ - uint64_t ds_quota; /* cached refquota */ - - kmutex_t ds_sendstream_lock; - list_t ds_sendstreams; - - /* - * When in the middle of a resumable receive, tracks how much - * progress we have made. - */ - uint64_t ds_resume_object[TXG_SIZE]; - uint64_t ds_resume_offset[TXG_SIZE]; - uint64_t ds_resume_bytes[TXG_SIZE]; - - /* Protected by our dsl_dir's dd_lock */ - list_t ds_prop_cbs; - - /* - * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset - * uses this feature. - */ - uint8_t ds_feature_inuse[SPA_FEATURES]; - - /* - * Set if we need to activate the feature on this dataset this txg - * (used only in syncing context). - */ - uint8_t ds_feature_activation_needed[SPA_FEATURES]; - - /* Protected by ds_lock; keep at end of struct for better locality */ - char ds_snapname[ZFS_MAX_DATASET_NAME_LEN]; -} dsl_dataset_t; - -inline dsl_dataset_phys_t * -dsl_dataset_phys(dsl_dataset_t *ds) -{ - return (ds->ds_dbuf->db_data); -} - -typedef struct dsl_dataset_promote_arg { - const char *ddpa_clonename; - dsl_dataset_t *ddpa_clone; - list_t shared_snaps, origin_snaps, clone_snaps; - dsl_dataset_t *origin_origin; /* origin of the origin */ - uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; - nvlist_t *err_ds; - cred_t *cr; -} dsl_dataset_promote_arg_t; - -typedef struct dsl_dataset_rollback_arg { - const char *ddra_fsname; - const char *ddra_tosnap; - void *ddra_owner; - nvlist_t *ddra_result; -} dsl_dataset_rollback_arg_t; - -typedef struct dsl_dataset_snapshot_arg { - nvlist_t *ddsa_snaps; - nvlist_t *ddsa_props; - nvlist_t *ddsa_errors; - cred_t *ddsa_cr; -} dsl_dataset_snapshot_arg_t; - -/* - * The max length of a temporary tag prefix is the number of hex digits - * required to express UINT64_MAX plus one for the hyphen. 
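Sixteen hex digits for UINT64_MAX plus one hyphen gives the 17-character limit defined below. A quick standalone check of that arithmetic:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            char buf[32];

            /* Longest possible prefix: UINT64_MAX in hex, followed by '-'. */
            (void) snprintf(buf, sizeof (buf), "%" PRIx64 "-",
                (uint64_t)UINT64_MAX);
            printf("%s -> %zu characters\n", buf, strlen(buf));    /* 17 */
            return (0);
    }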
- */ -#define MAX_TAG_PREFIX_LEN 17 - -#define dsl_dataset_is_snapshot(ds) \ - (dsl_dataset_phys(ds)->ds_num_children != 0) - -#define DS_UNIQUE_IS_ACCURATE(ds) \ - ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) - -int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, - dsl_dataset_t **dsp); -boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds, - void *tag); -int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **); -void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -int dsl_dataset_own(struct dsl_pool *dp, const char *name, - void *tag, dsl_dataset_t **dsp); -int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, - void *tag, dsl_dataset_t **dsp); -void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); -void dsl_dataset_name(dsl_dataset_t *ds, char *name); -boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); -int dsl_dataset_namelen(dsl_dataset_t *ds); -boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); -uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, - dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); -uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - uint64_t flags, dmu_tx_t *tx); -void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx); -int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx); -int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); -void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx); -int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx); -int dsl_dataset_promote(const char *name, char *conflsnap); -int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, - boolean_t force); -int dsl_dataset_rename_snapshot(const char *fsname, - const char *oldsnapname, const char *newsnapname, boolean_t recursive); -int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, - minor_t cleanup_minor, const char *htag); - -blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); - -spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); - -boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, - dsl_dataset_t *snap); - -void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); -void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx); - -void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, - dmu_tx_t *tx); -int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, - dmu_tx_t *tx, boolean_t async); -void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, - uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx); - -void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); - -int get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val); -char *get_receive_resume_stats_impl(dsl_dataset_t *ds); -char *get_child_receive_stats(dsl_dataset_t *ds); -uint64_t dsl_get_refratio(dsl_dataset_t *ds); -uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds); -uint64_t dsl_get_compressratio(dsl_dataset_t *ds); -uint64_t dsl_get_used(dsl_dataset_t *ds); -uint64_t dsl_get_creation(dsl_dataset_t *ds); -uint64_t dsl_get_creationtxg(dsl_dataset_t *ds); -uint64_t dsl_get_refquota(dsl_dataset_t *ds); -uint64_t dsl_get_refreservation(dsl_dataset_t *ds); -uint64_t dsl_get_guid(dsl_dataset_t *ds); -uint64_t dsl_get_unique(dsl_dataset_t *ds); -uint64_t dsl_get_objsetid(dsl_dataset_t *ds); -uint64_t dsl_get_userrefs(dsl_dataset_t *ds); -uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds); -uint64_t dsl_get_referenced(dsl_dataset_t *ds); -uint64_t 
dsl_get_numclones(dsl_dataset_t *ds); -uint64_t dsl_get_inconsistent(dsl_dataset_t *ds); -uint64_t dsl_get_available(dsl_dataset_t *ds); -int dsl_get_written(dsl_dataset_t *ds, uint64_t *written); -int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap); -int dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, - char *source); - -void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv); - -void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); - -void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); -void dsl_dataset_space(dsl_dataset_t *ds, - uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t *usedobjsp, uint64_t *availobjsp); -uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); -int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds); - -int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); - -int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - uint64_t asize, uint64_t inflight, uint64_t *used, - uint64_t *ref_rsrv); -int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, - uint64_t quota); -int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, - uint64_t reservation); - -boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, - uint64_t earlier_txg); -void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag); -void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); -boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); - -int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); -void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, dmu_tx_t *tx); -int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr); -void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx); - -void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, - dmu_tx_t *tx); -void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); -int dsl_dataset_get_snapname(dsl_dataset_t *ds); -int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, - uint64_t *value); -int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, - boolean_t adj_cnt); -void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, - zprop_source_t source, uint64_t value, dmu_tx_t *tx); -void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); -boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds); -boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds); - -int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx); -void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx); -int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, - nvlist_t *result); - -uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds); -void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); -boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); -void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); - -void dsl_dataset_deactivate_feature(uint64_t dsobj, - spa_feature_t f, dmu_tx_t *tx); - -#ifdef ZFS_DEBUG -#define dprintf_ds(ds, fmt, ...) 
do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \ - dsl_dataset_name(ds, __ds_name); \ - dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ - kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_ds(dd, fmt, ...) -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DATASET_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DSL_DEADLIST_H -#define _SYS_DSL_DEADLIST_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dmu_buf; -struct dsl_dataset; - -typedef struct dsl_deadlist_phys { - uint64_t dl_used; - uint64_t dl_comp; - uint64_t dl_uncomp; - uint64_t dl_pad[37]; /* pad out to 320b for future expansion */ -} dsl_deadlist_phys_t; - -typedef struct dsl_deadlist { - objset_t *dl_os; - uint64_t dl_object; - avl_tree_t dl_tree; - boolean_t dl_havetree; - struct dmu_buf *dl_dbuf; - dsl_deadlist_phys_t *dl_phys; - kmutex_t dl_lock; - - /* if it's the old on-disk format: */ - bpobj_t dl_bpobj; - boolean_t dl_oldfmt; -} dsl_deadlist_t; - -typedef struct dsl_deadlist_entry { - avl_node_t dle_node; - uint64_t dle_mintxg; - bpobj_t dle_bpobj; -} dsl_deadlist_entry_t; - -void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); -void dsl_deadlist_close(dsl_deadlist_t *dl); -uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); -void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); -void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); -void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); -void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); -uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, - uint64_t mrs_obj, dmu_tx_t *tx); -void dsl_deadlist_space(dsl_deadlist_t *dl, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -void dsl_deadlist_space_range(dsl_deadlist_t *dl, - uint64_t mintxg, uint64_t maxtxg, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); -void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, - dmu_tx_t *tx); -boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DEADLIST_H 
*/ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DSL_DELEG_H -#define _SYS_DSL_DELEG_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZFS_DELEG_PERM_NONE "" -#define ZFS_DELEG_PERM_CREATE "create" -#define ZFS_DELEG_PERM_DESTROY "destroy" -#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" -#define ZFS_DELEG_PERM_ROLLBACK "rollback" -#define ZFS_DELEG_PERM_CLONE "clone" -#define ZFS_DELEG_PERM_PROMOTE "promote" -#define ZFS_DELEG_PERM_RENAME "rename" -#define ZFS_DELEG_PERM_MOUNT "mount" -#define ZFS_DELEG_PERM_SHARE "share" -#define ZFS_DELEG_PERM_SEND "send" -#define ZFS_DELEG_PERM_RECEIVE "receive" -#define ZFS_DELEG_PERM_ALLOW "allow" -#define ZFS_DELEG_PERM_USERPROP "userprop" -#define ZFS_DELEG_PERM_VSCAN "vscan" -#define ZFS_DELEG_PERM_USERQUOTA "userquota" -#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" -#define ZFS_DELEG_PERM_USERUSED "userused" -#define ZFS_DELEG_PERM_GROUPUSED "groupused" -#define ZFS_DELEG_PERM_HOLD "hold" -#define ZFS_DELEG_PERM_RELEASE "release" -#define ZFS_DELEG_PERM_DIFF "diff" -#define ZFS_DELEG_PERM_BOOKMARK "bookmark" -#define ZFS_DELEG_PERM_REMAP "remap" - -/* - * Note: the names of properties that are marked delegatable are also - * valid delegated permissions - */ - -int dsl_deleg_get(const char *ddname, nvlist_t **nvp); -int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); -int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); -int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr); -void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); -int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); -int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); -int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx); -boolean_t dsl_delegation_on(objset_t *os); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DELEG_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - */ - -#ifndef _SYS_DSL_DESTROY_H -#define _SYS_DSL_DESTROY_H - -#ifdef __cplusplus -extern "C" { -#endif - -struct nvlist; -struct dsl_dataset; -struct dmu_tx; - -int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t, - struct nvlist *); -int dsl_destroy_snapshot(const char *, boolean_t); -int dsl_destroy_head(const char *); -int dsl_destroy_head_check_impl(struct dsl_dataset *, int); -void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *); -int dsl_destroy_inconsistent(const char *, void *); -int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t); -void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *, - boolean_t, struct dmu_tx *); - -typedef struct dsl_destroy_snapshot_arg { - const char *ddsa_name; - boolean_t ddsa_defer; -} dsl_destroy_snapshot_arg_t; - -int dsl_destroy_snapshot_check(void *, dmu_tx_t *); -void dsl_destroy_snapshot_sync(void *, dmu_tx_t *); - -typedef struct dsl_destroy_head_arg { - const char *ddha_name; -} dsl_destroy_head_arg_t; - -int dsl_destroy_head_check(void *, dmu_tx_t *); -void dsl_destroy_head_sync(void *, dmu_tx_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DESTROY_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
- */ - -#ifndef _SYS_DSL_DIR_H -#define _SYS_DSL_DIR_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; - -/* - * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. - * They should be of the format :. - */ - -#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" -#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" -#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg" - -typedef enum dd_used { - DD_USED_HEAD, - DD_USED_SNAP, - DD_USED_CHILD, - DD_USED_CHILD_RSRV, - DD_USED_REFRSRV, - DD_USED_NUM -} dd_used_t; - -#define DD_FLAG_USED_BREAKDOWN (1<<0) - -typedef struct dsl_dir_phys { - uint64_t dd_creation_time; /* not actually used */ - uint64_t dd_head_dataset_obj; - uint64_t dd_parent_obj; - uint64_t dd_origin_obj; - uint64_t dd_child_dir_zapobj; - /* - * how much space our children are accounting for; for leaf - * datasets, == physical space used by fs + snaps - */ - uint64_t dd_used_bytes; - uint64_t dd_compressed_bytes; - uint64_t dd_uncompressed_bytes; - /* Administrative quota setting */ - uint64_t dd_quota; - /* Administrative reservation setting */ - uint64_t dd_reserved; - uint64_t dd_props_zapobj; - uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ - uint64_t dd_flags; - uint64_t dd_used_breakdown[DD_USED_NUM]; - uint64_t dd_clones; /* dsl_dir objects */ - uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */ -} dsl_dir_phys_t; - -struct dsl_dir { - dmu_buf_user_t dd_dbu; - - /* These are immutable; no lock needed: */ - uint64_t dd_object; - dsl_pool_t *dd_pool; - - /* Stable until user eviction; no lock needed: */ - dmu_buf_t *dd_dbuf; - - /* protected by lock on pool's dp_dirty_dirs list */ - txg_node_t dd_dirty_link; - - /* protected by dp_config_rwlock */ - dsl_dir_t *dd_parent; - - /* Protected by dd_lock */ - kmutex_t dd_lock; - list_t dd_props; /* list of dsl_prop_record_t's */ - timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ - uint64_t dd_origin_txg; - - /* gross estimate of space used by in-flight tx's */ - uint64_t dd_tempreserved[TXG_SIZE]; - /* amount of space we expect to write; == amount of dirty data */ - int64_t dd_space_towrite[TXG_SIZE]; - - /* protected by dd_lock; keep at end of struct for better locality */ - char dd_myname[ZFS_MAX_DATASET_NAME_LEN]; -}; - -inline dsl_dir_phys_t * -dsl_dir_phys(dsl_dir_t *dd) -{ - return (dd->dd_dbuf->db_data); -} - -void dsl_dir_rele(dsl_dir_t *dd, void *tag); -void dsl_dir_async_rele(dsl_dir_t *dd, void *tag); -int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, - dsl_dir_t **, const char **tail); -int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **); -void dsl_dir_name(dsl_dir_t *dd, char *buf); -int dsl_dir_namelen(dsl_dir_t *dd); -uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, - const char *name, dmu_tx_t *tx); - -uint64_t dsl_dir_get_used(dsl_dir_t *dd); -uint64_t dsl_dir_get_compressed(dsl_dir_t *dd); -uint64_t dsl_dir_get_quota(dsl_dir_t *dd); -uint64_t dsl_dir_get_reservation(dsl_dir_t *dd); -uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd); -uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd); -uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd); -uint64_t dsl_dir_get_usedds(dsl_dir_t *dd); -uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd); -uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd); -void dsl_dir_get_origin(dsl_dir_t *dd, char *buf); -int dsl_dir_get_filesystem_count(dsl_dir_t *dd, 
uint64_t *count); -int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count); -int dsl_dir_get_remaptxg(dsl_dir_t *dd, uint64_t *count); - -void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); -uint64_t dsl_dir_space_available(dsl_dir_t *dd, - dsl_dir_t *ancestor, int64_t delta, int ondiskonly); -void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); -void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); -int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, - uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx); -void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); -void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); -void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); -void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); -int dsl_dir_set_quota(const char *ddname, zprop_source_t source, - uint64_t quota); -int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, - uint64_t reservation); -int dsl_dir_activate_fs_ss_limit(const char *); -int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, - cred_t *); -void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); -int dsl_dir_update_last_remap_txg(dsl_dir_t *, uint64_t); -int dsl_dir_rename(const char *oldname, const char *newname); -int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, - uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); -boolean_t dsl_dir_is_clone(dsl_dir_t *dd); -void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, - uint64_t reservation, cred_t *cr, dmu_tx_t *tx); -void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); -timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); -void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, - dmu_tx_t *tx); -void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); -boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); - -/* internal reserved dir name */ -#define MOS_DIR_NAME "$MOS" -#define ORIGIN_DIR_NAME "$ORIGIN" -#define FREE_DIR_NAME "$FREE" -#define LEAK_DIR_NAME "$LEAK" - -#ifdef ZFS_DEBUG -#define dprintf_dd(dd, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \ - dsl_dir_name(dd, __ds_name); \ - dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ - kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_dd(dd, fmt, ...) -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_DIR_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ /dev/null @@ -1,191 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - */ - -#ifndef _SYS_DSL_POOL_H -#define _SYS_DSL_POOL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct objset; -struct dsl_dir; -struct dsl_dataset; -struct dsl_pool; -struct dmu_tx; -struct dsl_scan; - -extern uint64_t zfs_dirty_data_max; -extern uint64_t zfs_dirty_data_max_max; -extern uint64_t zfs_dirty_data_sync_pct; -extern int zfs_dirty_data_max_percent; -extern int zfs_delay_min_dirty_percent; -extern uint64_t zfs_delay_scale; - -/* These macros are for indexing into the zfs_all_blkstats_t. */ -#define DMU_OT_DEFERRED DMU_OT_NONE -#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */ -#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1) - -typedef struct zfs_blkstat { - uint64_t zb_count; - uint64_t zb_asize; - uint64_t zb_lsize; - uint64_t zb_psize; - uint64_t zb_gangs; - uint64_t zb_ditto_2_of_2_samevdev; - uint64_t zb_ditto_2_of_3_samevdev; - uint64_t zb_ditto_3_of_3_samevdev; -} zfs_blkstat_t; - -typedef struct zfs_all_blkstats { - zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; - kmutex_t zab_lock; -} zfs_all_blkstats_t; - - -typedef struct dsl_pool { - /* Immutable */ - spa_t *dp_spa; - struct objset *dp_meta_objset; - struct dsl_dir *dp_root_dir; - struct dsl_dir *dp_mos_dir; - struct dsl_dir *dp_free_dir; - struct dsl_dir *dp_leak_dir; - struct dsl_dataset *dp_origin_snap; - uint64_t dp_root_dir_obj; - struct taskq *dp_vnrele_taskq; - - /* No lock needed - sync context only */ - blkptr_t dp_meta_rootbp; - uint64_t dp_tmp_userrefs_obj; - bpobj_t dp_free_bpobj; - uint64_t dp_bptree_obj; - uint64_t dp_empty_bpobj; - bpobj_t dp_obsolete_bpobj; - - struct dsl_scan *dp_scan; - - /* Uses dp_lock */ - kmutex_t dp_lock; - kcondvar_t dp_spaceavail_cv; - uint64_t dp_dirty_pertxg[TXG_SIZE]; - uint64_t dp_dirty_total; - uint64_t dp_long_free_dirty_pertxg[TXG_SIZE]; - uint64_t dp_mos_used_delta; - uint64_t dp_mos_compressed_delta; - uint64_t dp_mos_uncompressed_delta; - - /* - * Time of most recently scheduled (furthest in the future) - * wakeup for delayed transactions. - */ - hrtime_t dp_last_wakeup; - - /* Has its own locking */ - tx_state_t dp_tx; - txg_list_t dp_dirty_datasets; - txg_list_t dp_dirty_zilogs; - txg_list_t dp_dirty_dirs; - txg_list_t dp_sync_tasks; - txg_list_t dp_early_sync_tasks; - taskq_t *dp_sync_taskq; - taskq_t *dp_zil_clean_taskq; - - /* - * Protects administrative changes (properties, namespace) - * - * It is only held for write in syncing context. Therefore - * syncing context does not need to ever have it for read, since - * nobody else could possibly have it for write. 
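Read-side consumers therefore take dp_config_rwlock as readers through the helpers declared below. A hypothetical sketch of that bracketing (not taken from dsl_pool.c); the body in the middle stands in for whatever read-only namespace or property traversal the caller needs.

    /* Hypothetical sketch; assumes the caller already holds a dsl_pool_t. */
    static void
    example_with_config_lock(dsl_pool_t *dp, void *tag)
    {
            dsl_pool_config_enter(dp, tag);
            ASSERT(dsl_pool_config_held(dp));
            /* ... read-only namespace / property traversal goes here ... */
            dsl_pool_config_exit(dp, tag);
    }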
- */ - rrwlock_t dp_config_rwlock; - - zfs_all_blkstats_t *dp_blkstats; -} dsl_pool_t; - -int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); -int dsl_pool_open(dsl_pool_t *dp); -void dsl_pool_close(dsl_pool_t *dp); -dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); -void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); -void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); -int dsl_pool_sync_context(dsl_pool_t *dp); -uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); -uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, - zfs_space_check_t slop_policy); -void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); -void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); -void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, - const blkptr_t *bpp); -void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_mos_diduse_space(dsl_pool_t *dp, - int64_t used, int64_t comp, int64_t uncomp); -void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp, - int64_t used, int64_t comp, int64_t uncomp); -void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); -void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag); -void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); -boolean_t dsl_pool_config_held(dsl_pool_t *dp); -boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp); -boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp); - -taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); - -int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, uint64_t now, dmu_tx_t *tx); -int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, dmu_tx_t *tx); -void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); -int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); -int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); -void dsl_pool_rele(dsl_pool_t *dp, void *tag); - -void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); -void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_POOL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_DSL_PROP_H -#define _SYS_DSL_PROP_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_dataset; -struct dsl_dir; - -/* The callback func may not call into the DMU or DSL! */ -typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); - -typedef struct dsl_prop_record { - list_node_t pr_node; /* link on dd_props */ - const char *pr_propname; - list_t pr_cbs; -} dsl_prop_record_t; - -typedef struct dsl_prop_cb_record { - list_node_t cbr_pr_node; /* link on pr_cbs */ - list_node_t cbr_ds_node; /* link on ds_prop_cbs */ - dsl_prop_record_t *cbr_pr; - struct dsl_dataset *cbr_ds; - dsl_prop_changed_cb_t *cbr_func; - void *cbr_arg; -} dsl_prop_cb_record_t; - -typedef struct dsl_props_arg { - nvlist_t *pa_props; - zprop_source_t pa_source; -} dsl_props_arg_t; - -void dsl_prop_init(dsl_dir_t *dd); -void dsl_prop_fini(dsl_dir_t *dd); -int dsl_prop_register(struct dsl_dataset *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg); -void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg); -void dsl_prop_notify_all(struct dsl_dir *dd); -boolean_t dsl_prop_hascb(struct dsl_dataset *ds); - -int dsl_prop_get(const char *ddname, const char *propname, - int intsz, int numints, void *buf, char *setpoint); -int dsl_prop_get_integer(const char *ddname, const char *propname, - uint64_t *valuep, char *setpoint); -int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); -int dsl_prop_get_received(const char *dsname, nvlist_t **nvp); -int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, - int intsz, int numints, void *buf, char *setpoint); -int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname, - uint64_t *valuep); -int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint, - boolean_t snapshot); - -void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source, - nvlist_t *props, dmu_tx_t *tx); -void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname, - zprop_source_t source, int intsz, int numints, const void *value, - dmu_tx_t *tx); -int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); -int dsl_prop_set_int(const char *dsname, const char *propname, - zprop_source_t source, uint64_t value); -int dsl_prop_set_string(const char *dsname, const char *propname, - zprop_source_t source, const char *value); -int dsl_prop_inherit(const char *dsname, const char *propname, - zprop_source_t source); - -int dsl_prop_predict(dsl_dir_t *dd, const char *propname, - zprop_source_t source, uint64_t value, uint64_t *newvalp); - -/* flag first receive on or after SPA_VERSION_RECVD_PROPS */ -boolean_t dsl_prop_get_hasrecvd(const char *dsname); -int dsl_prop_set_hasrecvd(const char *dsname); -void dsl_prop_unset_hasrecvd(const char *dsname); - -void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); -void dsl_prop_nvlist_add_string(nvlist_t *nv, - zfs_prop_t prop, const char *value); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_PROP_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2017 Datto Inc. - */ - -#ifndef _SYS_DSL_SCAN_H -#define _SYS_DSL_SCAN_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct objset; -struct dsl_dir; -struct dsl_dataset; -struct dsl_pool; -struct dmu_tx; - -/* - * All members of this structure must be uint64_t, for byteswap - * purposes. - */ -typedef struct dsl_scan_phys { - uint64_t scn_func; /* pool_scan_func_t */ - uint64_t scn_state; /* dsl_scan_state_t */ - uint64_t scn_queue_obj; - uint64_t scn_min_txg; - uint64_t scn_max_txg; - uint64_t scn_cur_min_txg; - uint64_t scn_cur_max_txg; - uint64_t scn_start_time; - uint64_t scn_end_time; - uint64_t scn_to_examine; /* total bytes to be scanned */ - uint64_t scn_examined; /* bytes scanned so far */ - uint64_t scn_to_process; - uint64_t scn_processed; - uint64_t scn_errors; /* scan I/O error count */ - uint64_t scn_ddt_class_max; - ddt_bookmark_t scn_ddt_bookmark; - zbookmark_phys_t scn_bookmark; - uint64_t scn_flags; /* dsl_scan_flags_t */ -} dsl_scan_phys_t; - -#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) - -typedef enum dsl_scan_flags { - DSF_VISIT_DS_AGAIN = 1<<0, - DSF_SCRUB_PAUSED = 1<<1, -} dsl_scan_flags_t; - -/* - * Every pool will have one dsl_scan_t and this structure will contain - * in-memory information about the scan and a pointer to the on-disk - * representation (i.e. dsl_scan_phys_t). Most of the state of the scan - * is contained on-disk to allow the scan to resume in the event of a reboot - * or panic. This structure maintains information about the behavior of a - * running scan, some caching information, and how it should traverse the pool. - * - * The following members of this structure direct the behavior of the scan: - * - * scn_suspending - a scan that cannot be completed in a single txg or - * has exceeded its allotted time will need to suspend. - * When this flag is set the scanner will stop traversing - * the pool and write out the current state to disk. - * - * scn_restart_txg - directs the scanner to either restart or start a - * a scan at the specified txg value. - * - * scn_done_txg - when a scan completes its traversal it will set - * the completion txg to the next txg. This is necessary - * to ensure that any blocks that were freed during - * the scan but have not yet been processed (i.e deferred - * frees) are accounted for. - * - * This structure also maintains information about deferred frees which are - * a special kind of traversal. Deferred free can exist in either a bptree or - * a bpobj structure. The scn_is_bptree flag will indicate the type of - * deferred free that is in progress. 
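The comment on dsl_scan_phys_t above requires every member to be a uint64_t so the on-disk record can be byteswapped as a flat array of SCAN_PHYS_NUMINTS words. A standalone sketch of that idea, using a simplified stand-in structure and a local bswap helper rather than the kernel's byteswap routines:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in: all members are uint64_t, as the header requires. */
typedef struct scan_phys {
    uint64_t scn_func;
    uint64_t scn_state;
    uint64_t scn_min_txg;
    uint64_t scn_max_txg;
} scan_phys_t;

#define SCAN_PHYS_NUMINTS (sizeof (scan_phys_t) / sizeof (uint64_t))

static uint64_t
bswap64(uint64_t x)
{
    return (((x & 0x00000000000000ffULL) << 56) |
        ((x & 0x000000000000ff00ULL) << 40) |
        ((x & 0x0000000000ff0000ULL) << 24) |
        ((x & 0x00000000ff000000ULL) <<  8) |
        ((x & 0x000000ff00000000ULL) >>  8) |
        ((x & 0x0000ff0000000000ULL) >> 24) |
        ((x & 0x00ff000000000000ULL) >> 40) |
        ((x & 0xff00000000000000ULL) >> 56));
}

/* Because every field is a uint64_t, byteswapping is a single loop. */
static void
scan_phys_byteswap(scan_phys_t *sp)
{
    uint64_t *words = (uint64_t *)sp;

    for (size_t i = 0; i < SCAN_PHYS_NUMINTS; i++)
        words[i] = bswap64(words[i]);
}

int
main(void)
{
    scan_phys_t sp = { .scn_func = 1, .scn_state = 2,
        .scn_min_txg = 50, .scn_max_txg = 60 };

    scan_phys_byteswap(&sp);
    scan_phys_byteswap(&sp);              /* swapping twice restores the original */
    printf("scn_min_txg = %ju\n", (uintmax_t)sp.scn_min_txg);
    return (0);
}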
If the deferred free is part of an - * asynchronous destroy then the scn_async_destroying flag will be set. - */ -typedef struct dsl_scan { - struct dsl_pool *scn_dp; - - uint64_t scn_restart_txg; - uint64_t scn_done_txg; - uint64_t scn_sync_start_time; - uint64_t scn_issued_before_pass; - - /* for freeing blocks */ - boolean_t scn_is_bptree; - boolean_t scn_async_destroying; - boolean_t scn_async_stalled; - uint64_t scn_async_block_min_time_ms; - /* flags and stats for controlling scan state */ - boolean_t scn_is_sorted; /* doing sequential scan */ - boolean_t scn_clearing; /* scan is issuing sequential extents */ - boolean_t scn_checkpointing; /* scan is issuing all queued extents */ - boolean_t scn_suspending; /* scan is suspending until next txg */ - uint64_t scn_last_checkpoint; /* time of last checkpoint */ - - /* members for thread synchronization */ - zio_t *scn_zio_root; /* root zio for waiting on IO */ - taskq_t *scn_taskq; /* task queue for issuing extents */ - - /* for controlling scan prefetch, protected by spa_scrub_lock */ - boolean_t scn_prefetch_stop; /* prefetch should stop */ - zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */ - avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */ - uint64_t scn_maxinflight_bytes; /* max bytes in flight for poool */ - - /* per txg statistics */ - uint64_t scn_visited_this_txg; /* total bps visited this txg */ - uint64_t scn_holes_this_txg; - uint64_t scn_lt_min_this_txg; - uint64_t scn_gt_max_this_txg; - uint64_t scn_ddt_contained_this_txg; - uint64_t scn_objsets_visited_this_txg; - uint64_t scn_avg_seg_size_this_txg; - uint64_t scn_segs_this_txg; - uint64_t scn_avg_zio_size_this_txg; - uint64_t scn_zios_this_txg; - - /* members needed for syncing scan status to disk */ - dsl_scan_phys_t scn_phys; /* on disk representation of scan */ - dsl_scan_phys_t scn_phys_cached; - avl_tree_t scn_queue; /* queue of datasets to scan */ - uint64_t scn_bytes_pending; /* outstanding data to issue */ -} dsl_scan_t; - -typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; - -void dsl_scan_global_init(void); - -void scan_init(void); -void scan_fini(void); -int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); -void dsl_scan_fini(struct dsl_pool *dp); -void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); -int dsl_scan_cancel(struct dsl_pool *); -int dsl_scan(struct dsl_pool *, pool_scan_func_t); -boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); -int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); -void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); -boolean_t dsl_scan_resilvering(struct dsl_pool *dp); -boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); -void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx); -void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); -void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, - struct dmu_tx *tx); -boolean_t dsl_scan_active(dsl_scan_t *scn); -boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn); -void dsl_scan_freed(spa_t *spa, const blkptr_t *bp); -void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue); -void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_SCAN_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h 
=================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_DSL_SYNCTASK_H -#define _SYS_DSL_SYNCTASK_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_pool; - -typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *); -typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *); - -typedef enum zfs_space_check { - /* - * Normal space check: if there is less than 3.2% free space, - * the operation will fail. Operations which are logically - * creating things should use this (e.g. "zfs create", "zfs snapshot"). - * User writes (via the ZPL / ZVOL) also fail at this point. - */ - ZFS_SPACE_CHECK_NORMAL, - - /* - * Space check allows use of half the slop space. If there - * is less than 1.6% free space, the operation will fail. Most - * operations should use this (e.g. "zfs set", "zfs rename"), - * because we want them to succeed even after user writes are failing, - * so that they can be used as part of the space recovery process. - */ - ZFS_SPACE_CHECK_RESERVED, - - /* - * Space check allows use of three quarters of the slop space. - * If there is less than 0.8% free space, the operation will - * fail. - */ - ZFS_SPACE_CHECK_EXTRA_RESERVED, - - /* - * In all cases "zfs destroy" is expected to result in an net - * reduction of space, except one. When the pool has a - * checkpoint, space freed by "zfs destroy" will not actually - * free anything internally. Thus, it starts failing after - * three quarters of the slop space is exceeded. - */ - ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED, - - /* - * A channel program can run a "zfs destroy" as part of its - * script and therefore has the same space_check policy when - * being evaluated. - */ - ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY, - - /* - * No space check is performed. This level of space check should - * be used cautiously as operations that use it can even run when - * 0.8% capacity is left for use. In this scenario, if there is a - * checkpoint, async destroys are suspended and any kind of freeing - * can potentially add space instead of freeing it. - * - * See also the comments above spa_slop_shift. 
- */ - ZFS_SPACE_CHECK_NONE, - - ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE, - -} zfs_space_check_t; - -typedef struct dsl_sync_task { - txg_node_t dst_node; - struct dsl_pool *dst_pool; - uint64_t dst_txg; - int dst_space; - zfs_space_check_t dst_space_check; - dsl_checkfunc_t *dst_checkfunc; - dsl_syncfunc_t *dst_syncfunc; - void *dst_arg; - int dst_error; - boolean_t dst_nowaiter; -} dsl_sync_task_t; - -void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *); -int dsl_sync_task(const char *, dsl_checkfunc_t *, - dsl_syncfunc_t *, void *, int, zfs_space_check_t); -void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, - void *, int, zfs_space_check_t, dmu_tx_t *); -int dsl_early_sync_task(const char *, dsl_checkfunc_t *, - dsl_syncfunc_t *, void *, int, zfs_space_check_t); -void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, - void *, int, zfs_space_check_t, dmu_tx_t *); -int dsl_sync_task_sig(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, - dsl_sigfunc_t *, void *, int, zfs_space_check_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_SYNCTASK_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h +++ /dev/null @@ -1,57 +0,0 @@ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. 
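The zfs_space_check_t levels above carve up a fixed slop region: a normal operation fails once free space drops below 3.2%, ZFS_SPACE_CHECK_RESERVED may consume half of that slop (failing below 1.6%), and ZFS_SPACE_CHECK_EXTRA_RESERVED three quarters of it (failing below 0.8%). A small sketch of the resulting free-space floors; the pool size, the space_floor() helper, and the percentage arithmetic are illustrative stand-ins for the real spa_slop_shift-based calculation, which is not part of this header.

#include <stdint.h>
#include <stdio.h>

enum space_check { CHECK_NORMAL, CHECK_RESERVED, CHECK_EXTRA_RESERVED };

/*
 * Free-space floor below which an operation fails, per the comments above:
 * 3.2% (full slop), 1.6% (half of the slop left), 0.8% (a quarter left).
 */
static uint64_t
space_floor(uint64_t pool_size, enum space_check check)
{
    uint64_t slop = pool_size * 32 / 1000;        /* 3.2% */

    switch (check) {
    case CHECK_NORMAL:
        return (slop);
    case CHECK_RESERVED:
        return (slop / 2);
    case CHECK_EXTRA_RESERVED:
        return (slop / 4);
    }
    return (slop);
}

int
main(void)
{
    uint64_t pool = 1000ULL << 30;                /* hypothetical 1000 GiB pool */

    printf("normal floor:   %ju bytes\n",
        (uintmax_t)space_floor(pool, CHECK_NORMAL));
    printf("reserved floor: %ju bytes\n",
        (uintmax_t)space_floor(pool, CHECK_RESERVED));
    printf("extra floor:    %ju bytes\n",
        (uintmax_t)space_floor(pool, CHECK_EXTRA_RESERVED));
    return (0);
}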
- */ - -#ifndef _SYS_DSL_USERHOLD_H -#define _SYS_DSL_USERHOLD_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_pool; -struct dsl_dataset; -struct dmu_tx; - -int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, - nvlist_t *errlist); -int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist); -int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl); -void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds); -int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag, - boolean_t temphold, struct dmu_tx *tx); -void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag, - minor_t minor, uint64_t now, struct dmu_tx *tx); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DSL_USERHOLD_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2017, Intel Corporation. 
- */ - -#ifndef _SYS_METASLAB_H -#define _SYS_METASLAB_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -typedef struct metaslab_ops { - uint64_t (*msop_alloc)(metaslab_t *, uint64_t); -} metaslab_ops_t; - - -extern metaslab_ops_t *zfs_metaslab_ops; - -int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, - metaslab_t **); -void metaslab_fini(metaslab_t *); - -int metaslab_load(metaslab_t *); -void metaslab_unload(metaslab_t *); - -uint64_t metaslab_allocated_space(metaslab_t *); - -void metaslab_sync(metaslab_t *, uint64_t); -void metaslab_sync_done(metaslab_t *, uint64_t); -void metaslab_sync_reassess(metaslab_group_t *); -uint64_t metaslab_block_maxsize(metaslab_t *); - -/* - * metaslab alloc flags - */ -#define METASLAB_HINTBP_FAVOR 0x0 -#define METASLAB_HINTBP_AVOID 0x1 -#define METASLAB_GANG_HEADER 0x2 -#define METASLAB_GANG_CHILD 0x4 -#define METASLAB_ASYNC_ALLOC 0x8 -#define METASLAB_DONT_THROTTLE 0x10 -#define METASLAB_MUST_RESERVE 0x20 -#define METASLAB_FASTWRITE 0x40 - -int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, - int); -int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, - dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); -void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); -void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); -void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); -void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); -void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); -int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); -int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); -void metaslab_check_free(spa_t *, const blkptr_t *); - -void metaslab_alloc_trace_init(void); -void metaslab_alloc_trace_fini(void); -void metaslab_trace_init(zio_alloc_list_t *); -void metaslab_trace_fini(zio_alloc_list_t *); - -metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); -void metaslab_class_destroy(metaslab_class_t *); -int metaslab_class_validate(metaslab_class_t *); -void metaslab_class_histogram_verify(metaslab_class_t *); -uint64_t metaslab_class_fragmentation(metaslab_class_t *); -uint64_t metaslab_class_expandable_space(metaslab_class_t *); -boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, - zio_t *, int); -void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); - -uint64_t metaslab_class_get_alloc(metaslab_class_t *); -uint64_t metaslab_class_get_space(metaslab_class_t *); -uint64_t metaslab_class_get_dspace(metaslab_class_t *); -uint64_t metaslab_class_get_deferred(metaslab_class_t *); -uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc); - -metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); -void metaslab_group_destroy(metaslab_group_t *); -void metaslab_group_activate(metaslab_group_t *); -void metaslab_group_passivate(metaslab_group_t *); -boolean_t metaslab_group_initialized(metaslab_group_t *); -uint64_t metaslab_group_get_space(metaslab_group_t *); -void metaslab_group_histogram_verify(metaslab_group_t *); -uint64_t metaslab_group_fragmentation(metaslab_group_t *); -void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); -void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, - boolean_t); -void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); -void 
metaslab_recalculate_weight_and_sort(metaslab_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_METASLAB_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ /dev/null @@ -1,501 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - */ - -#ifndef _SYS_METASLAB_IMPL_H -#define _SYS_METASLAB_IMPL_H - -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Metaslab allocation tracing record. - */ -typedef struct metaslab_alloc_trace { - list_node_t mat_list_node; - metaslab_group_t *mat_mg; - metaslab_t *mat_msp; - uint64_t mat_size; - uint64_t mat_weight; - uint32_t mat_dva_id; - uint64_t mat_offset; - int mat_allocator; -} metaslab_alloc_trace_t; - -/* - * Used by the metaslab allocation tracing facility to indicate - * error conditions. These errors are stored to the offset member - * of the metaslab_alloc_trace_t record and displayed by mdb. - */ -typedef enum trace_alloc_type { - TRACE_ALLOC_FAILURE = -1ULL, - TRACE_TOO_SMALL = -2ULL, - TRACE_FORCE_GANG = -3ULL, - TRACE_NOT_ALLOCATABLE = -4ULL, - TRACE_GROUP_FAILURE = -5ULL, - TRACE_ENOSPC = -6ULL, - TRACE_CONDENSING = -7ULL, - TRACE_VDEV_ERROR = -8ULL, - TRACE_INITIALIZING = -9ULL -} trace_alloc_type_t; - -#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) -#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) -#define METASLAB_WEIGHT_CLAIM (1ULL << 61) -#define METASLAB_WEIGHT_TYPE (1ULL << 60) -#define METASLAB_ACTIVE_MASK \ - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ - METASLAB_WEIGHT_CLAIM) - -/* - * The metaslab weight is used to encode the amount of free space in a - * metaslab, such that the "best" metaslab appears first when sorting the - * metaslabs by weight. The weight (and therefore the "best" metaslab) can - * be determined in two different ways: by computing a weighted sum of all - * the free space in the metaslab (a space based weight) or by counting only - * the free segments of the largest size (a segment based weight). We prefer - * the segment based weight because it reflects how the free space is - * comprised, but we cannot always use it -- legacy pools do not have the - * space map histogram information necessary to determine the largest - * contiguous regions. 
Pools that have the space map histogram determine - * the segment weight by looking at each bucket in the histogram and - * determining the free space whose size in bytes is in the range: - * [2^i, 2^(i+1)) - * We then encode the largest index, i, that contains regions into the - * segment-weighted value. - * - * Space-based weight: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PSC1| weighted-free space | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * PS - indicates primary and secondary activation - * C - indicates activation for claimed block zio - * space - the fragmentation-weighted space - * - * Segment-based weight: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PSC0| idx| count of segments in region | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * PS - indicates primary and secondary activation - * C - indicates activation for claimed block zio - * idx - index for the highest bucket in the histogram - * count - number of segments in the specified bucket - */ -#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) -#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) - -#define WEIGHT_IS_SPACEBASED(weight) \ - ((weight) == 0 || BF64_GET((weight), 60, 1)) -#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) - -/* - * These macros are only applicable to segment-based weighting. - */ -#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) -#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) -#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) -#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) - -/* - * A metaslab class encompasses a category of allocatable top-level vdevs. - * Each top-level vdev is associated with a metaslab group which defines - * the allocatable region for that vdev. Examples of these categories include - * "normal" for data block allocations (i.e. main pool allocations) or "log" - * for allocations designated for intent log devices (i.e. slog devices). - * When a block allocation is requested from the SPA it is associated with a - * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging - * to the class can be used to satisfy that request. Allocations are done - * by traversing the metaslab groups that are linked off of the mc_rotor field. - * This rotor points to the next metaslab group where allocations will be - * attempted. Allocating a block is a 3 step process -- select the metaslab - * group, select the metaslab, and then allocate the block. The metaslab - * class defines the low-level block allocator that will be used as the - * final step in allocation. These allocators are pluggable allowing each class - * to use a block allocator that best suits that class. - */ -struct metaslab_class { - kmutex_t mc_lock; - spa_t *mc_spa; - metaslab_group_t *mc_rotor; - metaslab_ops_t *mc_ops; - uint64_t mc_aliquot; - - /* - * Track the number of metaslab groups that have been initialized - * and can accept allocations. An initialized metaslab group is - * one has been completely added to the config (i.e. we have - * updated the MOS config and the space has been added to the pool). - */ - uint64_t mc_groups; - - /* - * Toggle to enable/disable the allocation throttle. - */ - boolean_t mc_alloc_throttle_enabled; - - /* - * The allocation throttle works on a reservation system. 
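The segment-based weight described above packs the activation bits, the space-based flag, the histogram index, and the segment count into one 64-bit word (bits 61-63, 60, 54-59, and 0-53, per the BF64_GET/BF64_SET positions). A self-contained sketch of that packing using plain shifts and masks in place of the BF64 helpers, which are defined elsewhere:

#include <stdint.h>
#include <stdio.h>

/* Plain-C equivalents of BF64_GET/BF64_SET for the fields used above. */
static uint64_t
bf64_get(uint64_t v, int low, int len)
{
    return ((v >> low) & ((1ULL << len) - 1));
}

static uint64_t
bf64_set(uint64_t v, int low, int len, uint64_t x)
{
    uint64_t mask = ((1ULL << len) - 1) << low;

    return ((v & ~mask) | ((x << low) & mask));
}

#define WEIGHT_GET_ACTIVE(w)     bf64_get((w), 61, 3)
#define WEIGHT_IS_SPACEBASED(w)  ((w) == 0 || bf64_get((w), 60, 1))
#define WEIGHT_GET_INDEX(w)      bf64_get((w), 54, 6)
#define WEIGHT_SET_INDEX(w, x)   bf64_set((w), 54, 6, (x))
#define WEIGHT_GET_COUNT(w)      bf64_get((w), 0, 54)
#define WEIGHT_SET_COUNT(w, x)   bf64_set((w), 0, 54, (x))

int
main(void)
{
    uint64_t w = 0;

    /* Largest free segments fall in the [2^17, 2^18) bucket; 1000 of them. */
    w = WEIGHT_SET_INDEX(w, 17);
    w = WEIGHT_SET_COUNT(w, 1000);

    printf("index=%ju count=%ju spacebased=%d active=%ju\n",
        (uintmax_t)WEIGHT_GET_INDEX(w), (uintmax_t)WEIGHT_GET_COUNT(w),
        (int)WEIGHT_IS_SPACEBASED(w), (uintmax_t)WEIGHT_GET_ACTIVE(w));
    return (0);
}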
Whenever - * an asynchronous zio wants to perform an allocation it must - * first reserve the number of blocks that it wants to allocate. - * If there aren't sufficient slots available for the pending zio - * then that I/O is throttled until more slots free up. The current - * number of reserved allocations is maintained by the mc_alloc_slots - * refcount. The mc_alloc_max_slots value determines the maximum - * number of allocations that the system allows. Gang blocks are - * allowed to reserve slots even if we've reached the maximum - * number of allocations allowed. - */ - uint64_t *mc_alloc_max_slots; - zfs_refcount_t *mc_alloc_slots; - - uint64_t mc_alloc_groups; /* # of allocatable groups */ - - uint64_t mc_alloc; /* total allocated space */ - uint64_t mc_deferred; /* total deferred frees */ - uint64_t mc_space; /* total space (alloc + free) */ - uint64_t mc_dspace; /* total deflated space */ - uint64_t mc_minblocksize; - uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; -}; - -/* - * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) - * of a top-level vdev. They are linked togther to form a circular linked - * list and can belong to only one metaslab class. Metaslab groups may become - * ineligible for allocations for a number of reasons such as limited free - * space, fragmentation, or going offline. When this happens the allocator will - * simply find the next metaslab group in the linked list and attempt - * to allocate from that group instead. - */ -struct metaslab_group { - kmutex_t mg_lock; - metaslab_t **mg_primaries; - metaslab_t **mg_secondaries; - avl_tree_t mg_metaslab_tree; - uint64_t mg_aliquot; - boolean_t mg_allocatable; /* can we allocate? */ - uint64_t mg_ms_ready; - - /* - * A metaslab group is considered to be initialized only after - * we have updated the MOS config and added the space to the pool. - * We only allow allocation attempts to a metaslab group if it - * has been initialized. - */ - boolean_t mg_initialized; - - uint64_t mg_free_capacity; /* percentage free */ - int64_t mg_bias; - int64_t mg_activation_count; - metaslab_class_t *mg_class; - vdev_t *mg_vd; - taskq_t *mg_taskq; - metaslab_group_t *mg_prev; - metaslab_group_t *mg_next; - - /* - * In order for the allocation throttle to function properly, we cannot - * have too many IOs going to each disk by default; the throttle - * operates by allocating more work to disks that finish quickly, so - * allocating larger chunks to each disk reduces its effectiveness. - * However, if the number of IOs going to each allocator is too small, - * we will not perform proper aggregation at the vdev_queue layer, - * also resulting in decreased performance. Therefore, we will use a - * ramp-up strategy. - * - * Each allocator in each metaslab group has a current queue depth - * (mg_alloc_queue_depth[allocator]) and a current max queue depth - * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group - * has an absolute max queue depth (mg_max_alloc_queue_depth). We - * add IOs to an allocator until the mg_alloc_queue_depth for that - * allocator hits the cur_max. Every time an IO completes for a given - * allocator on a given metaslab group, we increment its cur_max until - * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to - * help protect against disks that decrease in performance over time. - * - * It's possible for an allocator to handle more allocations than - * its max. 
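The reservation scheme described for mc_alloc_slots/mc_alloc_max_slots can be pictured as a simple slot counter: a caller reserves slots up front, is refused (throttled) once the maximum is reached, and releases the slots when its I/O completes, while a must-succeed path (the gang-block case) may exceed the limit. The following is a userspace illustration of that behavior, not the kernel's refcount-based implementation:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct alloc_throttle {
    uint64_t slots;        /* currently reserved allocations */
    uint64_t max_slots;    /* maximum allowed before throttling */
};

static bool
throttle_reserve(struct alloc_throttle *t, uint64_t n, bool must_succeed)
{
    if (!must_succeed && t->slots + n > t->max_slots)
        return (false);    /* caller must wait for slots to free up */
    t->slots += n;
    return (true);
}

static void
throttle_unreserve(struct alloc_throttle *t, uint64_t n)
{
    t->slots -= n;
}

int
main(void)
{
    struct alloc_throttle t = { .slots = 0, .max_slots = 8 };

    printf("reserve 8: %d\n", throttle_reserve(&t, 8, false));  /* succeeds */
    printf("reserve 1: %d\n", throttle_reserve(&t, 1, false));  /* throttled */
    printf("gang ok  : %d\n", throttle_reserve(&t, 1, true));   /* allowed anyway */
    throttle_unreserve(&t, 9);
    return (0);
}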
This can occur when gang blocks are required or when other - * groups are unable to handle their share of allocations. - */ - uint64_t mg_max_alloc_queue_depth; - uint64_t *mg_cur_max_alloc_queue_depth; - zfs_refcount_t *mg_alloc_queue_depth; - int mg_allocators; - /* - * A metalab group that can no longer allocate the minimum block - * size will set mg_no_free_space. Once a metaslab group is out - * of space then its share of work must be distributed to other - * groups. - */ - boolean_t mg_no_free_space; - - uint64_t mg_allocations; - uint64_t mg_failed_allocations; - uint64_t mg_fragmentation; - uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; - - int mg_ms_initializing; - boolean_t mg_initialize_updating; - kmutex_t mg_ms_initialize_lock; - kcondvar_t mg_ms_initialize_cv; -}; - -/* - * This value defines the number of elements in the ms_lbas array. The value - * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. - * This is the equivalent of highbit(UINT64_MAX). - */ -#define MAX_LBAS 64 - -/* - * Each metaslab maintains a set of in-core trees to track metaslab - * operations. The in-core free tree (ms_allocatable) contains the list of - * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segment are removed from the ms_allocatable and - * added to a per txg allocation tree (ms_allocating). As blocks are - * freed, they are added to the free tree (ms_freeing). These trees - * allow us to process all allocations and frees in syncing context - * where it is safe to update the on-disk space maps. An additional set - * of in-core trees is maintained to track deferred frees - * (ms_defer). Once a block is freed it will move from the - * ms_freed to the ms_defer tree. A deferred free means that a block - * has been freed but cannot be used by the pool until TXG_DEFER_SIZE - * transactions groups later. For example, a block that is freed in txg - * 50 will not be available for reallocation until txg 52 (50 + - * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. - * A pool could be safely rolled back TXG_DEFERS_SIZE transactions - * groups and ensure that no block has been reallocated. - * - * The simplified transition diagram looks like this: - * - * - * ALLOCATE - * | - * V - * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) - * ^ - * | ms_freeing <--- FREE - * | | - * | v - * | ms_freed - * | | - * +-------- ms_defer[2] <-------+-------> (write to space map) - * - * - * Each metaslab's space is tracked in a single space map in the MOS, - * which is only updated in syncing context. Each time we sync a txg, - * we append the allocs and frees from that txg to the space map. The - * pool space is only updated once all metaslabs have finished syncing. - * - * To load the in-core free tree we read the space map from disk. This - * object contains a series of alloc and free records that are combined - * to make up the list of all free segments in this metaslab. These - * segments are represented in-core by the ms_allocatable and are stored - * in an AVL tree. - * - * As the space map grows (as a result of the appends) it will - * eventually become space-inefficient. When the metaslab's in-core - * free tree is zfs_condense_pct/100 times the size of the minimal - * on-disk representation, we rewrite it in its minimized form. If a - * metaslab needs to condense then we must set the ms_condensing flag to - * ensure that allocations are not performed on the metaslab that is - * being written. 
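Deferred frees as laid out in the transition diagram can be pictured as a small ring of per-txg accumulators: space freed in txg N parks in a defer slot and only becomes allocatable again TXG_DEFER_SIZE txgs later. A toy sketch of that rotation, assuming TXG_DEFER_SIZE is 2 as in the txg 50 -> txg 52 example above:

#include <stdint.h>
#include <stdio.h>

#define TXG_DEFER_SIZE 2    /* freed blocks stay deferred for two txgs */

/* Bytes deferred per slot; slot = txg % TXG_DEFER_SIZE. */
static uint64_t defer_bytes[TXG_DEFER_SIZE];
static uint64_t allocatable_bytes;

/*
 * Called once per synced txg: the slot filled TXG_DEFER_SIZE txgs ago is
 * released into the allocatable pool, then reused for this txg's frees.
 */
static void
sync_done(uint64_t txg, uint64_t freed_this_txg)
{
    uint64_t *slot = &defer_bytes[txg % TXG_DEFER_SIZE];

    allocatable_bytes += *slot;
    *slot = freed_this_txg;
}

int
main(void)
{
    sync_done(50, 4096);    /* freed in txg 50: deferred */
    sync_done(51, 0);
    sync_done(52, 0);       /* txg 50's frees become allocatable (50 + TXG_DEFER_SIZE) */
    printf("allocatable: %ju bytes\n", (uintmax_t)allocatable_bytes);
    return (0);
}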
- */ -struct metaslab { - /* - * This is the main lock of the metaslab and its purpose is to - * coordinate our allocations and frees [e.g metaslab_block_alloc(), - * metaslab_free_concrete(), ..etc] with our various syncing - * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. - * - * The lock is also used during some miscellaneous operations like - * using the metaslab's histogram for the metaslab group's histogram - * aggregation, or marking the metaslab for initialization. - */ - kmutex_t ms_lock; - - /* - * Acquired together with the ms_lock whenever we expect to - * write to metaslab data on-disk (i.e flushing entries to - * the metaslab's space map). It helps coordinate readers of - * the metaslab's space map [see spa_vdev_remove_thread()] - * with writers [see metaslab_sync()]. - * - * Note that metaslab_load(), even though a reader, uses - * a completely different mechanism to deal with the reading - * of the metaslab's space map based on ms_synced_length. That - * said, the function still uses the ms_sync_lock after it - * has read the ms_sm [see relevant comment in metaslab_load() - * as to why]. - */ - kmutex_t ms_sync_lock; - - kcondvar_t ms_load_cv; - space_map_t *ms_sm; - uint64_t ms_id; - uint64_t ms_start; - uint64_t ms_size; - uint64_t ms_fragmentation; - - range_tree_t *ms_allocating[TXG_SIZE]; - range_tree_t *ms_allocatable; - uint64_t ms_allocated_this_txg; - - /* - * The following range trees are accessed only from syncing context. - * ms_free*tree only have entries while syncing, and are empty - * between syncs. - */ - range_tree_t *ms_freeing; /* to free this syncing txg */ - range_tree_t *ms_freed; /* already freed this syncing txg */ - range_tree_t *ms_defer[TXG_DEFER_SIZE]; - range_tree_t *ms_checkpointing; /* to add to the checkpoint */ - - boolean_t ms_condensing; /* condensing? */ - boolean_t ms_condense_wanted; - uint64_t ms_condense_checked_txg; - - uint64_t ms_initializing; /* leaves initializing this ms */ - - /* - * We must always hold the ms_lock when modifying ms_loaded - * and ms_loading. - */ - boolean_t ms_loaded; - boolean_t ms_loading; - - /* - * The following histograms count entries that are in the - * metaslab's space map (and its histogram) but are not in - * ms_allocatable yet, because they are in ms_freed, ms_freeing, - * or ms_defer[]. - * - * When the metaslab is not loaded, its ms_weight needs to - * reflect what is allocatable (i.e. what will be part of - * ms_allocatable if it is loaded). The weight is computed from - * the spacemap histogram, but that includes ranges that are - * not yet allocatable (because they are in ms_freed, - * ms_freeing, or ms_defer[]). Therefore, when calculating the - * weight, we need to remove those ranges. - * - * The ranges in the ms_freed and ms_defer[] range trees are all - * present in the spacemap. However, the spacemap may have - * multiple entries to represent a contiguous range, because it - * is written across multiple sync passes, but the changes of - * all sync passes are consolidated into the range trees. - * Adjacent ranges that are freed in different sync passes of - * one txg will be represented separately (as 2 or more entries) - * in the space map (and its histogram), but these adjacent - * ranges will be consolidated (represented as one entry) in the - * ms_freed/ms_defer[] range trees (and their histograms). 
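As the comment explains, an unloaded metaslab's weight has to be derived from the space map histogram after removing the buckets contributed by ranges that are not yet allocatable (ms_freeing, ms_freed, ms_defer[]). A simplified sketch of that adjustment: subtract the unusable contributions bucket by bucket and take the highest non-empty bucket as the segment-weight index. The histogram size and helper names here are illustrative, not the header's:

#include <stdint.h>
#include <stdio.h>

#define HIST_SIZE 32    /* illustrative stand-in for the space map histogram size */

/*
 * Return the index of the highest histogram bucket that still holds
 * allocatable segments once the sync/defer contributions are removed.
 */
static int
allocatable_index(const uint64_t sm_hist[HIST_SIZE],
    const uint64_t unusable_hist[HIST_SIZE])
{
    int idx = -1;

    for (int i = 0; i < HIST_SIZE; i++) {
        uint64_t avail = sm_hist[i] - unusable_hist[i];

        if (avail > 0)
            idx = i;
    }
    return (idx);
}

int
main(void)
{
    uint64_t sm_hist[HIST_SIZE] = { 0 };
    uint64_t unusable[HIST_SIZE] = { 0 };

    sm_hist[12] = 10;    /* ten segments in the [2^12, 2^13) bucket */
    sm_hist[17] = 2;     /* two large segments ... */
    unusable[17] = 2;    /* ... but both are still deferred frees */

    printf("highest allocatable bucket: %d\n",
        allocatable_index(sm_hist, unusable));    /* prints 12 */
    return (0);
}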
- * - * When calculating the weight, we can not simply subtract the - * range trees' histograms from the spacemap's histogram, - * because the range trees' histograms may have entries in - * higher buckets than the spacemap, due to consolidation. - * Instead we must subtract the exact entries that were added to - * the spacemap's histogram. ms_synchist and ms_deferhist[] - * represent these exact entries, so we can subtract them from - * the spacemap's histogram when calculating ms_weight. - * - * ms_synchist represents the same ranges as ms_freeing + - * ms_freed, but without consolidation across sync passes. - * - * ms_deferhist[i] represents the same ranges as ms_defer[i], - * but without consolidation across sync passes. - */ - uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; - uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; - - /* - * Tracks the exact amount of allocated space of this metaslab - * (and specifically the metaslab's space map) up to the most - * recently completed sync pass [see usage in metaslab_sync()]. - */ - uint64_t ms_allocated_space; - int64_t ms_deferspace; /* sum of ms_defermap[] space */ - uint64_t ms_weight; /* weight vs. others in group */ - uint64_t ms_activation_weight; /* activation weight */ - - /* - * Track of whenever a metaslab is selected for loading or allocation. - * We use this value to determine how long the metaslab should - * stay cached. - */ - uint64_t ms_selected_txg; - - uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ - uint64_t ms_max_size; /* maximum allocatable size */ - - /* - * -1 if it's not active in an allocator, otherwise set to the allocator - * this metaslab is active for. - */ - int ms_allocator; - boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ - - /* - * The metaslab block allocators can optionally use a size-ordered - * range tree and/or an array of LBAs. Not all allocators use - * this functionality. The ms_allocatable_by_size should always - * contain the same number of segments as the ms_allocatable. The - * only difference is that the ms_allocatable_by_size is ordered by - * segment sizes. - */ - avl_tree_t ms_allocatable_by_size; - uint64_t ms_lbas[MAX_LBAS]; - - metaslab_group_t *ms_group; /* metaslab group */ - avl_node_t ms_group_node; /* node in metaslab group tree */ - txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ - - /* updated every time we are done syncing the metaslab's space map */ - uint64_t ms_synced_length; - - boolean_t ms_new; -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_METASLAB_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/mmp.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (C) 2017 by Lawrence Livermore National Security, LLC. 
- */ - -#ifndef _SYS_MMP_H -#define _SYS_MMP_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define MMP_MIN_INTERVAL 100 /* ms */ -#define MMP_DEFAULT_INTERVAL 1000 /* ms */ -#define MMP_DEFAULT_IMPORT_INTERVALS 20 -#define MMP_DEFAULT_FAIL_INTERVALS 10 -#define MMP_MIN_FAIL_INTERVALS 2 /* min if != 0 */ -#define MMP_IMPORT_SAFETY_FACTOR 200 /* pct */ -#define MMP_INTERVAL_OK(interval) MAX(interval, MMP_MIN_INTERVAL) -#define MMP_FAIL_INTVS_OK(fails) (fails == 0 ? 0 : MAX(fails, \ - MMP_MIN_FAIL_INTERVALS)) - -typedef struct mmp_thread { - kmutex_t mmp_thread_lock; /* protect thread mgmt fields */ - kcondvar_t mmp_thread_cv; - kthread_t *mmp_thread; - uint8_t mmp_thread_exiting; - kmutex_t mmp_io_lock; /* protect below */ - hrtime_t mmp_last_write; /* last successful MMP write */ - uint64_t mmp_delay; /* decaying avg ns between MMP writes */ - uberblock_t mmp_ub; /* last ub written by sync */ - zio_t *mmp_zio_root; /* root of mmp write zios */ - uint64_t mmp_kstat_id; /* unique id for next MMP write kstat */ - int mmp_skip_error; /* reason for last skipped write */ - vdev_t *mmp_last_leaf; /* last mmp write sent here */ - uint64_t mmp_leaf_last_gen; /* last mmp write sent here */ - uint32_t mmp_seq; /* intra-second update counter */ -} mmp_thread_t; - - -extern void mmp_init(struct spa *spa); -extern void mmp_fini(struct spa *spa); -extern void mmp_thread_start(struct spa *spa); -extern void mmp_thread_stop(struct spa *spa); -extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub); -extern void mmp_signal_all_threads(void); - -/* Global tuning */ -extern ulong_t zfs_multihost_interval; -extern uint_t zfs_multihost_fail_intervals; -extern uint_t zfs_multihost_import_intervals; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_MMP_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_MULTILIST_H -#define _SYS_MULTILIST_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef list_node_t multilist_node_t; -typedef struct multilist multilist_t; -typedef struct multilist_sublist multilist_sublist_t; -typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *); - -struct multilist_sublist { - /* - * The mutex used internally to implement thread safe insertions - * and removals to this individual sublist. It can also be locked - * by a consumer using multilist_sublist_{lock,unlock}, which is - * useful if a consumer needs to traverse the list in a thread - * safe manner. - */ - kmutex_t mls_lock; - /* - * The actual list object containing all objects in this sublist. - */ - list_t mls_list; - /* - * Pad to cache line, in an effort to try and prevent cache line - * contention. 
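MMP_INTERVAL_OK() and MMP_FAIL_INTVS_OK() above only clamp operator-supplied tunables: the write interval is never taken below 100 ms, and a non-zero fail-interval count is never taken below 2, while zero passes through unchanged. A standalone rendering of the same clamping, with MAX() defined locally since it comes from other kernel headers:

#include <stdio.h>

#define MAX(a, b)               ((a) > (b) ? (a) : (b))

#define MMP_MIN_INTERVAL        100    /* ms */
#define MMP_MIN_FAIL_INTERVALS  2      /* min if != 0 */

#define MMP_INTERVAL_OK(interval)   MAX(interval, MMP_MIN_INTERVAL)
#define MMP_FAIL_INTVS_OK(fails)    \
    ((fails) == 0 ? 0 : MAX(fails, MMP_MIN_FAIL_INTERVALS))

int
main(void)
{
    printf("interval 10  -> %d ms\n", MMP_INTERVAL_OK(10));   /* clamped to 100 */
    printf("interval 500 -> %d ms\n", MMP_INTERVAL_OK(500));  /* unchanged */
    printf("fails 0 -> %d\n", MMP_FAIL_INTVS_OK(0));          /* zero is passed through */
    printf("fails 1 -> %d\n", MMP_FAIL_INTVS_OK(1));          /* clamped to 2 */
    return (0);
}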
- */ -} __aligned(CACHE_LINE_SIZE); - -struct multilist { - /* - * This is used to get to the multilist_node_t structure given - * the void *object contained on the list. - */ - size_t ml_offset; - /* - * The number of sublists used internally by this multilist. - */ - uint64_t ml_num_sublists; - /* - * The array of pointers to the actual sublists. - */ - multilist_sublist_t *ml_sublists; - /* - * Pointer to function which determines the sublist to use - * when inserting and removing objects from this multilist. - * Please see the comment above multilist_create for details. - */ - multilist_sublist_index_func_t *ml_index_func; -}; - -void multilist_destroy(multilist_t *); -multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *); - -void multilist_insert(multilist_t *, void *); -void multilist_remove(multilist_t *, void *); -int multilist_is_empty(multilist_t *); - -unsigned int multilist_get_num_sublists(multilist_t *); -unsigned int multilist_get_random_index(multilist_t *); - -multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); -multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); -void multilist_sublist_unlock(multilist_sublist_t *); - -void multilist_sublist_insert_head(multilist_sublist_t *, void *); -void multilist_sublist_insert_tail(multilist_sublist_t *, void *); -void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); -void multilist_sublist_remove(multilist_sublist_t *, void *); -int multilist_sublist_is_empty(multilist_sublist_t *); -int multilist_sublist_is_empty_idx(multilist_t *, unsigned int); - -void *multilist_sublist_head(multilist_sublist_t *); -void *multilist_sublist_tail(multilist_sublist_t *); -void *multilist_sublist_next(multilist_sublist_t *, void *); -void *multilist_sublist_prev(multilist_sublist_t *, void *); - -void multilist_link_init(multilist_node_t *); -int multilist_link_active(multilist_node_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_MULTILIST_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 
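A multilist spreads its entries across ml_num_sublists independently locked sublists, and ml_index_func picks the sublist a given object belongs to. A minimal sketch of such an index function, hashing the object pointer and reducing it modulo the sublist count; the hash itself is purely illustrative:

#include <stdint.h>
#include <stdio.h>

struct multilist {
    unsigned int ml_num_sublists;    /* number of independently locked sublists */
};

/* Same shape as multilist_sublist_index_func_t: pick a sublist for an object. */
static unsigned int
ptr_index_func(struct multilist *ml, void *obj)
{
    uintptr_t p = (uintptr_t)obj;

    /* Cheap pointer hash; any stable hash works as long as it spreads objects. */
    p ^= p >> 16;
    p *= 0x9e3779b97f4a7c15ULL;
    return ((unsigned int)(p % ml->ml_num_sublists));
}

int
main(void)
{
    struct multilist ml = { .ml_num_sublists = 8 };
    int objects[4];

    for (int i = 0; i < 4; i++)
        printf("object %d -> sublist %u\n", i,
            ptr_index_func(&ml, &objects[i]));
    return (0);
}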
- */ - -#ifndef _SYS_RANGE_TREE_H -#define _SYS_RANGE_TREE_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define RANGE_TREE_HISTOGRAM_SIZE 64 - -typedef struct range_tree_ops range_tree_ops_t; - -/* - * Note: the range_tree may not be accessed concurrently; consumers - * must provide external locking if required. - */ -typedef struct range_tree { - avl_tree_t rt_root; /* offset-ordered segment AVL tree */ - uint64_t rt_space; /* sum of all segments in the map */ - range_tree_ops_t *rt_ops; - void *rt_arg; - - /* rt_avl_compare should only be set it rt_arg is an AVL tree */ - uint64_t rt_gap; /* allowable inter-segment gap */ - int (*rt_avl_compare)(const void *, const void *); - /* - * The rt_histogram maintains a histogram of ranges. Each bucket, - * rt_histogram[i], contains the number of ranges whose size is: - * 2^i <= size of range in bytes < 2^(i+1) - */ - uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; -} range_tree_t; - -typedef struct range_seg { - avl_node_t rs_node; /* AVL node */ - avl_node_t rs_pp_node; /* AVL picker-private node */ - uint64_t rs_start; /* starting offset of this segment */ - uint64_t rs_end; /* ending offset (non-inclusive) */ - uint64_t rs_fill; /* actual fill if gap mode is on */ -} range_seg_t; - -struct range_tree_ops { - void (*rtop_create)(range_tree_t *rt, void *arg); - void (*rtop_destroy)(range_tree_t *rt, void *arg); - void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg); - void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg); - void (*rtop_vacate)(range_tree_t *rt, void *arg); -}; - -typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); - -void range_tree_init(void); -void range_tree_fini(void); -range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare)(const void*, const void*), uint64_t gap); -range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); -void range_tree_destroy(range_tree_t *rt); -boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); -void range_tree_verify_not_present(range_tree_t *rt, - uint64_t start, uint64_t size); -range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); -void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, - uint64_t newstart, uint64_t newsize); -uint64_t range_tree_space(range_tree_t *rt); -boolean_t range_tree_is_empty(range_tree_t *rt); -void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); -void range_tree_stat_verify(range_tree_t *rt); -uint64_t range_tree_min(range_tree_t *rt); -uint64_t range_tree_max(range_tree_t *rt); -uint64_t range_tree_span(range_tree_t *rt); - -void range_tree_add(void *arg, uint64_t start, uint64_t size); -void range_tree_remove(void *arg, uint64_t start, uint64_t size); -void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size); -void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta); -void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size); - -void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); -void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); -range_seg_t *range_tree_first(range_tree_t *rt); - -void rt_avl_create(range_tree_t *rt, void *arg); -void rt_avl_destroy(range_tree_t *rt, void *arg); -void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_vacate(range_tree_t *rt, void *arg); -extern struct 
range_tree_ops rt_avl_ops; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_RANGE_TREE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_REFCOUNT_H -#define _SYS_REFCOUNT_H - -#include -#include -/* For FreeBSD refcount(9). */ -#include_next -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * If the reference is held only by the calling function and not any - * particular object, use FTAG (which is a string) for the holder_tag. - * Otherwise, use the object that holds the reference. - */ -#define FTAG ((char *)(uintptr_t)__func__) - -#ifdef ZFS_DEBUG -typedef struct reference { - list_node_t ref_link; - void *ref_holder; - uint64_t ref_number; - uint8_t *ref_removed; -} reference_t; - -typedef struct refcount { - kmutex_t rc_mtx; - boolean_t rc_tracked; - list_t rc_list; - list_t rc_removed; - uint64_t rc_count; - uint64_t rc_removed_count; -} zfs_refcount_t; - -/* - * Note: zfs_refcount_t must be initialized with - * refcount_create[_untracked]() - */ - -void zfs_refcount_create(zfs_refcount_t *); -void zfs_refcount_create_untracked(zfs_refcount_t *); -void zfs_refcount_create_tracked(zfs_refcount_t *); -void zfs_refcount_destroy(zfs_refcount_t *); -void zfs_refcount_destroy_many(zfs_refcount_t *, uint64_t); -int zfs_refcount_is_zero(zfs_refcount_t *); -int64_t zfs_refcount_count(zfs_refcount_t *); -int64_t zfs_refcount_add(zfs_refcount_t *, void *); -int64_t zfs_refcount_remove(zfs_refcount_t *, void *); -int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, void *); -int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, void *); -void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); -void zfs_refcount_transfer_ownership(zfs_refcount_t *, void *, void *); -boolean_t zfs_refcount_held(zfs_refcount_t *, void *); -boolean_t zfs_refcount_not_held(zfs_refcount_t *, void *); - -void zfs_refcount_init(void); -void zfs_refcount_fini(void); - -#else /* ZFS_DEBUG */ - -typedef struct refcount { - uint64_t rc_count; -} zfs_refcount_t; - -#define zfs_refcount_create(rc) ((rc)->rc_count = 0) -#define zfs_refcount_create_untracked(rc) ((rc)->rc_count = 0) -#define zfs_refcount_create_tracked(rc) ((rc)->rc_count = 0) -#define zfs_refcount_destroy(rc) ((rc)->rc_count = 0) -#define zfs_refcount_destroy_many(rc, number) ((rc)->rc_count = 0) -#define zfs_refcount_is_zero(rc) 
((rc)->rc_count == 0) -#define zfs_refcount_count(rc) ((rc)->rc_count) -#define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) -#define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) -#define zfs_refcount_add_many(rc, number, holder) \ - atomic_add_64_nv(&(rc)->rc_count, number) -#define zfs_refcount_remove_many(rc, number, holder) \ - atomic_add_64_nv(&(rc)->rc_count, -number) -#define zfs_refcount_transfer(dst, src) { \ - uint64_t __tmp = (src)->rc_count; \ - atomic_add_64(&(src)->rc_count, -__tmp); \ - atomic_add_64(&(dst)->rc_count, __tmp); \ -} -#define zfs_refcount_transfer_ownership(rc, current_holder, new_holder) (void)0 -#define zfs_refcount_held(rc, holder) ((rc)->rc_count > 0) -#define zfs_refcount_not_held(rc, holder) (B_TRUE) - -#define zfs_refcount_init() -#define zfs_refcount_fini() - -#endif /* ZFS_DEBUG */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_REFCOUNT_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#ifndef _SYS_RR_RW_LOCK_H -#define _SYS_RR_RW_LOCK_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -/* - * A reader-writer lock implementation that allows re-entrant reads, but - * still gives writers priority on "new" reads. - * - * See rrwlock.c for more details about the implementation. - * - * Fields of the rrwlock_t structure: - * - rr_lock: protects modification and reading of rrwlock_t fields - * - rr_cv: cv for waking up readers or waiting writers - * - rr_writer: thread id of the current writer - * - rr_anon_rount: number of active anonymous readers - * - rr_linked_rcount: total number of non-anonymous active readers - * - rr_writer_wanted: a writer wants the lock - */ -typedef struct rrwlock { - kmutex_t rr_lock; - kcondvar_t rr_cv; - kthread_t *rr_writer; - zfs_refcount_t rr_anon_rcount; - zfs_refcount_t rr_linked_rcount; - boolean_t rr_writer_wanted; - boolean_t rr_track_all; -} rrwlock_t; - -/* - * 'tag' is used in reference counting tracking. The - * 'tag' must be the same in a rrw_enter() as in its - * corresponding rrw_exit(). 
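For illustration, a minimal sketch of the tag convention described above, assuming an already-initialized rrwlock_t. The helper name is hypothetical; FTAG comes from the refcount.h header shown earlier, names the calling function as the holder, and the same tag must be passed to the matching rrw_exit():

/*
 * Sketch: re-entrant read lock held across a short critical section.
 * The tag given to rrw_exit() must match the one given to
 * rrw_enter_read().
 */
static void
read_locked_section(rrwlock_t *rrl)
{
	rrw_enter_read(rrl, FTAG);
	/* ... read-side work protected by rrl ... */
	rrw_exit(rrl, FTAG);
}
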
- */ -void rrw_init(rrwlock_t *rrl, boolean_t track_all); -void rrw_destroy(rrwlock_t *rrl); -void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); -void rrw_enter_read(rrwlock_t *rrl, void *tag); -void rrw_enter_read_prio(rrwlock_t *rrl, void *tag); -void rrw_enter_write(rrwlock_t *rrl); -void rrw_exit(rrwlock_t *rrl, void *tag); -boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); -void rrw_tsd_destroy(void *arg); - -#define RRW_READ_HELD(x) rrw_held(x, RW_READER) -#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) -#define RRW_LOCK_HELD(x) \ - (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER)) - -/* - * A reader-mostly lock implementation, tuning above reader-writer locks - * for hightly parallel read acquisitions, pessimizing write acquisitions. - * - * This should be a prime number. See comment in rrwlock.c near - * RRM_TD_LOCK() for details. - */ -#define RRM_NUM_LOCKS 17 -typedef struct rrmlock { - rrwlock_t locks[RRM_NUM_LOCKS]; -} rrmlock_t; - -void rrm_init(rrmlock_t *rrl, boolean_t track_all); -void rrm_destroy(rrmlock_t *rrl); -void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag); -void rrm_enter_read(rrmlock_t *rrl, void *tag); -void rrm_enter_write(rrmlock_t *rrl); -void rrm_exit(rrmlock_t *rrl, void *tag); -boolean_t rrm_held(rrmlock_t *rrl, krw_t rw); - -#define RRM_READ_HELD(x) rrm_held(x, RW_READER) -#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER) -#define RRM_LOCK_HELD(x) \ - (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER)) - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_RR_RW_LOCK_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _SYS_SA_H -#define _SYS_SA_H - -#include -#include - -/* - * Currently available byteswap functions. - * If it all possible new attributes should used - * one of the already defined byteswap functions. - * If a new byteswap function is added then the - * ZPL/Pool version will need to be bumped. - */ - -typedef enum sa_bswap_type { - SA_UINT64_ARRAY, - SA_UINT32_ARRAY, - SA_UINT16_ARRAY, - SA_UINT8_ARRAY, - SA_ACL, -} sa_bswap_type_t; - -typedef uint16_t sa_attr_type_t; - -/* - * Attribute to register support for. 
- */ -typedef struct sa_attr_reg { - char *sa_name; /* attribute name */ - uint16_t sa_length; - sa_bswap_type_t sa_byteswap; /* bswap functon enum */ - sa_attr_type_t sa_attr; /* filled in during registration */ -} sa_attr_reg_t; - - -typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t, - boolean_t, void *userptr); - -/* - * array of attributes to store. - * - * This array should be treated as opaque/private data. - * The SA_BULK_ADD_ATTR() macro should be used for manipulating - * the array. - * - * When sa_replace_all_by_template() is used the attributes - * will be stored in the order defined in the array, except that - * the attributes may be split between the bonus and the spill buffer - * - */ -typedef struct sa_bulk_attr { - void *sa_data; - sa_data_locator_t *sa_data_func; - uint16_t sa_length; - sa_attr_type_t sa_attr; - /* the following are private to the sa framework */ - void *sa_addr; - uint16_t sa_buftype; - uint16_t sa_size; -} sa_bulk_attr_t; - - -/* - * special macro for adding entries for bulk attr support - * bulk - sa_bulk_attr_t - * count - integer that will be incremented during each add - * attr - attribute to manipulate - * func - function for accessing data. - * data - pointer to data. - * len - length of data - */ - -#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \ -{ \ - b[idx].sa_attr = attr;\ - b[idx].sa_data_func = func; \ - b[idx].sa_data = data; \ - b[idx++].sa_length = len; \ -} - -typedef struct sa_os sa_os_t; - -typedef enum sa_handle_type { - SA_HDL_SHARED, - SA_HDL_PRIVATE -} sa_handle_type_t; - -struct sa_handle; -typedef void *sa_lookup_tab_t; -typedef struct sa_handle sa_handle_t; - -typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx); - -int sa_handle_get(objset_t *, uint64_t, void *userp, - sa_handle_type_t, sa_handle_t **); -int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp, - sa_handle_type_t, sa_handle_t **); -void sa_handle_destroy(sa_handle_t *); -int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **); -void sa_buf_rele(dmu_buf_t *, void *); -int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen); -int sa_update(sa_handle_t *, sa_attr_type_t, void *buf, - uint32_t buflen, dmu_tx_t *); -int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *); -int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count); -int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); -int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); -int sa_size(sa_handle_t *, sa_attr_type_t, int *); -int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, - uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); -void sa_object_info(sa_handle_t *, dmu_object_info_t *); -void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); -void *sa_get_userdata(sa_handle_t *); -void sa_set_userp(sa_handle_t *, void *); -dmu_buf_t *sa_get_db(sa_handle_t *); -uint64_t sa_handle_object(sa_handle_t *); -boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); -void sa_register_update_callback(objset_t *, sa_update_cb_t *); -int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **); -void sa_tear_down(objset_t *); -int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, - int, dmu_tx_t *); -int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *, - int, dmu_tx_t *); -boolean_t sa_enabled(objset_t *); -void sa_cache_init(void); -void sa_cache_fini(void); -int sa_set_sa_object(objset_t *, uint64_t); -int sa_hdrsize(void *); 
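Note that the comment above spells the macro SA_BULK_ADD_ATTR() while it is actually declared as SA_ADD_BULK_ATTR(). As an illustration of the bulk interface, here is a minimal sketch that fills a two-entry sa_bulk_attr_t array and resolves both attributes with a single sa_bulk_lookup() call; the helper name is hypothetical and the attribute ids are placeholders that real callers obtain from the table returned by sa_setup():

/*
 * Sketch: fetch two attributes with one pass over the bonus/spill
 * buffer instead of two separate sa_lookup() calls.  attr_mode and
 * attr_size are registered attribute ids (placeholders here).
 */
static int
fetch_mode_and_size(sa_handle_t *hdl, sa_attr_type_t attr_mode,
    sa_attr_type_t attr_size, uint64_t *mode, uint64_t *size)
{
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/* Each invocation fills bulk[count] and post-increments count. */
	SA_ADD_BULK_ATTR(bulk, count, attr_mode, NULL, mode, sizeof (*mode));
	SA_ADD_BULK_ATTR(bulk, count, attr_size, NULL, size, sizeof (*size));

	return (sa_bulk_lookup(hdl, bulk, count));
}
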
-void sa_handle_lock(sa_handle_t *); -void sa_handle_unlock(sa_handle_t *); - -#ifdef _KERNEL -int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); -#endif - -#ifdef __cplusplus -extern "C" { -#endif - - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SA_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h +++ /dev/null @@ -1,291 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - */ - -#ifndef _SYS_SA_IMPL_H -#define _SYS_SA_IMPL_H - -#include -#include -#include - -/* - * Array of known attributes and their - * various characteristics. - */ -typedef struct sa_attr_table { - sa_attr_type_t sa_attr; - uint8_t sa_registered; - uint16_t sa_length; - sa_bswap_type_t sa_byteswap; - char *sa_name; -} sa_attr_table_t; - -/* - * Zap attribute format for attribute registration - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | unused | len | bswap | attr num | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Zap attribute format for layout information. - * - * layout information is stored as an array of attribute numbers - * The name of the attribute is the layout number (0, 1, 2, ...) - * - * 16 0 - * +---- ---+ - * | attr # | - * +--------+ - * | attr # | - * +--- ----+ - * ...... 
- * - */ - -#define ATTR_BSWAP(x) BF32_GET(x, 16, 8) -#define ATTR_LENGTH(x) BF32_GET(x, 24, 16) -#define ATTR_NUM(x) BF32_GET(x, 0, 16) -#define ATTR_ENCODE(x, attr, length, bswap) \ -{ \ - BF64_SET(x, 24, 16, length); \ - BF64_SET(x, 16, 8, bswap); \ - BF64_SET(x, 0, 16, attr); \ -} - -#define TOC_OFF(x) BF32_GET(x, 0, 23) -#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) -#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) -#define TOC_ATTR_ENCODE(x, len_idx, offset) \ -{ \ - BF32_SET(x, 31, 1, 1); \ - BF32_SET(x, 24, 7, len_idx); \ - BF32_SET(x, 0, 24, offset); \ -} - -#define SA_LAYOUTS "LAYOUTS" -#define SA_REGISTRY "REGISTRY" - -/* - * Each unique layout will have their own table - * sa_lot (layout_table) - */ -typedef struct sa_lot { - avl_node_t lot_num_node; - avl_node_t lot_hash_node; - uint64_t lot_num; - uint64_t lot_hash; - sa_attr_type_t *lot_attrs; /* array of attr #'s */ - uint32_t lot_var_sizes; /* how many aren't fixed size */ - uint32_t lot_attr_count; /* total attr count */ - list_t lot_idx_tab; /* should be only a couple of entries */ - int lot_instance; /* used with lot_hash to identify entry */ -} sa_lot_t; - -/* index table of offsets */ -typedef struct sa_idx_tab { - list_node_t sa_next; - sa_lot_t *sa_layout; - uint16_t *sa_variable_lengths; - zfs_refcount_t sa_refcount; - uint32_t *sa_idx_tab; /* array of offsets */ -} sa_idx_tab_t; - -/* - * Since the offset/index information into the actual data - * will usually be identical we can share that information with - * all handles that have the exact same offsets. - * - * You would typically only have a large number of different table of - * contents if you had a several variable sized attributes. - * - * Two AVL trees are used to track the attribute layout numbers. - * one is keyed by number and will be consulted when a DMU_OT_SA - * object is first read. The second tree is keyed by the hash signature - * of the attributes and will be consulted when an attribute is added - * to determine if we already have an instance of that layout. Both - * of these tree's are interconnected. The only difference is that - * when an entry is found in the "hash" tree the list of attributes will - * need to be compared against the list of attributes you have in hand. - * The assumption is that typically attributes will just be updated and - * adding a completely new attribute is a very rare operation. - */ -struct sa_os { - kmutex_t sa_lock; - boolean_t sa_need_attr_registration; - boolean_t sa_force_spill; - uint64_t sa_master_obj; - uint64_t sa_reg_attr_obj; - uint64_t sa_layout_attr_obj; - int sa_num_attrs; - sa_attr_table_t *sa_attr_table; /* private attr table */ - sa_update_cb_t *sa_update_cb; - avl_tree_t sa_layout_num_tree; /* keyed by layout number */ - avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ - int sa_user_table_sz; - sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ -}; - -/* - * header for all bonus and spill buffers. - * - * The header has a fixed portion with a variable number - * of "lengths" depending on the number of variable sized - * attributes which are determined by the "layout number" - */ - -#define SA_MAGIC 0x2F505A /* ZFS SA */ -typedef struct sa_hdr_phys { - uint32_t sa_magic; - /* BEGIN CSTYLED */ - /* - * Encoded with hdrsize and layout number as follows: - * 16 10 0 - * +--------+-------+ - * | hdrsz |layout | - * +--------+-------+ - * - * Bits 0-10 are the layout number - * Bits 11-16 are the size of the header. - * The hdrsize is the number * 8 - * - * For example. 
- * hdrsz of 1 ==> 8 byte header - * 2 ==> 16 byte header - * - */ - /* END CSTYLED */ - uint16_t sa_layout_info; - uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ - /* ... Data follows the lengths. */ -} sa_hdr_phys_t; - -#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) -#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) -#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ -{ \ - BF32_SET_SB(x, 10, 6, 3, 0, size); \ - BF32_SET(x, 0, 10, num); \ -} - -typedef enum sa_buf_type { - SA_BONUS = 1, - SA_SPILL = 2 -} sa_buf_type_t; - -typedef enum sa_data_op { - SA_LOOKUP, - SA_UPDATE, - SA_ADD, - SA_REPLACE, - SA_REMOVE -} sa_data_op_t; - -/* - * Opaque handle used for most sa functions - * - * This needs to be kept as small as possible. - */ - -struct sa_handle { - dmu_buf_user_t sa_dbu; - kmutex_t sa_lock; - dmu_buf_t *sa_bonus; - dmu_buf_t *sa_spill; - objset_t *sa_os; - void *sa_userp; - sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ - sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ -}; - -#define SA_GET_DB(hdl, type) \ - (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) - -#define SA_GET_HDR(hdl, type) \ - ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ - type))->db.db_data)) - -#define SA_IDX_TAB_GET(hdl, type) \ - (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) - -#define IS_SA_BONUSTYPE(a) \ - ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) - -#define SA_BONUSTYPE_FROM_DB(db) \ - (dmu_get_bonustype((dmu_buf_t *)db)) - -#define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t)) - -#define SA_LAYOUT_NUM(x, type) \ - ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ - ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x)))) - - -#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length - -#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ - hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ - SA_REGISTERED_LEN(sa, attr)) - -#define SA_SET_HDR(hdr, num, size) \ - { \ - hdr->sa_magic = SA_MAGIC; \ - SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ - } - -#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ - { \ - bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ - bulk.sa_buftype = type; \ - bulk.sa_addr = \ - (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ - (uintptr_t)hdr); \ -} - -#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ - (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ - (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \ - sizeof (uint16_t), 8) : 0))) - -int sa_add_impl(sa_handle_t *, sa_attr_type_t, - uint32_t, sa_data_locator_t, void *, dmu_tx_t *); - -void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); -int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); - -void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); -int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, - uint16_t *, sa_hdr_phys_t *); - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SA_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ /dev/null @@ -1,969 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. - * Copyright (c) 2017, Intel Corporation. - */ - -#ifndef _SYS_SPA_H -#define _SYS_SPA_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Forward references that lots of things need. - */ -typedef struct spa spa_t; -typedef struct vdev vdev_t; -typedef struct metaslab metaslab_t; -typedef struct metaslab_group metaslab_group_t; -typedef struct metaslab_class metaslab_class_t; -typedef struct zio zio_t; -typedef struct zilog zilog_t; -typedef struct spa_aux_vdev spa_aux_vdev_t; -typedef struct ddt ddt_t; -typedef struct ddt_entry ddt_entry_t; -struct dsl_pool; -struct dsl_dataset; - -/* - * General-purpose 32-bit and 64-bit bitfield encodings. - */ -#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) -#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) -#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) -#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) - -#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) -#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) - -#define BF32_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1U << (len)); \ - ASSERT3U(low + len, <=, 32); \ - (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ -_NOTE(CONSTCOND) } while (0) - -#define BF64_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1ULL << (len)); \ - ASSERT3U(low + len, <=, 64); \ - ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ -_NOTE(CONSTCOND) } while (0) - -#define BF32_GET_SB(x, low, len, shift, bias) \ - ((BF32_GET(x, low, len) + (bias)) << (shift)) -#define BF64_GET_SB(x, low, len, shift, bias) \ - ((BF64_GET(x, low, len) + (bias)) << (shift)) - -#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) -#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) - -/* - * We currently support block sizes from 512 bytes to 16MB. 
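The BF32/BF64 helpers above are plain shift-and-mask bitfield accessors. Below is a stand-alone sketch of the encode/decode round trip, with P2PHASE() restated locally (it normally comes from sys/sysmacros.h) so the snippet compiles on its own; the 24-bit field at bit 32 mirrors the vdev field of a DVA word as used by the DVA_GET_VDEV()/DVA_SET_VDEV() macros later in this header:

/*
 * Stand-alone illustration of the BF64 bitfield helpers.
 * P2PHASE(x, align) yields x modulo align (align must be a power of two).
 */
#include <stdint.h>
#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
#define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))

int
main(void)
{
	/* Pack a 24-bit vdev id at bit 32 of a DVA word, then unpack it. */
	uint64_t dva_word = BF64_ENCODE(5ULL, 32, 24);

	printf("vdev = %llu\n",
	    (unsigned long long)BF64_DECODE(dva_word, 32, 24));
	return (0);
}
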
- * The benefits of larger blocks, and thus larger IO, need to be weighed - * against the cost of COWing a giant block to modify one byte, and the - * large latency of reading or writing a large block. - * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. - * - * Note that although the LSIZE field of the blkptr_t can store sizes up - * to 32MB, the dnode's dn_datablkszsec can only store sizes up to - * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. - */ -#define SPA_MINBLOCKSHIFT 9 -#define SPA_OLD_MAXBLOCKSHIFT 17 -#define SPA_MAXBLOCKSHIFT 24 -#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) -#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) -#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) - -/* - * Default maximum supported logical ashift. - * - * The current 8k allocation block size limit is due to the 8k - * aligned/sized operations performed by vdev_probe() on - * vdev_label->vl_pad2. Using another "safe region" for these tests - * would allow the limit to be raised to 16k, at the expense of - * only having 8 available uberblocks in the label area. - */ -#define SPA_MAXASHIFT 13 - -/* - * Default minimum supported logical ashift. - */ -#define SPA_MINASHIFT SPA_MINBLOCKSHIFT - -/* - * Size of block to hold the configuration data (a packed nvlist) - */ -#define SPA_CONFIG_BLOCKSIZE (1ULL << 14) - -/* - * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. - * The ASIZE encoding should be at least 64 times larger (6 more bits) - * to support up to 4-way RAID-Z mirror mode with worst-case gang block - * overhead, three DVAs per bp, plus one more bit in case we do anything - * else that expands the ASIZE. - */ -#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ -#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ -#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ - -#define SPA_COMPRESSBITS 7 -#define SPA_VDEVBITS 24 - -/* - * All SPA data is represented by 128-bit data virtual addresses (DVAs). - * The members of the dva_t should be considered opaque outside the SPA. - */ -typedef struct dva { - uint64_t dva_word[2]; -} dva_t; - -/* - * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. - */ -typedef struct zio_cksum { - uint64_t zc_word[4]; -} zio_cksum_t; - -/* - * Some checksums/hashes need a 256-bit initialization salt. This salt is kept - * secret and is suitable for use in MAC algorithms as the key. - */ -typedef struct zio_cksum_salt { - uint8_t zcs_bytes[32]; -} zio_cksum_salt_t; - -/* - * Each block is described by its DVAs, time of birth, checksum, etc. 
- * The word-by-word, bit-by-bit layout of the blkptr is as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | pad | vdev1 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 1 |G| offset1 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | pad | vdev2 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 3 |G| offset2 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | pad | vdev3 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 5 |G| offset3 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 8 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 9 | physical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | logical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * b | fill count | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * c | checksum[0] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * d | checksum[1] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * e | checksum[2] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * f | checksum[3] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Legend: - * - * vdev virtual device ID - * offset offset into virtual device - * LSIZE logical size - * PSIZE physical size (after compression) - * ASIZE allocated size (including RAID-Z parity and gang block headers) - * GRID RAID-Z layout information (reserved for future use) - * cksum checksum function - * comp compression function - * G gang block indicator - * B byteorder (endianness) - * D dedup - * X encryption (on version 30, which is not supported) - * E blkptr_t contains embedded data (see below) - * lvl level of indirection - * type DMU object type - * phys birth txg when dva[0] was written; zero if same as logical birth txg - * note that typically all the dva's would be written in this - * txg, but they could be different if they were moved by - * device removal. - * log. birth transaction group in which the block was logically born - * fill count number of non-zero blocks under this bp - * checksum[4] 256-bit checksum of the data this bp describes - */ - -/* - * "Embedded" blkptr_t's don't actually point to a block, instead they - * have a data payload embedded in the blkptr_t itself. See the comment - * in blkptr.c for more details. 
- * - * The blkptr_t is laid out as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | payload | - * 1 | payload | - * 2 | payload | - * 3 | payload | - * 4 | payload | - * 5 | payload | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | payload | - * 8 | payload | - * 9 | payload | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | logical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * b | payload | - * c | payload | - * d | payload | - * e | payload | - * f | payload | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Legend: - * - * payload contains the embedded data - * B (byteorder) byteorder (endianness) - * D (dedup) padding (set to zero) - * X encryption (set to zero; see above) - * E (embedded) set to one - * lvl indirection level - * type DMU object type - * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) - * comp compression function of payload - * PSIZE size of payload after compression, in bytes - * LSIZE logical size of payload, in bytes - * note that 25 bits is enough to store the largest - * "normal" BP's LSIZE (2^16 * 2^9) in bytes - * log. birth transaction group in which the block was logically born - * - * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded - * bp's they are stored in units of SPA_MINBLOCKSHIFT. - * Generally, the generic BP_GET_*() macros can be used on embedded BP's. - * The B, D, X, lvl, type, and comp fields are stored the same as with normal - * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must - * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before - * other macros, as they assert that they are only used on BP's of the correct - * "embedded-ness". - */ - -#define BPE_GET_ETYPE(bp) \ - (ASSERT(BP_IS_EMBEDDED(bp)), \ - BF64_GET((bp)->blk_prop, 40, 8)) -#define BPE_SET_ETYPE(bp, t) do { \ - ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET((bp)->blk_prop, 40, 8, t); \ -_NOTE(CONSTCOND) } while (0) - -#define BPE_GET_LSIZE(bp) \ - (ASSERT(BP_IS_EMBEDDED(bp)), \ - BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) -#define BPE_SET_LSIZE(bp, x) do { \ - ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BPE_GET_PSIZE(bp) \ - (ASSERT(BP_IS_EMBEDDED(bp)), \ - BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) -#define BPE_SET_PSIZE(bp, x) do { \ - ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -typedef enum bp_embedded_type { - BP_EMBEDDED_TYPE_DATA, - BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ - NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED -} bp_embedded_type_t; - -#define BPE_NUM_WORDS 14 -#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) -#define BPE_IS_PAYLOADWORD(bp, wp) \ - ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) - -#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ -#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ -#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ - -/* - * A block is a hole when it has either 1) never been written to, or - * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads - * without physically allocating disk space. 
Holes are represented in the - * blkptr_t structure by zeroed blk_dva. Correct checking for holes is - * done through the BP_IS_HOLE macro. For holes, the logical size, level, - * DMU object type, and birth times are all also stored for holes that - * were written to at some point (i.e. were punched after having been filled). - */ -typedef struct blkptr { - dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ - uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[2]; /* Extra space for the future */ - uint64_t blk_phys_birth; /* txg when block was allocated */ - uint64_t blk_birth; /* transaction group at birth */ - uint64_t blk_fill; /* fill count */ - zio_cksum_t blk_cksum; /* 256-bit checksum */ -} blkptr_t; - -/* - * Macros to get and set fields in a bp or DVA. - */ -#define DVA_GET_ASIZE(dva) \ - BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) -#define DVA_SET_ASIZE(dva, x) \ - BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ - SPA_MINBLOCKSHIFT, 0, x) - -#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) -#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) - -#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) -#define DVA_SET_VDEV(dva, x) \ - BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) - -#define DVA_GET_OFFSET(dva) \ - BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) -#define DVA_SET_OFFSET(dva, x) \ - BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) - -#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) -#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) - -#define BP_GET_LSIZE(bp) \ - (BP_IS_EMBEDDED(bp) ? \ - (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ - BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) -#define BP_SET_LSIZE(bp, x) do { \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, \ - 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BP_GET_PSIZE(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) -#define BP_SET_PSIZE(bp, x) do { \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, \ - 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BP_GET_COMPRESS(bp) \ - BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) -#define BP_SET_COMPRESS(bp, x) \ - BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) - -#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) -#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) - -#define BP_GET_CHECKSUM(bp) \ - (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ - BF64_GET((bp)->blk_prop, 40, 8)) -#define BP_SET_CHECKSUM(bp, x) do { \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BF64_SET((bp)->blk_prop, 40, 8, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) -#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) - -#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) -#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) - -#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) -#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) - -#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) -#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) - -#define BP_PHYSICAL_BIRTH(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - (bp)->blk_phys_birth ? 
(bp)->blk_phys_birth : (bp)->blk_birth) - -#define BP_SET_BIRTH(bp, logical, physical) \ -{ \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - (bp)->blk_birth = (logical); \ - (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ -} - -#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) - -#define BP_IS_METADATA(bp) \ - (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) - -#define BP_GET_ASIZE(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ - DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - DVA_GET_ASIZE(&(bp)->blk_dva[2])) - -#define BP_GET_UCSIZE(bp) \ - (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) - -#define BP_GET_NDVAS(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) - -#define BP_COUNT_GANG(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ - DVA_GET_GANG(&(bp)->blk_dva[1]) + \ - DVA_GET_GANG(&(bp)->blk_dva[2]))) - -#define DVA_EQUAL(dva1, dva2) \ - ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ - (dva1)->dva_word[0] == (dva2)->dva_word[0]) - -#define BP_EQUAL(bp1, bp2) \ - (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ - (bp1)->blk_birth == (bp2)->blk_birth && \ - DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ - DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ - DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) - -#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ - (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ - ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ - ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ - ((zc1).zc_word[3] - (zc2).zc_word[3]))) - -#define ZIO_CHECKSUM_IS_ZERO(zc) \ - (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \ - (zc)->zc_word[2] | (zc)->zc_word[3])) - -#define ZIO_CHECKSUM_BSWAP(zcp) \ -{ \ - (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \ - (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \ - (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \ - (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \ -} - - -#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) - -#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ -{ \ - (zcp)->zc_word[0] = w0; \ - (zcp)->zc_word[1] = w1; \ - (zcp)->zc_word[2] = w2; \ - (zcp)->zc_word[3] = w3; \ -} - -#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) -#define BP_IS_GANG(bp) \ - (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) -#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ - (dva)->dva_word[1] == 0ULL) -#define BP_IS_HOLE(bp) \ - (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) - -/* BP_IS_RAIDZ(bp) assumes no block compression */ -#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ - BP_GET_PSIZE(bp)) - -#define BP_ZERO(bp) \ -{ \ - (bp)->blk_dva[0].dva_word[0] = 0; \ - (bp)->blk_dva[0].dva_word[1] = 0; \ - (bp)->blk_dva[1].dva_word[0] = 0; \ - (bp)->blk_dva[1].dva_word[1] = 0; \ - (bp)->blk_dva[2].dva_word[0] = 0; \ - (bp)->blk_dva[2].dva_word[1] = 0; \ - (bp)->blk_prop = 0; \ - (bp)->blk_pad[0] = 0; \ - (bp)->blk_pad[1] = 0; \ - (bp)->blk_phys_birth = 0; \ - (bp)->blk_birth = 0; \ - (bp)->blk_fill = 0; \ - ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ -} - -#if BYTE_ORDER == _BIG_ENDIAN -#define ZFS_HOST_BYTEORDER (0ULL) -#else -#define ZFS_HOST_BYTEORDER (1ULL) -#endif - -#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) - -#define BP_SPRINTF_LEN 320 - -/* - * This macro allows code sharing between zfs, libzpool, and mdb. 
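A brief sketch of how the classification macros above compose; the helper name is hypothetical:

/*
 * Sketch: classify a block pointer using the accessors above.  A bp
 * with all-zero DVAs is a hole; an embedded bp carries its payload
 * inline in the blkptr_t itself.
 */
static const char *
bp_kind(const blkptr_t *bp)
{
	if (BP_IS_HOLE(bp))
		return ("hole");
	if (BP_IS_EMBEDDED(bp))
		return ("embedded");
	if (BP_IS_GANG(bp))
		return ("gang");
	return ("regular");
}
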
- * 'func' is either snprintf() or mdb_snprintf(). - * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. - */ -#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ -{ \ - static const char *copyname[] = \ - { "zero", "single", "double", "triple" }; \ - int len = 0; \ - int copies = 0; \ - \ - if (bp == NULL) { \ - len += func(buf + len, size - len, ""); \ - } else if (BP_IS_HOLE(bp)) { \ - len += func(buf + len, size - len, \ - "HOLE [L%llu %s] " \ - "size=%llxL birth=%lluL", \ - (u_longlong_t)BP_GET_LEVEL(bp), \ - type, \ - (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ - } else if (BP_IS_EMBEDDED(bp)) { \ - len = func(buf + len, size - len, \ - "EMBEDDED [L%llu %s] et=%u %s " \ - "size=%llxL/%llxP birth=%lluL", \ - (u_longlong_t)BP_GET_LEVEL(bp), \ - type, \ - (int)BPE_GET_ETYPE(bp), \ - compress, \ - (u_longlong_t)BPE_GET_LSIZE(bp), \ - (u_longlong_t)BPE_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ - } else { \ - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ - const dva_t *dva = &bp->blk_dva[d]; \ - if (DVA_IS_VALID(dva)) \ - copies++; \ - len += func(buf + len, size - len, \ - "DVA[%d]=<%llu:%llx:%llx>%c", d, \ - (u_longlong_t)DVA_GET_VDEV(dva), \ - (u_longlong_t)DVA_GET_OFFSET(dva), \ - (u_longlong_t)DVA_GET_ASIZE(dva), \ - ws); \ - } \ - if (BP_IS_GANG(bp) && \ - DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ - DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ - copies--; \ - len += func(buf + len, size - len, \ - "[L%llu %s] %s %s %s %s %s %s%c" \ - "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ - "cksum=%llx:%llx:%llx:%llx", \ - (u_longlong_t)BP_GET_LEVEL(bp), \ - type, \ - checksum, \ - compress, \ - BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ - BP_IS_GANG(bp) ? "gang" : "contiguous", \ - BP_GET_DEDUP(bp) ? "dedup" : "unique", \ - copyname[copies], \ - ws, \ - (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)BP_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth, \ - (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ - (u_longlong_t)BP_GET_FILL(bp), \ - ws, \ - (u_longlong_t)bp->blk_cksum.zc_word[0], \ - (u_longlong_t)bp->blk_cksum.zc_word[1], \ - (u_longlong_t)bp->blk_cksum.zc_word[2], \ - (u_longlong_t)bp->blk_cksum.zc_word[3]); \ - } \ - ASSERT(len < size); \ -} - -#define BP_GET_BUFC_TYPE(bp) \ - (BP_IS_METADATA(bp) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) - -typedef enum spa_import_type { - SPA_IMPORT_EXISTING, - SPA_IMPORT_ASSEMBLE -} spa_import_type_t; - -/* state manipulation functions */ -extern int spa_open(const char *pool, spa_t **, void *tag); -extern int spa_open_rewind(const char *pool, spa_t **, void *tag, - nvlist_t *policy, nvlist_t **config); -extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, - size_t buflen); -extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, - nvlist_t *zplprops); -#ifdef illumos -extern int spa_import_rootpool(char *devpath, char *devid); -#else -extern int spa_import_rootpool(const char *name); -#endif -extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, - uint64_t flags); -extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); -extern int spa_destroy(char *pool); -extern int spa_checkpoint(const char *pool); -extern int spa_checkpoint_discard(const char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, - boolean_t hardforce); -extern int spa_reset(char *pool); -extern void spa_async_request(spa_t *spa, int flag); -extern void spa_async_unrequest(spa_t *spa, int flag); -extern void spa_async_suspend(spa_t *spa); -extern void spa_async_resume(spa_t *spa); -extern spa_t *spa_inject_addref(char *pool); -extern void spa_inject_delref(spa_t *spa); -extern void spa_scan_stat_init(spa_t *spa); -extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); - -#define SPA_ASYNC_CONFIG_UPDATE 0x01 -#define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 -#define SPA_ASYNC_RESILVER_DONE 0x08 -#define SPA_ASYNC_RESILVER 0x10 -#define SPA_ASYNC_AUTOEXPAND 0x20 -#define SPA_ASYNC_REMOVE_DONE 0x40 -#define SPA_ASYNC_REMOVE_STOP 0x80 -#define SPA_ASYNC_INITIALIZE_RESTART 0x100 - -/* - * Controls the behavior of spa_vdev_remove(). 
- */ -#define SPA_REMOVE_UNSPARE 0x01 -#define SPA_REMOVE_DONE 0x02 - -/* device manipulation */ -extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); -extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, - int replacing); -extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, - int replace_done); -extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); -extern boolean_t spa_vdev_remove_active(spa_t *spa); -extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type); -extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); -extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); -extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - nvlist_t *props, boolean_t exp); - -/* spare state (which is global across all pools) */ -extern void spa_spare_add(vdev_t *vd); -extern void spa_spare_remove(vdev_t *vd); -extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); -extern void spa_spare_activate(vdev_t *vd); - -/* L2ARC state (which is global across all pools) */ -extern void spa_l2cache_add(vdev_t *vd); -extern void spa_l2cache_remove(vdev_t *vd); -extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); -extern void spa_l2cache_activate(vdev_t *vd); -extern void spa_l2cache_drop(spa_t *spa); - -/* scanning */ -extern int spa_scan(spa_t *spa, pool_scan_func_t func); -extern int spa_scan_stop(spa_t *spa); -extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); - -/* spa syncing */ -extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ -extern void spa_sync_allpools(void); - -/* spa namespace global mutex */ -extern kmutex_t spa_namespace_lock; - -/* - * SPA configuration functions in spa_config.c - */ - -#define SPA_CONFIG_UPDATE_POOL 0 -#define SPA_CONFIG_UPDATE_VDEVS 1 - -extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); -extern void spa_config_load(void); -extern nvlist_t *spa_all_configs(uint64_t *); -extern void spa_config_set(spa_t *spa, nvlist_t *config); -extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, - int getstats); -extern void spa_config_update(spa_t *spa, int what); - -/* - * Miscellaneous SPA routines in spa_misc.c - */ - -/* Namespace manipulation */ -extern spa_t *spa_lookup(const char *name); -extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); -extern void spa_remove(spa_t *spa); -extern spa_t *spa_next(spa_t *prev); - -/* Refcount functions */ -extern void spa_open_ref(spa_t *spa, void *tag); -extern void spa_close(spa_t *spa, void *tag); -extern void spa_async_close(spa_t *spa, void *tag); -extern boolean_t spa_refcount_zero(spa_t *spa); - -#define SCL_NONE 0x00 -#define SCL_CONFIG 0x01 -#define SCL_STATE 0x02 -#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ -#define SCL_ALLOC 0x08 -#define SCL_ZIO 0x10 -#define SCL_FREE 0x20 -#define SCL_VDEV 0x40 -#define SCL_LOCKS 7 -#define SCL_ALL ((1 << SCL_LOCKS) - 1) -#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) - -/* Pool configuration locks */ -extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); -extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); -extern void spa_config_exit(spa_t *spa, int locks, void *tag); -extern int spa_config_held(spa_t *spa, int locks, krw_t rw); - -/* Pool vdev add/remove lock */ -extern uint64_t spa_vdev_enter(spa_t *spa); -extern uint64_t spa_vdev_config_enter(spa_t *spa); 
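The SCL_* bits and spa_config_enter()/spa_config_exit() declared above follow the same holder-tag convention as the refcount and rrwlock interfaces earlier in this patch. A minimal sketch, with a hypothetical helper:

/*
 * Sketch: take the pool configuration lock as a reader around a
 * read-only inspection of pool state.  The tag passed to
 * spa_config_exit() must match the one passed to spa_config_enter().
 */
static void
inspect_pool_config(spa_t *spa)
{
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	/* ... read-only walk of the cached config / vdev tree ... */
	spa_config_exit(spa, SCL_CONFIG, FTAG);
}
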
-extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, - int error, char *tag); -extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); - -/* Pool vdev state change lock */ -extern void spa_vdev_state_enter(spa_t *spa, int oplock); -extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); - -/* Log state */ -typedef enum spa_log_state { - SPA_LOG_UNKNOWN = 0, /* unknown log state */ - SPA_LOG_MISSING, /* missing log(s) */ - SPA_LOG_CLEAR, /* clear the log(s) */ - SPA_LOG_GOOD, /* log(s) are good */ -} spa_log_state_t; - -extern spa_log_state_t spa_get_log_state(spa_t *spa); -extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); -extern int spa_reset_logs(spa_t *spa); - -/* Log claim callback */ -extern void spa_claim_notify(zio_t *zio); - -/* Accessor functions */ -extern boolean_t spa_shutting_down(spa_t *spa); -extern struct dsl_pool *spa_get_dsl(spa_t *spa); -extern boolean_t spa_is_initializing(spa_t *spa); -extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); -extern blkptr_t *spa_get_rootblkptr(spa_t *spa); -extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); -extern void spa_altroot(spa_t *, char *, size_t); -extern int spa_sync_pass(spa_t *spa); -extern char *spa_name(spa_t *spa); -extern uint64_t spa_guid(spa_t *spa); -extern uint64_t spa_load_guid(spa_t *spa); -extern uint64_t spa_last_synced_txg(spa_t *spa); -extern uint64_t spa_first_txg(spa_t *spa); -extern uint64_t spa_syncing_txg(spa_t *spa); -extern uint64_t spa_final_dirty_txg(spa_t *spa); -extern uint64_t spa_version(spa_t *spa); -extern pool_state_t spa_state(spa_t *spa); -extern spa_load_state_t spa_load_state(spa_t *spa); -extern uint64_t spa_freeze_txg(spa_t *spa); -extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); -extern uint64_t spa_get_dspace(spa_t *spa); -extern uint64_t spa_get_checkpoint_space(spa_t *spa); -extern uint64_t spa_get_slop_space(spa_t *spa); -extern void spa_update_dspace(spa_t *spa); -extern uint64_t spa_version(spa_t *spa); -extern boolean_t spa_deflate(spa_t *spa); -extern metaslab_class_t *spa_normal_class(spa_t *spa); -extern metaslab_class_t *spa_log_class(spa_t *spa); -extern metaslab_class_t *spa_special_class(spa_t *spa); -extern metaslab_class_t *spa_dedup_class(spa_t *spa); -extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, - dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); - -extern void spa_evicting_os_register(spa_t *, objset_t *os); -extern void spa_evicting_os_deregister(spa_t *, objset_t *os); -extern void spa_evicting_os_wait(spa_t *spa); -extern int spa_max_replication(spa_t *spa); -extern int spa_prev_software_version(spa_t *spa); -extern int spa_busy(void); -extern uint8_t spa_get_failmode(spa_t *spa); -extern boolean_t spa_suspended(spa_t *spa); -extern uint64_t spa_bootfs(spa_t *spa); -extern uint64_t spa_delegation(spa_t *spa); -extern objset_t *spa_meta_objset(spa_t *spa); -extern uint64_t spa_deadman_synctime(spa_t *spa); -extern struct proc *spa_proc(spa_t *spa); -extern uint64_t spa_dirty_data(spa_t *spa); - -/* Miscellaneous support routines */ -extern void spa_load_failed(spa_t *spa, const char *fmt, ...); -extern void spa_load_note(spa_t *spa, const char *fmt, ...); -extern void spa_activate_mos_feature(spa_t *spa, const char *feature, - dmu_tx_t *tx); -extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); -extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); -extern boolean_t spa_guid_exists(uint64_t 
pool_guid, uint64_t device_guid); -extern char *spa_strdup(const char *); -extern void spa_strfree(char *); -extern uint64_t spa_get_random(uint64_t range); -extern uint64_t spa_generate_guid(spa_t *spa); -extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); -extern void spa_freeze(spa_t *spa); -extern int spa_change_guid(spa_t *spa); -extern void spa_upgrade(spa_t *spa, uint64_t version); -extern void spa_evict_all(void); -extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, - boolean_t l2cache); -extern boolean_t spa_has_spare(spa_t *, uint64_t guid); -extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); -extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); -extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); -extern boolean_t spa_has_slogs(spa_t *spa); -extern boolean_t spa_is_root(spa_t *spa); -extern boolean_t spa_writeable(spa_t *spa); -extern boolean_t spa_has_pending_synctask(spa_t *spa); -extern int spa_maxblocksize(spa_t *spa); -extern int spa_maxdnodesize(spa_t *spa); -extern boolean_t spa_multihost(spa_t *spa); -extern unsigned long spa_get_hostid(void); -extern boolean_t spa_has_checkpoint(spa_t *spa); -extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); -extern boolean_t spa_suspend_async_destroy(spa_t *spa); -extern uint64_t spa_min_claim_txg(spa_t *spa); -extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); -extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, - const blkptr_t *bp); -typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, - void *arg); -extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, - spa_remap_cb_t callback, void *arg); -extern uint64_t spa_get_last_removal_txg(spa_t *spa); -extern boolean_t spa_trust_config(spa_t *spa); -extern uint64_t spa_missing_tvds_allowed(spa_t *spa); -extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); -extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); -extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); - -extern int spa_mode(spa_t *spa); -extern uint64_t zfs_strtonum(const char *str, char **nptr); - -extern char *spa_his_ievent_table[]; - -extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); -extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, - char *his_buf); -extern int spa_history_log(spa_t *spa, const char *his_buf); -extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); -extern void spa_history_log_version(spa_t *spa, const char *operation); -extern void spa_history_log_internal(spa_t *spa, const char *operation, - dmu_tx_t *tx, const char *fmt, ...); -extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, - dmu_tx_t *tx, const char *fmt, ...); -extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, - dmu_tx_t *tx, const char *fmt, ...); - -/* error handling */ -struct zbookmark_phys; -extern void spa_log_error(spa_t *spa, zio_t *zio); -extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd, - zio_t *zio, uint64_t stateoroffset, uint64_t length); -extern void zfs_post_remove(spa_t *spa, vdev_t *vd); -extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); -extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); -extern uint64_t spa_get_errlog_size(spa_t *spa); -extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); -extern void spa_errlog_rotate(spa_t *spa); -extern void spa_errlog_drain(spa_t *spa); -extern void spa_errlog_sync(spa_t 
*spa, uint64_t txg); -extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); - -/* vdev cache */ -extern void vdev_cache_stat_init(void); -extern void vdev_cache_stat_fini(void); - -/* Initialization and termination */ -extern void spa_init(int flags); -extern void spa_fini(void); -extern void spa_boot_init(void); - -/* properties */ -extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); -extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); -extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); -extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); - -/* asynchronous event notification */ -extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, - const char *name); -extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, - const char *name); -extern void spa_event_post(sysevent_t *ev); -extern void spa_event_discard(sysevent_t *ev); - -#ifdef ZFS_DEBUG -#define dprintf_bp(bp, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ - dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ - kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_bp(bp, fmt, ...) -#endif - -extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#ifndef _SYS_SPA_BOOT_H -#define _SYS_SPA_BOOT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern char *spa_get_bootprop(char *prop); -extern void spa_free_bootprop(char *prop); - -extern void spa_arch_init(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_BOOT_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_checkpoint.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_SPA_CHECKPOINT_H -#define _SYS_SPA_CHECKPOINT_H - -#include - -typedef struct spa_checkpoint_info { - uint64_t sci_timestamp; /* when checkpointed uberblock was synced */ - uint64_t sci_dspace; /* disk space used by checkpoint in bytes */ -} spa_checkpoint_info_t; - -int spa_checkpoint(const char *); -int spa_checkpoint_discard(const char *); - -boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *); -void spa_checkpoint_discard_thread(void *, zthr_t *); - -int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *); - -#endif /* _SYS_SPA_CHECKPOINT_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ /dev/null @@ -1,435 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2017 Datto Inc. - * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
- */ - -#ifndef _SYS_SPA_IMPL_H -#define _SYS_SPA_IMPL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct spa_error_entry { - zbookmark_phys_t se_bookmark; - char *se_name; - avl_node_t se_avl; -} spa_error_entry_t; - -typedef struct spa_history_phys { - uint64_t sh_pool_create_len; /* ending offset of zpool create */ - uint64_t sh_phys_max_off; /* physical EOF */ - uint64_t sh_bof; /* logical BOF */ - uint64_t sh_eof; /* logical EOF */ - uint64_t sh_records_lost; /* num of records overwritten */ -} spa_history_phys_t; - -/* - * All members must be uint64_t, for byteswap purposes. - */ -typedef struct spa_removing_phys { - uint64_t sr_state; /* dsl_scan_state_t */ - - /* - * The vdev ID that we most recently attempted to remove, - * or -1 if no removal has been attempted. - */ - uint64_t sr_removing_vdev; - - /* - * The vdev ID that we most recently successfully removed, - * or -1 if no devices have been removed. - */ - uint64_t sr_prev_indirect_vdev; - - uint64_t sr_start_time; - uint64_t sr_end_time; - - /* - * Note that we can not use the space map's or indirect mapping's - * accounting as a substitute for these values, because we need to - * count frees of not-yet-copied data as though it did the copy. - * Otherwise, we could get into a situation where copied > to_copy, - * or we complete before copied == to_copy. - */ - uint64_t sr_to_copy; /* bytes that need to be copied */ - uint64_t sr_copied; /* bytes that have been copied or freed */ -} spa_removing_phys_t; - -/* - * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT - * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense - * of an indirect vdev's mapping object is in progress. - */ -typedef struct spa_condensing_indirect_phys { - /* - * The vdev ID of the indirect vdev whose indirect mapping is - * being condensed. - */ - uint64_t scip_vdev; - - /* - * The vdev's old obsolete spacemap. This spacemap's contents are - * being integrated into the new mapping. - */ - uint64_t scip_prev_obsolete_sm_object; - - /* - * The new mapping object that is being created. - */ - uint64_t scip_next_mapping_object; -} spa_condensing_indirect_phys_t; - -struct spa_aux_vdev { - uint64_t sav_object; /* MOS object for device list */ - nvlist_t *sav_config; /* cached device config */ - vdev_t **sav_vdevs; /* devices */ - int sav_count; /* number devices */ - boolean_t sav_sync; /* sync the device list */ - nvlist_t **sav_pending; /* pending device additions */ - uint_t sav_npending; /* # pending devices */ -}; - -typedef struct spa_config_lock { - kmutex_t scl_lock; - kthread_t *scl_writer; - int scl_write_wanted; - kcondvar_t scl_cv; - zfs_refcount_t scl_count; -} spa_config_lock_t; - -typedef struct spa_config_dirent { - list_node_t scd_link; - char *scd_path; -} spa_config_dirent_t; - -typedef enum zio_taskq_type { - ZIO_TASKQ_ISSUE = 0, - ZIO_TASKQ_ISSUE_HIGH, - ZIO_TASKQ_INTERRUPT, - ZIO_TASKQ_INTERRUPT_HIGH, - ZIO_TASKQ_TYPES -} zio_taskq_type_t; - -/* - * State machine for the zpool-poolname process. 
The states transitions - * are done as follows: - * - * From To Routine - * PROC_NONE -> PROC_CREATED spa_activate() - * PROC_CREATED -> PROC_ACTIVE spa_thread() - * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() - * PROC_DEACTIVATE -> PROC_GONE spa_thread() - * PROC_GONE -> PROC_NONE spa_deactivate() - */ -typedef enum spa_proc_state { - SPA_PROC_NONE, /* spa_proc = &p0, no process created */ - SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ - SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ - SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ - SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ -} spa_proc_state_t; - -typedef struct spa_taskqs { - uint_t stqs_count; - taskq_t **stqs_taskq; -} spa_taskqs_t; - -typedef enum spa_all_vdev_zap_action { - AVZ_ACTION_NONE = 0, - AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ - AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */ - AVZ_ACTION_INITIALIZE -} spa_avz_action_t; - -typedef enum spa_config_source { - SPA_CONFIG_SRC_NONE = 0, - SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */ - SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */ - SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */ - SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */ - SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */ -} spa_config_source_t; - -struct spa { - /* - * Fields protected by spa_namespace_lock. - */ - char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */ - char *spa_comment; /* comment */ - avl_node_t spa_avl; /* node in spa_namespace_avl */ - nvlist_t *spa_config; /* last synced config */ - nvlist_t *spa_config_syncing; /* currently syncing config */ - nvlist_t *spa_config_splitting; /* config for splitting */ - nvlist_t *spa_load_info; /* info and errors from load */ - uint64_t spa_config_txg; /* txg of last config change */ - int spa_sync_pass; /* iterate-to-convergence */ - pool_state_t spa_state; /* pool state */ - int spa_inject_ref; /* injection references */ - uint8_t spa_sync_on; /* sync threads are running */ - spa_load_state_t spa_load_state; /* current load operation */ - boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ - boolean_t spa_trust_config; /* do we trust vdev tree? */ - spa_config_source_t spa_config_source; /* where config comes from? */ - uint64_t spa_import_flags; /* import specific flags */ - spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; - dsl_pool_t *spa_dsl_pool; - boolean_t spa_is_initializing; /* true while opening pool */ - metaslab_class_t *spa_normal_class; /* normal data class */ - metaslab_class_t *spa_log_class; /* intent log data class */ - metaslab_class_t *spa_special_class; /* special allocation class */ - metaslab_class_t *spa_dedup_class; /* dedup allocation class */ - uint64_t spa_first_txg; /* first txg after spa_open() */ - uint64_t spa_final_txg; /* txg of export/destroy */ - uint64_t spa_freeze_txg; /* freeze pool at this txg */ - uint64_t spa_load_max_txg; /* best initial ub_txg */ - uint64_t spa_claim_max_txg; /* highest claimed birth txg */ - timespec_t spa_loaded_ts; /* 1st successful open time */ - objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ - kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ - list_t spa_evicting_os_list; /* Objsets being evicted. 
*/ - kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ - txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ - vdev_t *spa_root_vdev; /* top-level vdev container */ - int spa_min_ashift; /* of vdevs in normal class */ - int spa_max_ashift; /* of vdevs in normal class */ - uint64_t spa_config_guid; /* config pool guid */ - uint64_t spa_load_guid; /* spa_load initialized guid */ - uint64_t spa_last_synced_guid; /* last synced guid */ - list_t spa_config_dirty_list; /* vdevs with dirty config */ - list_t spa_state_dirty_list; /* vdevs with dirty state */ - /* - * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are - * stored in spa_alloc_count. There is one tree and one lock for each - * allocator, to help improve allocation performance in write-heavy - * workloads. - */ - kmutex_t *spa_alloc_locks; - avl_tree_t *spa_alloc_trees; - int spa_alloc_count; - - spa_aux_vdev_t spa_spares; /* hot spares */ - spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ - nvlist_t *spa_label_features; /* Features for reading MOS */ - uint64_t spa_config_object; /* MOS object for pool config */ - uint64_t spa_config_generation; /* config generation number */ - uint64_t spa_syncing_txg; /* txg currently syncing */ - bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ - bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ - zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ - /* checksum context templates */ - kmutex_t spa_cksum_tmpls_lock; - void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; - uberblock_t spa_ubsync; /* last synced uberblock */ - uberblock_t spa_uberblock; /* current uberblock */ - boolean_t spa_extreme_rewind; /* rewind past deferred frees */ - uint64_t spa_last_io; /* lbolt of last non-scan I/O */ - kmutex_t spa_scrub_lock; /* resilver/scrub lock */ - uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ - uint64_t spa_load_verify_ios; /* in-flight verifications IOs */ - kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ - uint8_t spa_scrub_active; /* active or suspended? */ - uint8_t spa_scrub_type; /* type of scrub we're doing */ - uint8_t spa_scrub_finished; /* indicator to rotate logs */ - uint8_t spa_scrub_started; /* started since last boot */ - uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ - uint64_t spa_scan_pass_start; /* start time per pass/reboot */ - uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ - uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ - uint64_t spa_scan_pass_exam; /* examined bytes per pass */ - uint64_t spa_scan_pass_issued; /* issued bytes per pass */ - kmutex_t spa_async_lock; /* protect async state */ - kthread_t *spa_async_thread; /* thread doing async task */ - kthread_t *spa_async_thread_vd; /* thread doing vd async task */ - int spa_async_suspended; /* async tasks suspended */ - kcondvar_t spa_async_cv; /* wait for thread_exit() */ - uint16_t spa_async_tasks; /* async task mask */ - uint64_t spa_missing_tvds; /* unopenable tvds on load */ - uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ - - spa_removing_phys_t spa_removing_phys; - spa_vdev_removal_t *spa_vdev_removal; - - spa_condensing_indirect_phys_t spa_condensing_indirect_phys; - spa_condensing_indirect_t *spa_condensing_indirect; - zthr_t *spa_condense_zthr; /* zthr doing condense. 
*/ - - uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ - spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ - zthr_t *spa_checkpoint_discard_zthr; - - char *spa_root; /* alternate root directory */ - uint64_t spa_ena; /* spa-wide ereport ENA */ - int spa_last_open_failed; /* error if last open failed */ - uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ - uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ - uint64_t spa_load_txg; /* ub txg that loaded */ - uint64_t spa_load_txg_ts; /* timestamp from that ub */ - uint64_t spa_load_meta_errors; /* verify metadata err count */ - uint64_t spa_load_data_errors; /* verify data err count */ - uint64_t spa_verify_min_txg; /* start txg of verify scrub */ - kmutex_t spa_errlog_lock; /* error log lock */ - uint64_t spa_errlog_last; /* last error log object */ - uint64_t spa_errlog_scrub; /* scrub error log object */ - kmutex_t spa_errlist_lock; /* error list/ereport lock */ - avl_tree_t spa_errlist_last; /* last error list */ - avl_tree_t spa_errlist_scrub; /* scrub error list */ - uint64_t spa_deflate; /* should we deflate? */ - uint64_t spa_history; /* history object */ - kmutex_t spa_history_lock; /* history lock */ - vdev_t *spa_pending_vdev; /* pending vdev additions */ - kmutex_t spa_props_lock; /* property lock */ - uint64_t spa_pool_props_object; /* object for properties */ - uint64_t spa_bootfs; /* default boot filesystem */ - uint64_t spa_failmode; /* failure mode for the pool */ - uint64_t spa_delegation; /* delegation on/off */ - list_t spa_config_list; /* previous cache file(s) */ - /* per-CPU array of root of async I/O: */ - zio_t **spa_async_zio_root; - zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ - zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ - kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ - kcondvar_t spa_suspend_cv; /* notification of resume */ - zio_suspend_reason_t spa_suspended; /* pool is suspended */ - uint8_t spa_claiming; /* pool is doing zil_claim() */ - boolean_t spa_is_root; /* pool is root */ - int spa_minref; /* num refs when first opened */ - int spa_mode; /* FREAD | FWRITE */ - spa_log_state_t spa_log_state; /* log state */ - uint64_t spa_autoexpand; /* lun expansion on/off */ - uint64_t spa_bootsize; /* efi system partition size */ - ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ - uint64_t spa_ddt_stat_object; /* DDT statistics */ - uint64_t spa_dedup_ditto; /* dedup ditto threshold */ - uint64_t spa_dedup_checksum; /* default dedup checksum */ - uint64_t spa_dspace; /* dspace in normal class */ - kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ - kmutex_t spa_proc_lock; /* protects spa_proc* */ - kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ - spa_proc_state_t spa_proc_state; /* see definition */ - struct proc *spa_proc; /* "zpool-poolname" process */ - uint64_t spa_did; /* if procp != p0, did of t1 */ - kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */ - kmutex_t spa_trim_lock; /* protects spa_trim_cv */ - kcondvar_t spa_trim_cv; /* used to notify TRIM thread */ - boolean_t spa_autoreplace; /* autoreplace set in open */ - int spa_vdev_locks; /* locks grabbed */ - uint64_t spa_creation_version; /* version at pool creation */ - uint64_t spa_prev_software_version; /* See ub_software_version */ - uint64_t spa_feat_for_write_obj; /* required to write to pool */ - uint64_t spa_feat_for_read_obj; /* required to read from pool */ - uint64_t spa_feat_desc_obj; /* Feature descriptions */ - 
uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ - kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ - nvlist_t *spa_feat_stats; /* Cache of enabled features */ - /* cache feature refcounts */ - uint64_t spa_feat_refcount_cache[SPA_FEATURES]; -#ifdef illumos - cyclic_id_t spa_deadman_cycid; /* cyclic id */ -#else /* !illumos */ -#ifdef _KERNEL - struct callout spa_deadman_cycid; /* callout id */ - struct task spa_deadman_task; -#endif -#endif /* illumos */ - uint64_t spa_deadman_calls; /* number of deadman calls */ - hrtime_t spa_sync_starttime; /* starting time fo spa_sync */ - uint64_t spa_deadman_synctime; /* deadman expiration timer */ - uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ - spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ - -#ifdef illumos - /* - * spa_iokstat_lock protects spa_iokstat and - * spa_queue_stats[]. - */ - kmutex_t spa_iokstat_lock; - struct kstat *spa_iokstat; /* kstat of io to this pool */ - struct { - int spa_active; - int spa_queued; - } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE]; -#endif - /* arc_memory_throttle() parameters during low memory condition */ - uint64_t spa_lowmem_page_load; /* memory load during txg */ - uint64_t spa_lowmem_last_txg; /* txg window start */ - - hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ - - taskq_t *spa_zvol_taskq; /* Taskq for minor management */ - - uint64_t spa_multihost; /* multihost aware (mmp) */ - mmp_thread_t spa_mmp; /* multihost mmp thread */ - list_t spa_leaf_list; /* list of leaf vdevs */ - uint64_t spa_leaf_list_gen; /* track leaf_list changes */ - - /* - * spa_refcount & spa_config_lock must be the last elements - * because refcount_t changes size based on compilation options. - * because zfs_refcount_t changes size based on compilation options. - * In order for the MDB module to function correctly, the other - * fields must remain in the same location. - */ - spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ - zfs_refcount_t spa_refcount; /* number of opens */ -#ifndef illumos - boolean_t spa_splitting_newspa; /* creating new spa in split */ -#endif -}; - -extern const char *spa_config_path; - -extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); -extern void spa_load_spares(spa_t *spa); -extern void spa_load_l2cache(spa_t *spa); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - */ - -#ifndef _SYS_SPACE_MAP_H -#define _SYS_SPACE_MAP_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The size of the space map object has increased to include a histogram. - * The SPACE_MAP_SIZE_V0 designates the original size and is used to - * maintain backward compatibility. - */ -#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t)) -#define SPACE_MAP_HISTOGRAM_SIZE 32 - -/* - * The space_map_phys is the on-disk representation of the space map. - * Consumers of space maps should never reference any of the members of this - * structure directly. These members may only be updated in syncing context. - * - * Note the smp_object is no longer used but remains in the structure - * for backward compatibility. - */ -typedef struct space_map_phys { - /* object number: not needed but kept for backwards compatibility */ - uint64_t smp_object; - - /* length of the object in bytes */ - uint64_t smp_length; - - /* space allocated from the map */ - int64_t smp_alloc; - - /* reserved */ - uint64_t smp_pad[5]; - - /* - * The smp_histogram maintains a histogram of free regions. Each - * bucket, smp_histogram[i], contains the number of free regions - * whose size is: - * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) - */ - uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; -} space_map_phys_t; - -/* - * The space map object defines a region of space, its size, how much is - * allocated, and the on-disk object that stores this information. - * Consumers of space maps may only access the members of this structure. - * - * Note: the space_map may not be accessed concurrently; consumers - * must provide external locking if required. - */ -typedef struct space_map { - uint64_t sm_start; /* start of map */ - uint64_t sm_size; /* size of map */ - uint8_t sm_shift; /* unit shift */ - objset_t *sm_os; /* objset for this map */ - uint64_t sm_object; /* object id for this map */ - uint32_t sm_blksz; /* block size for space map */ - dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */ - space_map_phys_t *sm_phys; /* on-disk space map */ -} space_map_t; - -/* - * debug entry - * - * 2 2 10 50 - * +-----+-----+------------+----------------------------------+ - * | 1 0 | act | syncpass | txg (lower bits) | - * +-----+-----+------------+----------------------------------+ - * 63 62 61 60 59 50 49 0 - * - * - * one-word entry - * - * 1 47 1 15 - * +-----------------------------------------------------------+ - * | 0 | offset (sm_shift units) | type | run | - * +-----------------------------------------------------------+ - * 63 62 16 15 14 0 - * - * - * two-word entry - * - * 2 2 36 24 - * +-----+-----+---------------------------+-------------------+ - * | 1 1 | pad | run | vdev | - * +-----+-----+---------------------------+-------------------+ - * 63 62 61 60 59 24 23 0 - * - * 1 63 - * +------+----------------------------------------------------+ - * | type | offset | - * +------+----------------------------------------------------+ - * 63 62 0 - * - * Note that a two-word entry will not strandle a block boundary. 
- * If necessary, the last word of a block will be padded with a - * debug entry (with act = syncpass = txg = 0). - */ - -typedef enum { - SM_ALLOC, - SM_FREE -} maptype_t; - -typedef struct space_map_entry { - maptype_t sme_type; - uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ - uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ - uint64_t sme_run; /* max is 2^36; units of sm_shift */ -} space_map_entry_t; - -#define SM_NO_VDEVID (1 << SPA_VDEVBITS) - -/* one-word entry constants */ -#define SM_DEBUG_PREFIX 2 -#define SM_OFFSET_BITS 47 -#define SM_RUN_BITS 15 - -/* two-word entry constants */ -#define SM2_PREFIX 3 -#define SM2_OFFSET_BITS 63 -#define SM2_RUN_BITS 36 - -#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) -#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) - -#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) -#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) -#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) -#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) -#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) -#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) - -#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS) -#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) -#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) -#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) -#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) -#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) -#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) -#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) - -#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1) -#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS) -#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS) -#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS) -#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) -#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) -#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) -#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS) -#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) -#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) - -boolean_t sm_entry_is_debug(uint64_t e); -boolean_t sm_entry_is_single_word(uint64_t e); -boolean_t sm_entry_is_double_word(uint64_t e); - -typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); - -int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); -int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t length); -int space_map_iterate(space_map_t *sm, uint64_t length, - sm_cb_t callback, void *arg); -int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, - dmu_tx_t *tx); - -boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt); -void space_map_histogram_clear(space_map_t *sm); -void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, - dmu_tx_t *tx); - -uint64_t space_map_object(space_map_t *sm); -int64_t space_map_allocated(space_map_t *sm); -uint64_t space_map_length(space_map_t *sm); - -void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t vdev_id, dmu_tx_t *tx); -uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, - uint64_t vdev_id); -void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); -uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); -void space_map_free(space_map_t *sm, dmu_tx_t *tx); -void 
space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx); - -int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, - uint64_t start, uint64_t size, uint8_t shift); -void space_map_close(space_map_t *sm); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPACE_MAP_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_reftree.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#ifndef _SYS_SPACE_REFTREE_H -#define _SYS_SPACE_REFTREE_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct space_ref { - avl_node_t sr_node; /* AVL node */ - uint64_t sr_offset; /* range offset (start or end) */ - int64_t sr_refcnt; /* associated reference count */ -} space_ref_t; - -void space_reftree_create(avl_tree_t *t); -void space_reftree_destroy(avl_tree_t *t); -void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, - int64_t refcnt); -void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt); -void space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, - int64_t minref); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPACE_REFTREE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2012 Pawel Jakub Dawidek . - * All rights reserved. 
- */ - -#ifndef _SYS_TRIM_MAP_H -#define _SYS_TRIM_MAP_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern void trim_map_create(vdev_t *vd); -extern void trim_map_destroy(vdev_t *vd); -extern void trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg); -extern boolean_t trim_map_write_start(zio_t *zio); -extern void trim_map_write_done(zio_t *zio); - -extern void trim_thread_create(spa_t *spa); -extern void trim_thread_destroy(spa_t *spa); -extern void trim_thread_wakeup(spa_t *spa); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_TRIM_MAP_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_TXG_H -#define _SYS_TXG_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ -#define TXG_SIZE 4 /* next power of 2 */ -#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ -#define TXG_INITIAL TXG_SIZE /* initial txg */ -#define TXG_IDX (txg & TXG_MASK) - -/* Number of txgs worth of frees we defer adding to in-core spacemaps */ -#define TXG_DEFER_SIZE 2 - -typedef struct tx_cpu tx_cpu_t; - -typedef struct txg_handle { - tx_cpu_t *th_cpu; - uint64_t th_txg; -} txg_handle_t; - -typedef struct txg_node { - struct txg_node *tn_next[TXG_SIZE]; - uint8_t tn_member[TXG_SIZE]; -} txg_node_t; - -typedef struct txg_list { - kmutex_t tl_lock; - size_t tl_offset; - spa_t *tl_spa; - txg_node_t *tl_head[TXG_SIZE]; -} txg_list_t; - -struct dsl_pool; - -extern void txg_init(struct dsl_pool *dp, uint64_t txg); -extern void txg_fini(struct dsl_pool *dp); -extern void txg_sync_start(struct dsl_pool *dp); -extern void txg_sync_stop(struct dsl_pool *dp); -extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); -extern void txg_rele_to_quiesce(txg_handle_t *txghp); -extern void txg_rele_to_sync(txg_handle_t *txghp); -extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); - -extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, - hrtime_t resolution); -extern void txg_kick(struct dsl_pool *dp); - -/* - * Wait until the given transaction group has finished syncing. - * Try to make this happen as soon as possible (eg. kick off any - * necessary syncs immediately). If txg==0, wait for the currently open - * txg to finish syncing. 
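 *
 * A minimal usage sketch (an assumed caller pattern, not taken from this
 * change; it presumes an objset_t *os obtained elsewhere): a caller that
 * needs its changes on stable storage commits a transaction and then
 * waits for the txg that transaction was assigned to:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 *	uint64_t txg = dmu_tx_get_txg(tx);
 *	... dirty objects under this tx ...
 *	dmu_tx_commit(tx);
 *	txg_wait_synced(dmu_objset_pool(os), txg);
 *
 * Once txg_wait_synced() returns, everything committed in that txg has
 * been written out by the sync thread.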
- */ -extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); - -/* - * Wait as above. Returns true if the thread was signaled while waiting. - */ -extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg); - -/* - * Wait until the given transaction group, or one after it, is - * the open transaction group. Try to make this happen as soon - * as possible (eg. kick off any necessary syncs immediately). - * If txg == 0, wait for the next open txg. - */ -extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); - -/* - * Returns TRUE if we are "backed up" waiting for the syncing - * transaction to complete; otherwise returns FALSE. - */ -extern boolean_t txg_stalled(struct dsl_pool *dp); - -/* returns TRUE if someone is waiting for the next txg to sync */ -extern boolean_t txg_sync_waiting(struct dsl_pool *dp); - -extern void txg_verify(spa_t *spa, uint64_t txg); - -/* - * Per-txg object lists. - */ - -#define TXG_CLEAN(txg) ((txg) - 1) - -extern void txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset); -extern void txg_list_destroy(txg_list_t *tl); -extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); -extern boolean_t txg_all_lists_empty(txg_list_t *tl); -extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg); -extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); -extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); -extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); -extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg); -extern void *txg_list_head(txg_list_t *tl, uint64_t txg); -extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_TXG_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_TXG_IMPL_H -#define _SYS_TXG_IMPL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The tx_cpu structure is a per-cpu structure that is used to track - * the number of active transaction holds (tc_count). As transactions - * are assigned into a transaction group the appropriate tc_count is - * incremented to indicate that there are pending changes that have yet - * to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement - * the tc_count. 
A transaction group is not considered quiesced until all - * tx_cpu structures have reached a tc_count of zero. - * - * This structure is a per-cpu structure by design. Updates to this structure - * are frequent and concurrent. Having a single structure would result in - * heavy lock contention so a per-cpu design was implemented. With the fanned - * out mutex design, consumers only need to lock the mutex associated with - * thread's cpu. - * - * The tx_cpu contains two locks, the tc_lock and tc_open_lock. - * The tc_lock is used to protect all members of the tx_cpu structure with - * the exception of the tc_open_lock. This lock should only be held for a - * short period of time, typically when updating the value of tc_count. - * - * The tc_open_lock protects the tx_open_txg member of the tx_state structure. - * This lock is used to ensure that transactions are only assigned into - * the current open transaction group. In order to move the current open - * transaction group to the quiesce phase, the txg_quiesce thread must - * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. - * The tc_open_lock is held until the transaction is assigned into the - * transaction group. Typically, this is a short operation but if throttling - * is occuring it may be held for longer periods of time. - */ -struct tx_cpu { - kmutex_t tc_open_lock; /* protects tx_open_txg */ - kmutex_t tc_lock; /* protects the rest of this struct */ - kcondvar_t tc_cv[TXG_SIZE]; - uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ - list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ - char tc_pad[8]; /* pad to fill 3 cache lines */ -}; - -/* - * The tx_state structure maintains the state information about the different - * stages of the pool's transcation groups. A per pool tx_state structure - * is used to track this information. The tx_state structure also points to - * an array of tx_cpu structures (described above). Although the tx_sync_lock - * is used to protect the members of this structure, it is not used to - * protect the tx_open_txg. Instead a special lock in the tx_cpu structure - * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock. - * Any thread wishing to update tx_open_txg must grab the tc_open_lock on - * every cpu (see txg_quiesce()). 
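 *
 * A rough sketch of that handoff, simplified from what txg_quiesce()
 * does (the real code lives in txg.c; this is illustrative only):
 *
 *	int c;
 *	uint64_t txg = tx->tx_open_txg;
 *	int g = txg & TXG_MASK;
 *
 *	for (c = 0; c < max_ncpus; c++)
 *		mutex_enter(&tx->tx_cpu[c].tc_open_lock);
 *	tx->tx_open_txg++;	... the next txg is now the open one ...
 *	for (c = 0; c < max_ncpus; c++)
 *		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
 *
 *	... then wait for every hold on the old txg to be released ...
 *	for (c = 0; c < max_ncpus; c++) {
 *		tx_cpu_t *tc = &tx->tx_cpu[c];
 *		mutex_enter(&tc->tc_lock);
 *		while (tc->tc_count[g] != 0)
 *			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
 *		mutex_exit(&tc->tc_lock);
 *	}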
- */ -typedef struct tx_state { - tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ - kmutex_t tx_sync_lock; /* protects the rest of this struct */ - - uint64_t tx_open_txg; /* currently open txg id */ - uint64_t tx_quiescing_txg; /* currently quiescing txg id */ - uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ - uint64_t tx_syncing_txg; /* currently syncing txg id */ - uint64_t tx_synced_txg; /* last synced txg id */ - - hrtime_t tx_open_time; /* start time of tx_open_txg */ - - uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ - uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ - - kcondvar_t tx_sync_more_cv; - kcondvar_t tx_sync_done_cv; - kcondvar_t tx_quiesce_more_cv; - kcondvar_t tx_quiesce_done_cv; - kcondvar_t tx_timeout_cv; - kcondvar_t tx_exit_cv; /* wait for all threads to exit */ - - uint8_t tx_threads; /* number of threads */ - uint8_t tx_exiting; /* set when we're exiting */ - - kthread_t *tx_sync_thread; - kthread_t *tx_quiesce_thread; - - taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ -} tx_state_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_TXG_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2014 by Delphix. All rights reserved. - */ - -#ifndef _SYS_UBERBLOCK_H -#define _SYS_UBERBLOCK_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct uberblock uberblock_t; - -extern int uberblock_verify(uberblock_t *); -extern boolean_t uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, - uint64_t mmp_delay); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_UBERBLOCK_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_UBERBLOCK_IMPL_H -#define _SYS_UBERBLOCK_IMPL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The uberblock version is incremented whenever an incompatible on-disk - * format change is made to the SPA, DMU, or ZAP. - * - * Note: the first two fields should never be moved. When a storage pool - * is opened, the uberblock must be read off the disk before the version - * can be checked. If the ub_version field is moved, we may not detect - * version mismatch. If the ub_magic field is moved, applications that - * expect the magic number in the first word won't work. - */ -#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ -#define UBERBLOCK_SHIFT 10 /* up to 1K */ -#define MMP_MAGIC 0xa11cea11 /* all-see-all */ - -#define MMP_INTERVAL_VALID_BIT 0x01 -#define MMP_SEQ_VALID_BIT 0x02 -#define MMP_FAIL_INT_VALID_BIT 0x04 - -#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ - ubp->ub_mmp_magic == MMP_MAGIC) -#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ - MMP_INTERVAL_VALID_BIT)) -#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ - MMP_SEQ_VALID_BIT)) -#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ - MMP_FAIL_INT_VALID_BIT)) - -#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ - >> 8) -#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ - >> 32) -#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ - >> 48) - -#define MMP_INTERVAL_SET(write) \ - (((uint64_t)(write & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT) - -#define MMP_SEQ_SET(seq) \ - (((uint64_t)(seq & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT) - -#define MMP_FAIL_INT_SET(fail) \ - (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT) - -struct uberblock { - uint64_t ub_magic; /* UBERBLOCK_MAGIC */ - uint64_t ub_version; /* SPA_VERSION */ - uint64_t ub_txg; /* txg of last sync */ - uint64_t ub_guid_sum; /* sum of all vdev guids */ - uint64_t ub_timestamp; /* UTC time of last sync */ - blkptr_t ub_rootbp; /* MOS objset_phys_t */ - - /* highest SPA_VERSION supported by software that wrote this txg */ - uint64_t ub_software_version; - - /* Maybe missing in uberblocks we read, but always written */ - uint64_t ub_mmp_magic; - /* - * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off. - * Otherwise, nanosec since last MMP write. - */ - uint64_t ub_mmp_delay; - - /* - * The ub_mmp_config contains the multihost write interval, multihost - * fail intervals, sequence number for sub-second granularity, and - * valid bit mask. 
This layout is as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * This allows a write_interval of (2^24/1000)s, over 4.5 hours - * - * VALID Bits: - * - 0x01 - Write Interval (ms) - * - 0x02 - Sequence number exists - * - 0x04 - Fail Intervals - * - 0xf8 - Reserved - */ - uint64_t ub_mmp_config; - - /* - * ub_checkpoint_txg indicates two things about the current uberblock: - * - * 1] If it is not zero then this uberblock is a checkpoint. If it is - * zero, then this uberblock is not a checkpoint. - * - * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is - * the ub_txg that the uberblock had at the time we moved it to - * the MOS config. - * - * The field is set when we checkpoint the uberblock and continues to - * hold that value even after we've rewound (unlike the ub_txg that - * is reset to a higher value). - * - * Besides checks used to determine whether we are reopening the - * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], - * the value of the field is used to determine which ZIL blocks have - * been allocated according to the ms_sm when we are rewinding to a - * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then - * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. - */ - uint64_t ub_checkpoint_txg; -}; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_UBERBLOCK_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_UNIQUE_H -#define _SYS_UNIQUE_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* The number of significant bits in each unique value. */ -#define UNIQUE_BITS 56 - -void unique_init(void); -void unique_fini(void); - -/* - * Return a new unique value (which will not be uniquified against until - * it is unique_insert()-ed). - */ -uint64_t unique_create(void); - -/* Return a unique value, which equals the one passed in if possible. */ -uint64_t unique_insert(uint64_t value); - -/* Indicate that this value no longer needs to be uniquified against. 
*/ -void unique_remove(uint64_t value); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_UNIQUE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ /dev/null @@ -1,194 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. - * Copyright (c) 2017, Intel Corporation. - */ - -#ifndef _SYS_VDEV_H -#define _SYS_VDEV_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum vdev_dtl_type { - DTL_MISSING, /* 0% replication: no copies of the data */ - DTL_PARTIAL, /* less than 100% replication: some copies missing */ - DTL_SCRUB, /* unable to fully repair during scrub/resilver */ - DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ - DTL_TYPES -} vdev_dtl_type_t; - -extern boolean_t zfs_nocacheflush; -extern boolean_t zfs_trim_enabled; - -extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); -extern void vdev_dbgmsg_print_tree(vdev_t *, int); -extern int vdev_open(vdev_t *); -extern void vdev_open_children(vdev_t *); -extern boolean_t vdev_uses_zvols(vdev_t *); -extern int vdev_validate(vdev_t *); -extern int vdev_copy_path_strict(vdev_t *, vdev_t *); -extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); -extern void vdev_close(vdev_t *); -extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); -extern void vdev_reopen(vdev_t *); -extern int vdev_validate_aux(vdev_t *vd); -extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); -extern boolean_t vdev_is_concrete(vdev_t *vd); -extern boolean_t vdev_is_bootable(vdev_t *vd); -extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); -extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); -extern int vdev_count_leaves(spa_t *spa); -extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, - uint64_t txg, uint64_t size); -extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, - uint64_t txg, uint64_t size); -extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); -extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); -extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, - int scrub_done); -extern boolean_t vdev_dtl_required(vdev_t *vd); -extern boolean_t vdev_resilver_needed(vdev_t *vd, - uint64_t *minp, uint64_t *maxp); -extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, - dmu_tx_t *tx); -extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx); -extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t 
*tx); -extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx); -extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, - uint64_t size); -extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev, - uint64_t offset, uint64_t size, dmu_tx_t *tx); - -extern void vdev_hold(vdev_t *); -extern void vdev_rele(vdev_t *); - -extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); -extern void vdev_metaslab_fini(vdev_t *vd); -extern void vdev_metaslab_set_size(vdev_t *); -extern void vdev_ashift_optimize(vdev_t *); -extern void vdev_expand(vdev_t *vd, uint64_t txg); -extern void vdev_split(vdev_t *vd); -extern void vdev_deadman(vdev_t *vd); - -extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); -extern void vdev_clear_stats(vdev_t *vd); -extern void vdev_stat_update(zio_t *zio, uint64_t psize); -extern void vdev_scan_stat_init(vdev_t *vd); -extern void vdev_propagate_state(vdev_t *vd); -extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, - vdev_aux_t aux); -extern boolean_t vdev_children_are_offline(vdev_t *vd); - -extern void vdev_space_update(vdev_t *vd, - int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); - -extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); - -extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); - -extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); -extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); -extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, - vdev_state_t *); -extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); -extern void vdev_clear(spa_t *spa, vdev_t *vd); - -extern boolean_t vdev_is_dead(vdev_t *vd); -extern boolean_t vdev_readable(vdev_t *vd); -extern boolean_t vdev_writeable(vdev_t *vd); -extern boolean_t vdev_allocatable(vdev_t *vd); -extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); -extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); - -extern void vdev_cache_init(vdev_t *vd); -extern void vdev_cache_fini(vdev_t *vd); -extern boolean_t vdev_cache_read(zio_t *zio); -extern void vdev_cache_write(zio_t *zio); -extern void vdev_cache_purge(vdev_t *vd); - -extern void vdev_queue_init(vdev_t *vd); -extern void vdev_queue_fini(vdev_t *vd); -extern zio_t *vdev_queue_io(zio_t *zio); -extern void vdev_queue_io_done(zio_t *zio); -extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); -extern int vdev_queue_length(vdev_t *vd); -extern uint64_t vdev_queue_lastoffset(vdev_t *vd); -extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio); - -extern void vdev_config_dirty(vdev_t *vd); -extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); - -extern void vdev_state_dirty(vdev_t *vd); -extern void vdev_state_clean(vdev_t *vd); - -typedef enum vdev_config_flag { - VDEV_CONFIG_SPARE = 1 << 0, - VDEV_CONFIG_L2CACHE = 1 << 1, - VDEV_CONFIG_REMOVING = 1 << 2, - VDEV_CONFIG_MOS = 1 << 3, - VDEV_CONFIG_MISSING = 1 << 4 -} vdev_config_flag_t; - -extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); -extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, vdev_config_flag_t flags); - -/* - * Label routines - */ -struct uberblock; -extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); -extern int vdev_label_number(uint64_t psise, uint64_t offset); -extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg); -extern void vdev_uberblock_load(vdev_t *, 
struct uberblock *, nvlist_t **); -extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t - offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); - -typedef enum { - VDEV_LABEL_CREATE, /* create/add a new device */ - VDEV_LABEL_REPLACE, /* replace an existing device */ - VDEV_LABEL_SPARE, /* add a new hot spare */ - VDEV_LABEL_REMOVE, /* remove an existing device */ - VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ - VDEV_LABEL_SPLIT /* generating new label for split-off dev */ -} vdev_labeltype_t; - -extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); - -extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * Copyright (c) 2013 Joyent, Inc. All rights reserved. - * Copyright 2012 Nexenta Systems, Inc. All rights reserved. - */ - -#ifndef _SYS_VDEV_DISK_H -#define _SYS_VDEV_DISK_H - -#include -#ifdef _KERNEL -#include -#include -#include -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL -typedef struct vdev_disk { - ddi_devid_t vd_devid; - char *vd_minor; - ldi_handle_t vd_lh; - list_t vd_ldi_cbs; - boolean_t vd_ldi_offline; -} vdev_disk_t; -#endif - -extern int vdev_disk_physio(vdev_t *, - caddr_t, size_t, uint64_t, int, boolean_t); - -/* - * Since vdev_disk.c is not compiled into libzpool, this function should only be - * defined in the zfs kernel module. - */ -#ifdef _KERNEL -extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); -#endif -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_DISK_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_VDEV_FILE_H -#define _SYS_VDEV_FILE_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct vdev_file { - vnode_t *vf_vnode; -} vdev_file_t; - -extern void vdev_file_init(void); -extern void vdev_file_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_FILE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ /dev/null @@ -1,551 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2017, Intel Corporation. - */ - -#ifndef _SYS_VDEV_IMPL_H -#define _SYS_VDEV_IMPL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Virtual device descriptors. - * - * All storage pool operations go through the virtual device framework, - * which provides data replication and I/O scheduling. - */ - -/* - * Forward declarations that lots of things need. 
- */ -typedef struct vdev_queue vdev_queue_t; -typedef struct vdev_cache vdev_cache_t; -typedef struct vdev_cache_entry vdev_cache_entry_t; -struct abd; - -extern int zfs_vdev_queue_depth_pct; -extern int zfs_vdev_def_queue_depth; -extern uint32_t zfs_vdev_async_write_max_active; - -/* - * Virtual device operations - */ -typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, - uint64_t *logical_ashift, uint64_t *physical_ashift); -typedef void vdev_close_func_t(vdev_t *vd); -typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); -typedef void vdev_io_start_func_t(zio_t *zio); -typedef void vdev_io_done_func_t(zio_t *zio); -typedef void vdev_state_change_func_t(vdev_t *vd, int, int); -typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); -typedef void vdev_hold_func_t(vdev_t *vd); -typedef void vdev_rele_func_t(vdev_t *vd); - -typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, - uint64_t offset, uint64_t size, void *arg); -typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, - vdev_remap_cb_t callback, void *arg); -/* - * Given a target vdev, translates the logical range "in" to the physical - * range "res" - */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, - range_seg_t *res); - -typedef struct vdev_ops { - vdev_open_func_t *vdev_op_open; - vdev_close_func_t *vdev_op_close; - vdev_asize_func_t *vdev_op_asize; - vdev_io_start_func_t *vdev_op_io_start; - vdev_io_done_func_t *vdev_op_io_done; - vdev_state_change_func_t *vdev_op_state_change; - vdev_need_resilver_func_t *vdev_op_need_resilver; - vdev_hold_func_t *vdev_op_hold; - vdev_rele_func_t *vdev_op_rele; - vdev_remap_func_t *vdev_op_remap; - /* - * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. - * Used when initializing vdevs. Isn't used by leaf ops. - */ - vdev_xlation_func_t *vdev_op_xlate; - char vdev_op_type[16]; - boolean_t vdev_op_leaf; -} vdev_ops_t; - -/* - * Virtual device properties - */ -struct vdev_cache_entry { - struct abd *ve_abd; - uint64_t ve_offset; - uint64_t ve_lastused; - avl_node_t ve_offset_node; - avl_node_t ve_lastused_node; - uint32_t ve_hits; - uint16_t ve_missed_update; - zio_t *ve_fill_io; -}; - -struct vdev_cache { - avl_tree_t vc_offset_tree; - avl_tree_t vc_lastused_tree; - kmutex_t vc_lock; -}; - -typedef struct vdev_queue_class { - uint32_t vqc_active; - - /* - * Sorted by offset or timestamp, depending on if the queue is - * LBA-ordered vs FIFO. - */ - avl_tree_t vqc_queued_tree; -} vdev_queue_class_t; - -struct vdev_queue { - vdev_t *vq_vdev; - vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; - avl_tree_t vq_active_tree; - avl_tree_t vq_read_offset_tree; - avl_tree_t vq_write_offset_tree; - uint64_t vq_last_offset; - hrtime_t vq_io_complete_ts; /* time last i/o completed */ - kmutex_t vq_lock; - uint64_t vq_lastoffset; -}; - -typedef enum vdev_alloc_bias { - VDEV_BIAS_NONE, - VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ - VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ - VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ -} vdev_alloc_bias_t; - - -/* - * On-disk indirect vdev state. - * - * An indirect vdev is described exclusively in the MOS config of a pool. - * The config for an indirect vdev includes several fields, which are - * accessed in memory by a vdev_indirect_config_t. - */ -typedef struct vdev_indirect_config { - /* - * Object (in MOS) which contains the indirect mapping. 
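Illustrative sketch (not part of the patch): given the vdev_ops_t layout above, a leaf vdev type registers a table of this shape. The example_* functions are hypothetical stand-ins; vdev_default_asize and vdev_default_xlate are real declarations that appear further down in this same header.

static vdev_ops_t vdev_example_ops = {
	.vdev_op_open = example_open,		/* hypothetical open/close */
	.vdev_op_close = example_close,
	.vdev_op_asize = vdev_default_asize,	/* declared later in this header */
	.vdev_op_io_start = example_io_start,	/* hypothetical I/O entry points */
	.vdev_op_io_done = example_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,	/* declared later in this header */
	.vdev_op_type = "example",		/* 16-byte type name */
	.vdev_op_leaf = B_TRUE			/* leaf (data-bearing) vdev */
};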
This object - * contains an array of vdev_indirect_mapping_entry_phys_t ordered by - * vimep_src. The bonus buffer for this object is a - * vdev_indirect_mapping_phys_t. This object is allocated when a vdev - * removal is initiated. - * - * Note that this object can be empty if none of the data on the vdev - * has been copied yet. - */ - uint64_t vic_mapping_object; - - /* - * Object (in MOS) which contains the birth times for the mapping - * entries. This object contains an array of - * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus - * buffer for this object is a vdev_indirect_birth_phys_t. This object - * is allocated when a vdev removal is initiated. - * - * Note that this object can be empty if none of the vdev has yet been - * copied. - */ - uint64_t vic_births_object; - - /* - * This is the vdev ID which was removed previous to this vdev, or - * UINT64_MAX if there are no previously removed vdevs. - */ - uint64_t vic_prev_indirect_vdev; -} vdev_indirect_config_t; - -/* - * Virtual device descriptor - */ -struct vdev { - /* - * Common to all vdev types. - */ - uint64_t vdev_id; /* child number in vdev parent */ - uint64_t vdev_guid; /* unique ID for this vdev */ - uint64_t vdev_guid_sum; /* self guid + all child guids */ - uint64_t vdev_orig_guid; /* orig. guid prior to remove */ - uint64_t vdev_asize; /* allocatable device capacity */ - uint64_t vdev_min_asize; /* min acceptable asize */ - uint64_t vdev_max_asize; /* max acceptable asize */ - uint64_t vdev_ashift; /* block alignment shift */ - /* - * Logical block alignment shift - * - * The smallest sized/aligned I/O supported by the device. - */ - uint64_t vdev_logical_ashift; - /* - * Physical block alignment shift - * - * The device supports logical I/Os with vdev_logical_ashift - * size/alignment, but optimum performance will be achieved by - * aligning/sizing requests to vdev_physical_ashift. Smaller - * requests may be inflated or incur device level read-modify-write - * operations. - * - * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). - */ - uint64_t vdev_physical_ashift; - uint64_t vdev_state; /* see VDEV_STATE_* #defines */ - uint64_t vdev_prevstate; /* used when reopening a vdev */ - vdev_ops_t *vdev_ops; /* vdev operations */ - spa_t *vdev_spa; /* spa for this vdev */ - void *vdev_tsd; /* type-specific data */ - vnode_t *vdev_name_vp; /* vnode for pathname */ - vnode_t *vdev_devid_vp; /* vnode for devid */ - vdev_t *vdev_top; /* top-level vdev */ - vdev_t *vdev_parent; /* parent vdev */ - vdev_t **vdev_child; /* array of children */ - uint64_t vdev_children; /* number of children */ - vdev_stat_t vdev_stat; /* virtual device statistics */ - boolean_t vdev_expanding; /* expand the vdev? */ - boolean_t vdev_reopening; /* reopen in progress? */ - boolean_t vdev_nonrot; /* true if solid state */ - int vdev_open_error; /* error on last open */ - kthread_t *vdev_open_thread; /* thread opening children */ - uint64_t vdev_crtxg; /* txg when top-level was added */ - - /* - * Top-level vdev state. - */ - uint64_t vdev_ms_array; /* metaslab array object */ - uint64_t vdev_ms_shift; /* metaslab size shift */ - uint64_t vdev_ms_count; /* number of metaslabs */ - metaslab_group_t *vdev_mg; /* metaslab group */ - metaslab_t **vdev_ms; /* metaslab array */ - txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ - txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ - txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ - boolean_t vdev_remove_wanted; /* async remove wanted? 
*/ - boolean_t vdev_probe_wanted; /* async probe wanted? */ - list_node_t vdev_config_dirty_node; /* config dirty list */ - list_node_t vdev_state_dirty_node; /* state dirty list */ - uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ - uint64_t vdev_islog; /* is an intent log device */ - uint64_t vdev_removing; /* device is being removed? */ - boolean_t vdev_ishole; /* is a hole in the namespace */ - uint64_t vdev_top_zap; - vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ - - /* pool checkpoint related */ - space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ - - boolean_t vdev_initialize_exit_wanted; - vdev_initializing_state_t vdev_initialize_state; - kthread_t *vdev_initialize_thread; - /* Protects vdev_initialize_thread and vdev_initialize_state. */ - kmutex_t vdev_initialize_lock; - kcondvar_t vdev_initialize_cv; - uint64_t vdev_initialize_offset[TXG_SIZE]; - uint64_t vdev_initialize_last_offset; - range_tree_t *vdev_initialize_tree; /* valid while initializing */ - uint64_t vdev_initialize_bytes_est; - uint64_t vdev_initialize_bytes_done; - time_t vdev_initialize_action_time; /* start and end time */ - - /* for limiting outstanding I/Os */ - kmutex_t vdev_initialize_io_lock; - kcondvar_t vdev_initialize_io_cv; - uint64_t vdev_initialize_inflight; - - /* - * Values stored in the config for an indirect or removing vdev. - */ - vdev_indirect_config_t vdev_indirect_config; - - /* - * The vdev_indirect_rwlock protects the vdev_indirect_mapping - * pointer from changing on indirect vdevs (when it is condensed). - * Note that removing (not yet indirect) vdevs have different - * access patterns (the mapping is not accessed from open context, - * e.g. from zio_read) and locking strategy (e.g. svr_lock). - */ - krwlock_t vdev_indirect_rwlock; - vdev_indirect_mapping_t *vdev_indirect_mapping; - vdev_indirect_births_t *vdev_indirect_births; - - /* - * In memory data structures used to manage the obsolete sm, for - * indirect or removing vdevs. - * - * The vdev_obsolete_segments is the in-core record of the segments - * that are no longer referenced anywhere in the pool (due to - * being freed or remapped and not referenced by any snapshots). - * During a sync, segments are added to vdev_obsolete_segments - * via vdev_indirect_mark_obsolete(); at the end of each sync - * pass, this is appended to vdev_obsolete_sm via - * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock - * protects against concurrent modifications of vdev_obsolete_segments - * from multiple zio threads. - */ - kmutex_t vdev_obsolete_lock; - range_tree_t *vdev_obsolete_segments; - space_map_t *vdev_obsolete_sm; - - /* - * Protects the vdev_scan_io_queue field itself as well as the - * structure's contents (when present). - */ - kmutex_t vdev_scan_io_queue_lock; - struct dsl_scan_io_queue *vdev_scan_io_queue; - - /* - * Leaf vdev state. 
- */ - range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ - space_map_t *vdev_dtl_sm; /* dirty time log space map */ - txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ - uint64_t vdev_dtl_object; /* DTL object */ - uint64_t vdev_psize; /* physical device capacity */ - uint64_t vdev_wholedisk; /* true if this is a whole disk */ - uint64_t vdev_offline; /* persistent offline state */ - uint64_t vdev_faulted; /* persistent faulted state */ - uint64_t vdev_degraded; /* persistent degraded state */ - uint64_t vdev_removed; /* persistent removed state */ - uint64_t vdev_resilver_txg; /* persistent resilvering state */ - uint64_t vdev_nparity; /* number of parity devices for raidz */ - char *vdev_path; /* vdev path (if any) */ - char *vdev_devid; /* vdev devid (if any) */ - char *vdev_physpath; /* vdev device path (if any) */ - char *vdev_fru; /* physical FRU location */ - uint64_t vdev_not_present; /* not present during import */ - uint64_t vdev_unspare; /* unspare when resilvering done */ - boolean_t vdev_nowritecache; /* true if flushwritecache failed */ - boolean_t vdev_notrim; /* true if trim failed */ - boolean_t vdev_checkremove; /* temporary online test */ - boolean_t vdev_forcefault; /* force online fault */ - boolean_t vdev_splitting; /* split or repair in progress */ - boolean_t vdev_delayed_close; /* delayed device close? */ - boolean_t vdev_tmpoffline; /* device taken offline temporarily? */ - boolean_t vdev_detached; /* device detached? */ - boolean_t vdev_cant_read; /* vdev is failing all reads */ - boolean_t vdev_cant_write; /* vdev is failing all writes */ - boolean_t vdev_isspare; /* was a hot spare */ - boolean_t vdev_isl2cache; /* was a l2cache device */ - vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ - vdev_cache_t vdev_cache; /* physical block cache */ - spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ - zio_t *vdev_probe_zio; /* root of current probe */ - vdev_aux_t vdev_label_aux; /* on-disk aux state */ - struct trim_map *vdev_trimmap; /* map on outstanding trims */ - uint64_t vdev_leaf_zap; - hrtime_t vdev_mmp_pending; /* 0 if write finished */ - uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ - list_node_t vdev_leaf_node; /* leaf vdev list */ - - /* - * For DTrace to work in userland (libzpool) context, these fields must - * remain at the end of the structure. DTrace will use the kernel's - * CTF definition for 'struct vdev', and since the size of a kmutex_t is - * larger in userland, the offsets for the rest of the fields would be - * incorrect. - */ - kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ - kmutex_t vdev_stat_lock; /* vdev_stat */ - kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ -}; - -#define VDEV_RAIDZ_MAXPARITY 3 - -#define VDEV_PAD_SIZE (8 << 10) -/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ -#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 -#define VDEV_PHYS_SIZE (112 << 10) -#define VDEV_UBERBLOCK_RING (128 << 10) - -/* - * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock - * ring when MMP is enabled. - */ -#define MMP_BLOCKS_PER_LABEL 1 - -/* The largest uberblock we support is 8k. 
*/ -#define MAX_UBERBLOCK_SHIFT (13) -#define VDEV_UBERBLOCK_SHIFT(vd) \ - MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ - MAX_UBERBLOCK_SHIFT) -#define VDEV_UBERBLOCK_COUNT(vd) \ - (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) -#define VDEV_UBERBLOCK_OFFSET(vd, n) \ - offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) -#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) - -typedef struct vdev_phys { - char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; - zio_eck_t vp_zbt; -} vdev_phys_t; - -typedef struct vdev_label { - char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ - char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ - vdev_phys_t vl_vdev_phys; /* 112K */ - char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ -} vdev_label_t; /* 256K total */ - -/* - * vdev_dirty() flags - */ -#define VDD_METASLAB 0x01 -#define VDD_DTL 0x02 - -/* Offset of embedded boot loader region on each label */ -#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) -/* - * Size of embedded boot loader region on each label. - * The total size of the first two labels plus the boot area is 4MB. - */ -#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ - -/* - * Size of label regions at the start and end of each leaf device. - */ -#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) -#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) -#define VDEV_LABELS 4 -#define VDEV_BEST_LABEL VDEV_LABELS - -#define VDEV_ALLOC_LOAD 0 -#define VDEV_ALLOC_ADD 1 -#define VDEV_ALLOC_SPARE 2 -#define VDEV_ALLOC_L2CACHE 3 -#define VDEV_ALLOC_ROOTPOOL 4 -#define VDEV_ALLOC_SPLIT 5 -#define VDEV_ALLOC_ATTACH 6 - -/* - * Allocate or free a vdev - */ -extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, - vdev_ops_t *ops); -extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, - vdev_t *parent, uint_t id, int alloctype); -extern void vdev_free(vdev_t *vd); - -/* - * Add or remove children and parents - */ -extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); -extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); -extern void vdev_compact_children(vdev_t *pvd); -extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); -extern void vdev_remove_parent(vdev_t *cvd); - -/* - * vdev sync load and sync - */ -extern boolean_t vdev_log_state_valid(vdev_t *vd); -extern int vdev_load(vdev_t *vd); -extern int vdev_dtl_load(vdev_t *vd); -extern void vdev_sync(vdev_t *vd, uint64_t txg); -extern void vdev_sync_done(vdev_t *vd, uint64_t txg); -extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); -extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); - -/* - * Available vdev types. - */ -extern vdev_ops_t vdev_root_ops; -extern vdev_ops_t vdev_mirror_ops; -extern vdev_ops_t vdev_replacing_ops; -extern vdev_ops_t vdev_raidz_ops; -#ifdef _KERNEL -extern vdev_ops_t vdev_geom_ops; -#else -extern vdev_ops_t vdev_disk_ops; -#endif -extern vdev_ops_t vdev_file_ops; -extern vdev_ops_t vdev_missing_ops; -extern vdev_ops_t vdev_hole_ops; -extern vdev_ops_t vdev_spare_ops; -extern vdev_ops_t vdev_indirect_ops; - -/* - * Common size functions - */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, - range_seg_t *out); -extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); -extern uint64_t vdev_get_min_asize(vdev_t *vd); -extern void vdev_set_min_asize(vdev_t *vd); - -/* - * Global variables - */ -extern int vdev_standard_sm_blksz; -/* zdb uses this tunable, so it must be declared here to make lint happy. 
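A minimal standalone sketch (not part of the patch) of the label layout implied by the constants removed above: two 8 KiB pad areas, the 112 KiB vdev_phys nvlist and the 128 KiB uberblock ring make up the 256 KiB vdev_label_t, and the two front labels plus the 3.5 MiB boot region give the 4 MiB VDEV_LABEL_START_SIZE. The value 10 for UBERBLOCK_SHIFT is an assumption here; it is defined elsewhere in the tree, not in this header.

#include <stdio.h>
#include <stdint.h>

#define VDEV_PAD_SIZE		(8 << 10)
#define VDEV_PHYS_SIZE		(112 << 10)
#define VDEV_UBERBLOCK_RING	(128 << 10)
#define VDEV_BOOT_SIZE		(7ULL << 19)	/* 3.5 MiB */
#define MAX_UBERBLOCK_SHIFT	13
#define ASSUMED_UBERBLOCK_SHIFT	10		/* assumption, defined elsewhere */

#define MAX(a, b)	((a) > (b) ? (a) : (b))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t label = 2 * VDEV_PAD_SIZE + VDEV_PHYS_SIZE +
	    VDEV_UBERBLOCK_RING;			/* sizeof (vdev_label_t) */
	uint64_t front = 2 * label + VDEV_BOOT_SIZE;	/* VDEV_LABEL_START_SIZE */
	uint64_t ashift = 12;				/* example: 4 KiB sectors */
	uint64_t ub_shift = MIN(MAX(ashift, ASSUMED_UBERBLOCK_SHIFT),
	    MAX_UBERBLOCK_SHIFT);			/* VDEV_UBERBLOCK_SHIFT */

	printf("vdev label: %llu KiB\n", (unsigned long long)(label >> 10));
	printf("front labels + boot area: %llu KiB\n",
	    (unsigned long long)(front >> 10));
	printf("uberblock slots per label at ashift=12: %llu\n",
	    (unsigned long long)(VDEV_UBERBLOCK_RING >> ub_shift));
	return (0);
}

This prints 256 KiB, 4096 KiB and 32 slots, matching the "256K total" and "4MB" comments in the deleted header.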
*/ -extern int zfs_vdev_cache_size; -extern uint_t zfs_geom_probe_vdev_key; - -/* - * Functions from vdev_indirect.c - */ -extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); -extern boolean_t vdev_indirect_should_condense(vdev_t *vd); -extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); -extern int vdev_obsolete_sm_object(vdev_t *vd); -extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd); - -#ifdef illumos -/* - * Other miscellaneous functions - */ -int vdev_checkpoint_sm_object(vdev_t *vd); - -/* - * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. - */ -typedef struct vdev_buf { - buf_t vb_buf; /* buffer that describes the io */ - zio_t *vb_io; /* pointer back to the original zio_t */ -} vdev_buf_t; -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_births.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H -#define _SYS_VDEV_INDIRECT_BIRTHS_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct vdev_indirect_birth_entry_phys { - uint64_t vibe_offset; - uint64_t vibe_phys_birth_txg; -} vdev_indirect_birth_entry_phys_t; - -typedef struct vdev_indirect_birth_phys { - uint64_t vib_count; /* count of v_i_b_entry_phys_t's */ -} vdev_indirect_birth_phys_t; - -typedef struct vdev_indirect_births { - uint64_t vib_object; - - /* - * Each entry indicates that everything up to but not including - * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted - * by increasing phys_birth, and also by increasing offset. See - * vdev_indirect_births_physbirth for usage. 
- */ - vdev_indirect_birth_entry_phys_t *vib_entries; - - objset_t *vib_objset; - - dmu_buf_t *vib_dbuf; - vdev_indirect_birth_phys_t *vib_phys; -} vdev_indirect_births_t; - -extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os, - uint64_t object); -extern void vdev_indirect_births_close(vdev_indirect_births_t *vib); -extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib); -extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx); -extern void vdev_indirect_births_free(objset_t *os, uint64_t object, - dmu_tx_t *tx); - -extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib); -extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib); - -extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, - uint64_t offset, uint64_t txg, dmu_tx_t *tx); - -extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, - uint64_t offset, uint64_t asize); - -extern uint64_t vdev_indirect_births_last_entry_txg( - vdev_indirect_births_t *vib); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_indirect_mapping.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2015 by Delphix. All rights reserved. - */ - -#ifndef _SYS_VDEV_INDIRECT_MAPPING_H -#define _SYS_VDEV_INDIRECT_MAPPING_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct vdev_indirect_mapping_entry_phys { - /* - * Decode with DVA_MAPPING_* macros. - * Contains: - * the source offset (low 63 bits) - * the one-bit "mark", used for garbage collection (by zdb) - */ - uint64_t vimep_src; - - /* - * Note: the DVA's asize is 24 bits, and can thus store ranges - * up to 8GB. - */ - dva_t vimep_dst; -} vdev_indirect_mapping_entry_phys_t; - -#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \ - BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0) -#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \ - BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x) - -typedef struct vdev_indirect_mapping_entry { - vdev_indirect_mapping_entry_phys_t vime_mapping; - uint32_t vime_obsolete_count; - list_node_t vime_node; -} vdev_indirect_mapping_entry_t; - -/* - * This is stored in the bonus buffer of the mapping object, see comment of - * vdev_indirect_config for more details. - */ -typedef struct vdev_indirect_mapping_phys { - uint64_t vimp_max_offset; - uint64_t vimp_bytes_mapped; - uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */ - - /* - * For each entry in the mapping object, this object contains an - * entry representing the number of bytes of that mapping entry - * that were no longer in use by the pool at the time this indirect - * vdev was last condensed. 
- */ - uint64_t vimp_counts_object; -} vdev_indirect_mapping_phys_t; - -#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t)) - -typedef struct vdev_indirect_mapping { - uint64_t vim_object; - boolean_t vim_havecounts; - - /* - * An ordered array of all mapping entries, sorted by source offset. - * Note that vim_entries is needed during a removal (and contains - * mappings that have been synced to disk so far) to handle frees - * from the removing device. - */ - vdev_indirect_mapping_entry_phys_t *vim_entries; - - objset_t *vim_objset; - - dmu_buf_t *vim_dbuf; - vdev_indirect_mapping_phys_t *vim_phys; -} vdev_indirect_mapping_t; - -extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os, - uint64_t object); -extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx); -extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj, - dmu_tx_t *tx); - -extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_bytes_mapped( - vdev_indirect_mapping_t *vim); -extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim); - -/* - * Writes the given list of vdev_indirect_mapping_entry_t to the mapping - * then updates internal state. - */ -extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, - list_t *vime_list, dmu_tx_t *tx); - -extern vdev_indirect_mapping_entry_phys_t * - vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, - uint64_t offset); - -extern vdev_indirect_mapping_entry_phys_t * - vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, - uint64_t offset); - -extern uint32_t *vdev_indirect_mapping_load_obsolete_counts( - vdev_indirect_mapping_t *vim); -extern void vdev_indirect_mapping_load_obsolete_spacemap( - vdev_indirect_mapping_t *vim, - uint32_t *counts, space_map_t *obsolete_space_sm); -extern void vdev_indirect_mapping_increment_obsolete_count( - vdev_indirect_mapping_t *vim, - uint64_t offset, uint64_t asize, uint32_t *counts); -extern void vdev_indirect_mapping_free_obsolete_counts( - vdev_indirect_mapping_t *vim, uint32_t *counts); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_VDEV_INITIALIZE_H -#define _SYS_VDEV_INITIALIZE_H - -#ifdef __cplusplus -extern "C" { -#endif - -extern void vdev_initialize(vdev_t *vd); -extern void vdev_initialize_stop(vdev_t *vd, - vdev_initializing_state_t tgt_state); -extern void vdev_initialize_stop_all(vdev_t *vd, - vdev_initializing_state_t tgt_state); -extern void vdev_initialize_restart(vdev_t *vd); -extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, - range_seg_t *physical_rs); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_INITIALIZE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - */ - -#ifndef _SYS_VDEV_RAIDZ_H -#define _SYS_VDEV_RAIDZ_H - -#include -#ifdef illumos -#include -#ifdef _KERNEL -#include -#include -#include -#endif -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL -extern int vdev_raidz_physio(vdev_t *, - caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t); -#endif -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_RAIDZ_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2014, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_VDEV_REMOVAL_H -#define _SYS_VDEV_REMOVAL_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct spa_vdev_removal { - uint64_t svr_vdev_id; - uint64_t svr_max_offset_to_sync[TXG_SIZE]; - /* Thread performing a vdev removal. */ - kthread_t *svr_thread; - /* Segments left to copy from the current metaslab. 
*/ - range_tree_t *svr_allocd_segs; - kmutex_t svr_lock; - kcondvar_t svr_cv; - boolean_t svr_thread_exit; - - /* - * New mappings to write out each txg. - */ - list_t svr_new_segments[TXG_SIZE]; - - /* - * Ranges that were freed while a mapping was in flight. This is - * a subset of the ranges covered by vdev_im_new_segments. - */ - range_tree_t *svr_frees[TXG_SIZE]; - - /* - * Number of bytes which we have finished our work for - * in each txg. This could be data copied (which will be part of - * the mappings in vdev_im_new_segments), or data freed before - * we got around to copying it. - */ - uint64_t svr_bytes_done[TXG_SIZE]; - - /* List of leaf zap objects to be unlinked */ - nvlist_t *svr_zaplist; -} spa_vdev_removal_t; - -typedef struct spa_condensing_indirect { - /* - * New mappings to write out each txg. - */ - list_t sci_new_mapping_entries[TXG_SIZE]; - - vdev_indirect_mapping_t *sci_new_mapping; -} spa_condensing_indirect_t; - -extern int spa_remove_init(spa_t *); -extern void spa_restart_removal(spa_t *); -extern int spa_condense_init(spa_t *); -extern void spa_condense_fini(spa_t *); -extern void spa_start_indirect_condensing_thread(spa_t *); -extern void spa_vdev_condense_suspend(spa_t *); -extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t); -extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t); -extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *); -extern void svr_sync(spa_t *spa, dmu_tx_t *tx); -extern void spa_vdev_remove_suspend(spa_t *); -extern int spa_vdev_remove_cancel(spa_t *); -extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr); - -extern int vdev_removal_max_span; -extern int zfs_remove_max_segment; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_VDEV_REMOVAL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ /dev/null @@ -1,514 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - */ - -#ifndef _SYS_ZAP_H -#define _SYS_ZAP_H - -/* - * ZAP - ZFS Attribute Processor - * - * The ZAP is a module which sits on top of the DMU (Data Management - * Unit) and implements a higher-level storage primitive using DMU - * objects. Its primary consumer is the ZPL (ZFS Posix Layer). - * - * A "zapobj" is a DMU object which the ZAP uses to stores attributes. - * Users should use only zap routines to access a zapobj - they should - * not access the DMU object directly using DMU routines. 
- * - * The attributes stored in a zapobj are name-value pairs. The name is - * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including - * terminating NULL). The value is an array of integers, which may be - * 1, 2, 4, or 8 bytes long. The total space used by the array (number - * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. - * Note that an 8-byte integer value can be used to store the location - * (object number) of another dmu object (which may be itself a zapobj). - * Note that you can use a zero-length attribute to store a single bit - * of information - the attribute is present or not. - * - * The ZAP routines are thread-safe. However, you must observe the - * DMU's restriction that a transaction may not be operated on - * concurrently. - * - * Any of the routines that return an int may return an I/O error (EIO - * or ECHECKSUM). - * - * - * Implementation / Performance Notes: - * - * The ZAP is intended to operate most efficiently on attributes with - * short (49 bytes or less) names and single 8-byte values, for which - * the microzap will be used. The ZAP should be efficient enough so - * that the user does not need to cache these attributes. - * - * The ZAP's locking scheme makes its routines thread-safe. Operations - * on different zapobjs will be processed concurrently. Operations on - * the same zapobj which only read data will be processed concurrently. - * Operations on the same zapobj which modify data will be processed - * concurrently when there are many attributes in the zapobj (because - * the ZAP uses per-block locking - more than 128 * (number of cpus) - * small attributes will suffice). - */ - -/* - * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C - * strings) for the names of attributes, rather than a byte string - * bounded by an explicit length. If some day we want to support names - * in character sets which have embedded zeros (eg. UTF-16, UTF-32), - * we'll have to add routines for using length-bounded strings. - */ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Specifies matching criteria for ZAP lookups. - * MT_NORMALIZE Use ZAP normalization flags, which can include both - * unicode normalization and case-insensitivity. - * MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is - * specified and ZAP normalization flags include - * U8_TEXTPREP_TOUPPER. - */ -typedef enum matchtype { - MT_NORMALIZE = 1 << 0, - MT_MATCH_CASE = 1 << 1, -} matchtype_t; - -typedef enum zap_flags { - /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ - ZAP_FLAG_HASH64 = 1 << 0, - /* Key is binary, not string (zap_add_uint64() can be used) */ - ZAP_FLAG_UINT64_KEY = 1 << 1, - /* - * First word of key (which must be an array of uint64) is - * already randomly distributed. - */ - ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, -} zap_flags_t; - -/* - * Create a new zapobj with no attributes and return its object number. - * - * dnodesize specifies the on-disk size of the dnode for the new zapobj. - * Valid values are multiples of 512 up to DNODE_MAX_SIZE. 
- */ -uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags, - dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, - zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, - int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, - uint64_t parent_obj, const char *name, dmu_tx_t *tx); -uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, - uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); -uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, - uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); - -/* - * Initialize an already-allocated object. - */ -void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, - zap_flags_t flags, dmu_tx_t *tx); - -/* - * Create a new zapobj with no attributes from the given (unallocated) - * object number. - */ -int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); -int zap_create_claim_norm(objset_t *ds, uint64_t obj, - int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); -int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj, - int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); - -/* - * The zapobj passed in must be a valid ZAP object for all of the - * following routines. - */ - -/* - * Destroy this zapobj and all its attributes. - * - * Frees the object number using dmu_object_free. - */ -int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); - -/* - * Manipulate attributes. - * - * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. - */ - -/* - * Retrieve the contents of the attribute with the given name. - * - * If the requested attribute does not exist, the call will fail and - * return ENOENT. - * - * If 'integer_size' is smaller than the attribute's integer size, the - * call will fail and return EINVAL. - * - * If 'integer_size' is equal to or larger than the attribute's integer - * size, the call will succeed and return 0. - * - * When converting to a larger integer size, the integers will be treated as - * unsigned (ie. no sign-extension will be performed). - * - * 'num_integers' is the length (in integers) of 'buf'. - * - * If the attribute is longer than the buffer, as many integers as will - * fit will be transferred to 'buf'. If the entire attribute was not - * transferred, the call will return EOVERFLOW. 
- */ -int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); - -/* - * If rn_len is nonzero, realname will be set to the name of the found - * entry (which may be different from the requested name if matchtype is - * not MT_EXACT). - * - * If normalization_conflictp is not NULL, it will be set if there is - * another name with the same case/unicode normalized form. - */ -int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *normalization_conflictp); -int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); -int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); -int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints); -int zap_lookup_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); -int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp); - -int zap_count_write_by_dnode(dnode_t *dn, const char *name, - int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite); - -/* - * Create an attribute with the given name and value. - * - * If an attribute with the given name already exists, the call will - * fail and return EEXIST. - */ -int zap_add(objset_t *ds, uint64_t zapobj, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx); -int zap_add_by_dnode(dnode_t *dn, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx); -int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx); - -/* - * Set the attribute with the given name to the given value. If an - * attribute with the given name does not exist, it will be created. If - * an attribute with the given name already exists, the previous value - * will be overwritten. The integer_size may be different from the - * existing attribute's integer size, in which case the attribute's - * integer size will be updated to the new value. - */ -int zap_update(objset_t *ds, uint64_t zapobj, const char *name, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); -int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); - -/* - * Get the length (in integers) and the integer size of the specified - * attribute. - * - * If the requested attribute does not exist, the call will fail and - * return ENOENT. - */ -int zap_length(objset_t *ds, uint64_t zapobj, const char *name, - uint64_t *integer_size, uint64_t *num_integers); -int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t *integer_size, uint64_t *num_integers); - -/* - * Remove the specified attribute. - * - * If the specified attribute does not exist, the call will fail and - * return ENOENT. 
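Illustrative sketch (not part of the patch) of the zap_add/zap_update/zap_lookup semantics documented above; a kernel or libzpool context is assumed, the helper and the attribute name "example" are hypothetical, and error handling is trimmed.

static int
zap_example_roundtrip(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	uint64_t in = 1, out = 0;
	int err;

	/* Fails with EEXIST if "example" is already present. */
	err = zap_add(os, zapobj, "example", sizeof (uint64_t), 1, &in, tx);
	if (err != 0 && err != EEXIST)
		return (err);

	/* Create-or-overwrite variant with the same argument shape. */
	err = zap_update(os, zapobj, "example", sizeof (uint64_t), 1, &in, tx);
	if (err != 0)
		return (err);

	/*
	 * ENOENT if the attribute is missing, EINVAL if integer_size is
	 * smaller than the stored integer size, EOVERFLOW if the value
	 * does not fit in the single-integer buffer `out`.
	 */
	return (zap_lookup(os, zapobj, "example", sizeof (uint64_t), 1, &out));
}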
- */ -int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); -int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, - matchtype_t mt, dmu_tx_t *tx); -int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); -int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, dmu_tx_t *tx); - -/* - * Returns (in *count) the number of attributes in the specified zap - * object. - */ -int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); - -/* - * Returns (in name) the name of the entry whose (value & mask) - * (za_first_integer) is value, or ENOENT if not found. The string - * pointed to by name must be at least 256 bytes long. If mask==0, the - * match must be exact (ie, same as mask=-1ULL). - */ -int zap_value_search(objset_t *os, uint64_t zapobj, - uint64_t value, uint64_t mask, char *name); - -/* - * Transfer all the entries from fromobj into intoobj. Only works on - * int_size=8 num_integers=1 values. Fails if there are any duplicated - * entries. - */ -int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); - -/* Same as zap_join, but set the values to 'value'. */ -int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - uint64_t value, dmu_tx_t *tx); - -/* Same as zap_join, but add together any duplicated entries. */ -int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - dmu_tx_t *tx); - -/* - * Manipulate entries where the name + value are the "same" (the name is - * a stringified version of the value). - */ -int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); -int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); -int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); -int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx); - -/* Here the key is an int and the value is a different int. */ -int zap_add_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx); -int zap_update_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx); -int zap_lookup_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t *valuep); - -int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, - dmu_tx_t *tx); - -struct zap; -struct zap_leaf; -typedef struct zap_cursor { - /* This structure is opaque! */ - objset_t *zc_objset; - struct zap *zc_zap; - struct zap_leaf *zc_leaf; - uint64_t zc_zapobj; - uint64_t zc_serialized; - uint64_t zc_hash; - uint32_t zc_cd; - boolean_t zc_prefetch; -} zap_cursor_t; - -typedef struct { - int za_integer_length; - /* - * za_normalization_conflict will be set if there are additional - * entries with this normalized form (eg, "foo" and "Foo"). - */ - boolean_t za_normalization_conflict; - uint64_t za_num_integers; - uint64_t za_first_integer; /* no sign extension for <8byte ints */ - char za_name[ZAP_MAXNAMELEN]; -} zap_attribute_t; - -/* - * The interface for listing all the attributes of a zapobj can be - * thought of as cursor moving down a list of the attributes one by - * one. The cookie returned by the zap_cursor_serialize routine is - * persistent across system calls (and across reboot, even). - */ - -/* - * Initialize a zap cursor, pointing to the "first" attribute of the - * zapobj. You must _fini the cursor when you are done with it. 
- */ -void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); -void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, - uint64_t zapobj); -void zap_cursor_fini(zap_cursor_t *zc); - -/* - * Get the attribute currently pointed to by the cursor. Returns - * ENOENT if at the end of the attributes. - */ -int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); - -/* - * Advance the cursor to the next attribute. - */ -void zap_cursor_advance(zap_cursor_t *zc); - -/* - * Get a persistent cookie pointing to the current position of the zap - * cursor. The low 4 bits in the cookie are always zero, and thus can - * be used as to differentiate a serialized cookie from a different type - * of value. The cookie will be less than 2^32 as long as there are - * fewer than 2^22 (4.2 million) entries in the zap object. - */ -uint64_t zap_cursor_serialize(zap_cursor_t *zc); - -/* - * Advance the cursor to the attribute having the given key. - */ -int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); - -/* - * Initialize a zap cursor pointing to the position recorded by - * zap_cursor_serialize (in the "serialized" argument). You can also - * use a "serialized" argument of 0 to start at the beginning of the - * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to - * zap_cursor_init(...).) - */ -void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, - uint64_t zapobj, uint64_t serialized); - - -#define ZAP_HISTOGRAM_SIZE 10 - -typedef struct zap_stats { - /* - * Size of the pointer table (in number of entries). - * This is always a power of 2, or zero if it's a microzap. - * In general, it should be considerably greater than zs_num_leafs. - */ - uint64_t zs_ptrtbl_len; - - uint64_t zs_blocksize; /* size of zap blocks */ - - /* - * The number of blocks used. Note that some blocks may be - * wasted because old ptrtbl's and large name/value blocks are - * not reused. (Although their space is reclaimed, we don't - * reuse those offsets in the object.) - */ - uint64_t zs_num_blocks; - - /* - * Pointer table values from zap_ptrtbl in the zap_phys_t - */ - uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ - uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ - uint64_t zs_ptrtbl_zt_blk; /* starting block number */ - uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ - uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ - - /* - * Values of the other members of the zap_phys_t - */ - uint64_t zs_block_type; /* ZBT_HEADER */ - uint64_t zs_magic; /* ZAP_MAGIC */ - uint64_t zs_num_leafs; /* The number of leaf blocks */ - uint64_t zs_num_entries; /* The number of zap entries */ - uint64_t zs_salt; /* salt to stir into hash function */ - - /* - * Histograms. For all histograms, the last index - * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater - * than what can be represented. For example - * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number - * of leafs with more than 45 entries. - */ - - /* - * zs_leafs_with_n_pointers[n] is the number of leafs with - * 2^n pointers to it. - */ - uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_leafs_with_n_entries[n] is the number of leafs with - * [n*5, (n+1)*5) entries. In the current implementation, there - * can be at most 55 entries in any block, but there may be - * fewer if the name or value is large, or the block is not - * completely full. 
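Illustrative sketch (not part of the patch) of the cursor interface described above: initialize, retrieve until ENOENT, advance, then fini. A kernel or libzpool context is assumed and the helper is hypothetical.

static void
zap_example_enumerate(objset_t *os, uint64_t zapobj)
{
	zap_cursor_t zc;
	zap_attribute_t za;

	for (zap_cursor_init(&zc, os, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		/*
		 * za_name is the attribute name; za_num_integers,
		 * za_integer_length and za_first_integer describe the value.
		 */
		printf("%s: %llu x %d-byte ints\n", za.za_name,
		    (unsigned long long)za.za_num_integers,
		    za.za_integer_length);
	}
	zap_cursor_fini(&zc);
}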
- */ - uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_leafs_n_tenths_full[n] is the number of leafs whose - * fullness is in the range [n/10, (n+1)/10). - */ - uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_entries_using_n_chunks[n] is the number of entries which - * consume n 24-byte chunks. (Note, large names/values only use - * one chunk, but contribute to zs_num_blocks_large.) - */ - uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; - - /* - * zs_buckets_with_n_entries[n] is the number of buckets (each - * leaf has 64 buckets) with n entries. - * zs_buckets_with_n_entries[1] should be very close to - * zs_num_entries. - */ - uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; -} zap_stats_t; - -/* - * Get statistics about a ZAP object. Note: you need to be aware of the - * internal implementation of the ZAP to correctly interpret some of the - * statistics. This interface shouldn't be relied on unless you really - * know what you're doing. - */ -int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZAP_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h +++ /dev/null @@ -1,242 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. 
- */ - -#ifndef _SYS_ZAP_IMPL_H -#define _SYS_ZAP_IMPL_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern int fzap_default_block_shift; - -#define ZAP_MAGIC 0x2F52AB2ABULL - -#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) - -#define MZAP_ENT_LEN 64 -#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) -#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE - -#define ZAP_NEED_CD (-1U) - -typedef struct mzap_ent_phys { - uint64_t mze_value; - uint32_t mze_cd; - uint16_t mze_pad; /* in case we want to chain them someday */ - char mze_name[MZAP_NAME_LEN]; -} mzap_ent_phys_t; - -typedef struct mzap_phys { - uint64_t mz_block_type; /* ZBT_MICRO */ - uint64_t mz_salt; - uint64_t mz_normflags; - uint64_t mz_pad[5]; - mzap_ent_phys_t mz_chunk[1]; - /* actually variable size depending on block size */ -} mzap_phys_t; - -typedef struct mzap_ent { - avl_node_t mze_node; - int mze_chunkid; - uint64_t mze_hash; - uint32_t mze_cd; /* copy from mze_phys->mze_cd */ -} mzap_ent_t; - -#define MZE_PHYS(zap, mze) \ - (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid]) - -/* - * The (fat) zap is stored in one object. It is an array of - * 1<= 6] [zap_leaf_t] [ptrtbl] ... - * - */ - -struct dmu_buf; -struct zap_leaf; - -#define ZBT_LEAF ((1ULL << 63) + 0) -#define ZBT_HEADER ((1ULL << 63) + 1) -#define ZBT_MICRO ((1ULL << 63) + 3) -/* any other values are ptrtbl blocks */ - -/* - * the embedded pointer table takes up half a block: - * block size / entry size (2^3) / 2 - */ -#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) - -/* - * The embedded pointer table starts half-way through the block. Since - * the pointer table itself is half the block, it starts at (64-bit) - * word number (1<zap_dbuf->db_data); -} - -inline mzap_phys_t * -zap_m_phys(zap_t *zap) -{ - return (zap->zap_dbuf->db_data); -} - -typedef struct zap_name { - zap_t *zn_zap; - int zn_key_intlen; - const void *zn_key_orig; - int zn_key_orig_numints; - const void *zn_key_norm; - int zn_key_norm_numints; - uint64_t zn_hash; - matchtype_t zn_matchtype; - int zn_normflags; - char zn_normbuf[ZAP_MAXNAMELEN]; -} zap_name_t; - -#define zap_f zap_u.zap_fat -#define zap_m zap_u.zap_micro - -boolean_t zap_match(zap_name_t *zn, const char *matchname); -int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp); -void zap_unlockdir(zap_t *zap, void *tag); -void zap_evict_sync(void *dbu); -zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); -void zap_name_free(zap_name_t *zn); -int zap_hashbits(zap_t *zap); -uint32_t zap_maxcd(zap_t *zap); -uint64_t zap_getflags(zap_t *zap); - -#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 
0 : ((hash) >> (64 - (n)))) - -void fzap_byteswap(void *buf, size_t size); -int fzap_count(zap_t *zap, uint64_t *count); -int fzap_lookup(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, void *buf, - char *realname, int rn_len, boolean_t *normalization_conflictp); -void fzap_prefetch(zap_name_t *zn); -int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, void *tag, dmu_tx_t *tx); -int fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, - void *tag, dmu_tx_t *tx); -int fzap_length(zap_name_t *zn, - uint64_t *integer_size, uint64_t *num_integers); -int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); -int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); -void fzap_get_stats(zap_t *zap, zap_stats_t *zs); -void zap_put_leaf(struct zap_leaf *l); - -int fzap_add_cd(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, void *tag, dmu_tx_t *tx); -void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); -int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZAP_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - */ - -#ifndef _SYS_ZAP_LEAF_H -#define _SYS_ZAP_LEAF_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct zap; -struct zap_name; -struct zap_stats; - -#define ZAP_LEAF_MAGIC 0x2AB1EAF - -/* chunk size = 24 bytes */ -#define ZAP_LEAF_CHUNKSIZE 24 - -/* - * The amount of space available for chunks is: - * block size (1<l_bs) - hash entry size (2) * number of hash - * entries - header space (2*chunksize) - */ -#define ZAP_LEAF_NUMCHUNKS(l) \ - (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ - ZAP_LEAF_CHUNKSIZE - 2) - -/* - * The amount of space within the chunk available for the array is: - * chunk size - space for type (1) - space for next pointer (2) - */ -#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) - -#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ - (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) - -/* - * Low water mark: when there are only this many chunks free, start - * growing the ptrtbl. Ideally, this should be larger than a - * "reasonably-sized" entry. 
20 chunks is more than enough for the - * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), - * while still being only around 3% for 16k blocks. - */ -#define ZAP_LEAF_LOW_WATER (20) - -/* - * The leaf hash table has block size / 2^5 (32) number of entries, - * which should be more than enough for the maximum number of entries, - * which is less than block size / CHUNKSIZE (24) / minimum number of - * chunks per entry (3). - */ -#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) -#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) - -/* - * The chunks start immediately after the hash table. The end of the - * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a - * chunk_t. - */ -#define ZAP_LEAF_CHUNK(l, idx) \ - ((zap_leaf_chunk_t *) \ - (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] -#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) - -typedef enum zap_chunk_type { - ZAP_CHUNK_FREE = 253, - ZAP_CHUNK_ENTRY = 252, - ZAP_CHUNK_ARRAY = 251, - ZAP_CHUNK_TYPE_MAX = 250 -} zap_chunk_type_t; - -#define ZLF_ENTRIES_CDSORTED (1<<0) - -/* - * TAKE NOTE: - * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. - */ -typedef struct zap_leaf_phys { - struct zap_leaf_header { - /* Public to ZAP */ - uint64_t lh_block_type; /* ZBT_LEAF */ - uint64_t lh_pad1; - uint64_t lh_prefix; /* hash prefix of this leaf */ - uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ - uint16_t lh_nfree; /* number free chunks */ - uint16_t lh_nentries; /* number of entries */ - uint16_t lh_prefix_len; /* num bits used to id this */ - - /* Private to zap_leaf */ - uint16_t lh_freelist; /* chunk head of free list */ - uint8_t lh_flags; /* ZLF_* flags */ - uint8_t lh_pad2[11]; - } l_hdr; /* 2 24-byte chunks */ - - /* - * The header is followed by a hash table with - * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is - * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) - * zap_leaf_chunk structures. These structures are accessed - * with the ZAP_LEAF_CHUNK() macro. - */ - - uint16_t l_hash[1]; -} zap_leaf_phys_t; - -typedef union zap_leaf_chunk { - struct zap_leaf_entry { - uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ - uint8_t le_value_intlen; /* size of value's ints */ - uint16_t le_next; /* next entry in hash chain */ - uint16_t le_name_chunk; /* first chunk of the name */ - uint16_t le_name_numints; /* ints in name (incl null) */ - uint16_t le_value_chunk; /* first chunk of the value */ - uint16_t le_value_numints; /* value length in ints */ - uint32_t le_cd; /* collision differentiator */ - uint64_t le_hash; /* hash value of the name */ - } l_entry; - struct zap_leaf_array { - uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ - uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; - uint16_t la_next; /* next blk or CHAIN_END */ - } l_array; - struct zap_leaf_free { - uint8_t lf_type; /* always ZAP_CHUNK_FREE */ - uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; - uint16_t lf_next; /* next in free list, or CHAIN_END */ - } l_free; -} zap_leaf_chunk_t; - -typedef struct zap_leaf { - dmu_buf_user_t l_dbu; - krwlock_t l_rwlock; - uint64_t l_blkid; /* 1<l_dbuf->db_data); -} - -typedef struct zap_entry_handle { - /* Set by zap_leaf and public to ZAP */ - uint64_t zeh_num_integers; - uint64_t zeh_hash; - uint32_t zeh_cd; - uint8_t zeh_integer_size; - - /* Private to zap_leaf */ - uint16_t zeh_fakechunk; - uint16_t *zeh_chunkp; - zap_leaf_t *zeh_leaf; -} zap_entry_handle_t; - -/* - * Return a handle to the named entry, or ENOENT if not found. 
The hash - * value must equal zap_hash(name). - */ -extern int zap_leaf_lookup(zap_leaf_t *l, - struct zap_name *zn, zap_entry_handle_t *zeh); - -/* - * Return a handle to the entry with this hash+cd, or the entry with the - * next closest hash+cd. - */ -extern int zap_leaf_lookup_closest(zap_leaf_t *l, - uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); - -/* - * Read the first num_integers in the attribute. Integer size - * conversion will be done without sign extension. Return EINVAL if - * integer_size is too small. Return EOVERFLOW if there are more than - * num_integers in the attribute. - */ -extern int zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf); - -extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, - uint16_t buflen, char *buf); - -/* - * Replace the value of an existing entry. - * - * May fail if it runs out of space (ENOSPC). - */ -extern int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf); - -/* - * Remove an entry. - */ -extern void zap_entry_remove(zap_entry_handle_t *zeh); - -/* - * Create an entry. An equal entry must not exist, and this entry must - * belong in this leaf (according to its hash value). Fills in the - * entry handle on success. Returns 0 on success or ENOSPC on failure. - */ -extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh); - -/* Determine whether there is another entry with the same normalized form. */ -extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, - struct zap_name *zn, const char *name, struct zap *zap); - -/* - * Other stuff. - */ - -extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); -extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); -extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); -extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, - struct zap_stats *zs); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZAP_LEAF_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
- */ - -#ifndef _SYS_ZCP_H -#define _SYS_ZCP_H - -#include -#include - -#include "lua.h" -#include "lualib.h" -#include "lauxlib.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZCP_RUN_INFO_KEY "runinfo" - -extern uint64_t zfs_lua_max_instrlimit; -extern uint64_t zfs_lua_max_memlimit; - -int zcp_argerror(lua_State *, int, const char *, ...); - -int zcp_eval(const char *, const char *, boolean_t, uint64_t, uint64_t, - nvpair_t *, nvlist_t *); - -int zcp_load_list_lib(lua_State *); - -int zcp_load_synctask_lib(lua_State *, boolean_t); - -typedef void (zcp_cleanup_t)(void *); -typedef struct zcp_cleanup_handler { - zcp_cleanup_t *zch_cleanup_func; - void *zch_cleanup_arg; - list_node_t zch_node; -} zcp_cleanup_handler_t; - -typedef struct zcp_alloc_arg { - boolean_t aa_must_succeed; - int64_t aa_alloc_remaining; - int64_t aa_alloc_limit; -} zcp_alloc_arg_t; - -typedef struct zcp_run_info { - dsl_pool_t *zri_pool; - - /* - * An estimate of the total amount of space consumed by all - * synctasks we have successfully performed so far in this - * channel program. Used to generate ENOSPC errors for syncfuncs. - */ - int zri_space_used; - - /* - * The credentials of the thread which originally invoked the channel - * program. Since channel programs are always invoked from the synctask - * thread they should always do permissions checks against this cred - * rather than the 'current' thread's. - */ - cred_t *zri_cred; - - /* - * The tx in which this channel program is running. - */ - dmu_tx_t *zri_tx; - - /* - * The maximum number of Lua instructions the channel program is allowed - * to execute. If it takes longer than this it will time out. A value - * of 0 indicates no instruction limit. - */ - uint64_t zri_maxinstrs; - - /* - * The number of Lua instructions the channel program has executed. - */ - uint64_t zri_curinstrs; - - /* - * Boolean indicating whether or not the channel program exited - * because it timed out. - */ - boolean_t zri_timed_out; - - /* - * Channel program was canceled by user - */ - boolean_t zri_canceled; - - /* - * Boolean indicating whether or not we are running in syncing - * context. - */ - boolean_t zri_sync; - - /* - * List of currently registered cleanup handlers, which will be - * triggered in the event of a fatal error. - */ - list_t zri_cleanup_handlers; - - /* - * The Lua state context of our channel program. - */ - lua_State *zri_state; - - /* - * Lua memory allocator arguments. - */ - zcp_alloc_arg_t *zri_allocargs; - - /* - * Contains output values from zcp script or error string. - */ - nvlist_t *zri_outnvl; - - /* - * The errno number returned to caller of zcp_eval(). - */ - int zri_result; -} zcp_run_info_t; - -zcp_run_info_t *zcp_run_info(lua_State *); -zcp_cleanup_handler_t *zcp_register_cleanup(lua_State *, zcp_cleanup_t, void *); -void zcp_deregister_cleanup(lua_State *, zcp_cleanup_handler_t *); -void zcp_cleanup(lua_State *); - -/* - * Argument parsing routines for channel program callback functions. - */ -typedef struct zcp_arg { - /* - * The name of this argument. For keyword arguments this is the name - * functions will use to set the argument. For positional arguments - * the name has no programatic meaning, but will appear in error - * messages and help output. - */ - const char *za_name; - - /* - * The Lua type this argument should have (e.g. LUA_TSTRING, - * LUA_TBOOLEAN) see the lua_type() function documentation for a - * complete list. 
Calling a function with an argument that does - * not match the expected type will result in the program terminating. - */ - const int za_lua_type; -} zcp_arg_t; - -void zcp_parse_args(lua_State *, const char *, const zcp_arg_t *, - const zcp_arg_t *); -int zcp_nvlist_to_lua(lua_State *, nvlist_t *, char *, int); -int zcp_dataset_hold_error(lua_State *, dsl_pool_t *, const char *, int); -struct dsl_dataset *zcp_dataset_hold(lua_State *, dsl_pool_t *, - const char *, void *); - -typedef int (zcp_lib_func_t)(lua_State *); -typedef struct zcp_lib_info { - const char *name; - zcp_lib_func_t *func; - const zcp_arg_t pargs[4]; - const zcp_arg_t kwargs[2]; -} zcp_lib_info_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZCP_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_global.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZCP_GLOBALS_H -#define _SYS_ZCP_GLOBALS_H - -#include "lua.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void zcp_load_globals(lua_State *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZCP_GLOBALS_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_iter.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZCP_LIST_H -#define _SYS_ZCP_LIST_H - -#include "lua.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void zcp_load_list_funcs(lua_State *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZCP_LIST_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zcp_prop.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZCP_PROP_H -#define _SYS_ZCP_PROP_H - -#ifdef __cplusplus -extern "C" { -#endif - -int zcp_load_get_lib(lua_State *state); -boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZCP_PROP_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZFEATURE_H -#define _SYS_ZFEATURE_H - -#include -#include -#include "zfeature_common.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES) -#define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \ - VALID_FEATURE_FID(fid)) - -struct spa; -struct dmu_tx; -struct objset; - -extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *); -extern void spa_feature_enable(struct spa *, spa_feature_t, - struct dmu_tx *); -extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *); -extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *); -extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t); -extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t); -extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, - uint64_t *txg); -extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t); -extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *); - -/* - * These functions are only exported for zhack and zdb; normal callers should - * use the above interfaces. 
- */ -extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *); -extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, - uint64_t *res); -extern void feature_enable_sync(struct spa *, zfeature_info_t *, - struct dmu_tx *); -extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t, - struct dmu_tx *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFEATURE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _SYS_FS_ZFS_ACL_H -#define _SYS_FS_ZFS_ACL_H - -#ifdef _KERNEL -#include -#endif -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct znode_phys; - -#define ACE_SLOT_CNT 6 -#define ZFS_ACL_VERSION_INITIAL 0ULL -#define ZFS_ACL_VERSION_FUID 1ULL -#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID - -/* - * ZFS ACLs (Access Control Lists) are stored in various forms. - * - * Files created with ACL version ZFS_ACL_VERSION_INITIAL - * will all be created with fixed length ACEs of type - * zfs_oldace_t. - * - * Files with ACL version ZFS_ACL_VERSION_FUID will be created - * with various sized ACEs. The abstraction entries will utilize - * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t - * and some specialized CIFS ACEs will use zfs_object_ace_t. - */ - -/* - * All ACEs have a common hdr. For - * owner@, group@, and everyone@ this is all - * thats needed. - */ -typedef struct zfs_ace_hdr { - uint16_t z_type; - uint16_t z_flags; - uint32_t z_access_mask; -} zfs_ace_hdr_t; - -typedef zfs_ace_hdr_t zfs_ace_abstract_t; - -/* - * Standard ACE - */ -typedef struct zfs_ace { - zfs_ace_hdr_t z_hdr; - uint64_t z_fuid; -} zfs_ace_t; - -/* - * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE - * and will only be set/retrieved in a CIFS context. 
- */ - -typedef struct zfs_object_ace { - zfs_ace_t z_ace; - uint8_t z_object_type[16]; /* object type */ - uint8_t z_inherit_type[16]; /* inherited object type */ -} zfs_object_ace_t; - -typedef struct zfs_oldace { - uint32_t z_fuid; /* "who" */ - uint32_t z_access_mask; /* access mask */ - uint16_t z_flags; /* flags, i.e inheritance */ - uint16_t z_type; /* type of entry allow/deny */ -} zfs_oldace_t; - -typedef struct zfs_acl_phys_v0 { - uint64_t z_acl_extern_obj; /* ext acl pieces */ - uint32_t z_acl_count; /* Number of ACEs */ - uint16_t z_acl_version; /* acl version */ - uint16_t z_acl_pad; /* pad */ - zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ -} zfs_acl_phys_v0_t; - -#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) - -/* - * Size of ACL count is always 2 bytes. - * Necessary to for dealing with both V0 ACL and V1 ACL layout - */ -#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t)) - -typedef struct zfs_acl_phys { - uint64_t z_acl_extern_obj; /* ext acl pieces */ - uint32_t z_acl_size; /* Number of bytes in ACL */ - uint16_t z_acl_version; /* acl version */ - uint16_t z_acl_count; /* ace count */ - uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ -} zfs_acl_phys_t; - -typedef struct acl_ops { - uint32_t (*ace_mask_get) (void *acep); /* get access mask */ - void (*ace_mask_set) (void *acep, - uint32_t mask); /* set access mask */ - uint16_t (*ace_flags_get) (void *acep); /* get flags */ - void (*ace_flags_set) (void *acep, - uint16_t flags); /* set flags */ - uint16_t (*ace_type_get)(void *acep); /* get type */ - void (*ace_type_set)(void *acep, - uint16_t type); /* set type */ - uint64_t (*ace_who_get)(void *acep); /* get who/fuid */ - void (*ace_who_set)(void *acep, - uint64_t who); /* set who/fuid */ - size_t (*ace_size)(void *acep); /* how big is this ace */ - size_t (*ace_abstract_size)(void); /* sizeof abstract entry */ - int (*ace_mask_off)(void); /* off of access mask in ace */ - /* ptr to data if any */ - int (*ace_data)(void *acep, void **datap); -} acl_ops_t; - -/* - * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's. - * Each node will have one or more ACEs associated with it. You will - * only have multiple nodes during a chmod operation. Normally only - * one node is required. - */ -typedef struct zfs_acl_node { - list_node_t z_next; /* Next chunk of ACEs */ - void *z_acldata; /* pointer into actual ACE(s) */ - void *z_allocdata; /* pointer to kmem allocated memory */ - size_t z_allocsize; /* Size of blob in bytes */ - size_t z_size; /* length of ACL data */ - uint64_t z_ace_count; /* number of ACEs in this acl node */ - int z_ace_idx; /* ace iterator positioned on */ -} zfs_acl_node_t; - -typedef struct zfs_acl { - uint64_t z_acl_count; /* Number of ACEs */ - size_t z_acl_bytes; /* Number of bytes in ACL */ - uint_t z_version; /* version of ACL */ - void *z_next_ace; /* pointer to next ACE */ - uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) 
*/ - zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ - list_t z_acl; /* chunks of ACE data */ - acl_ops_t z_ops; /* ACL operations */ -} zfs_acl_t; - -typedef struct acl_locator_cb { - zfs_acl_t *cb_aclp; - zfs_acl_node_t *cb_acl_node; -} zfs_acl_locator_cb_t; - -#define ACL_DATA_ALLOCED 0x1 -#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) - -struct zfs_fuid_info; - -typedef struct zfs_acl_ids { - uint64_t z_fuid; /* file owner fuid */ - uint64_t z_fgid; /* file group owner fuid */ - uint64_t z_mode; /* mode to set on create */ - zfs_acl_t *z_aclp; /* ACL to create with file */ - struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ -} zfs_acl_ids_t; - -/* - * Property values for acl_mode and acl_inherit. - * - * acl_mode can take discard, noallow, groupmask and passthrough. - * whereas acl_inherit has secure instead of groupmask. - */ - -#define ZFS_ACL_DISCARD 0 -#define ZFS_ACL_NOALLOW 1 -#define ZFS_ACL_GROUPMASK 2 -#define ZFS_ACL_PASSTHROUGH 3 -#define ZFS_ACL_RESTRICTED 4 -#define ZFS_ACL_PASSTHROUGH_X 5 - -struct znode; -struct zfsvfs; - -#ifdef _KERNEL -int zfs_acl_ids_create(struct znode *, int, vattr_t *, - cred_t *, vsecattr_t *, zfs_acl_ids_t *); -void zfs_acl_ids_free(zfs_acl_ids_t *); -boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *); -int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); -int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); -void zfs_acl_rele(void *); -void zfs_oldace_byteswap(ace_t *, int); -void zfs_ace_byteswap(void *, size_t, boolean_t); -extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); -extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); -#ifdef illumos -int zfs_fastaccesschk_execute(struct znode *, cred_t *); -#endif -int zfs_freebsd_fastaccesschk_execute(struct vnode *, cred_t *); -extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); -extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); -extern int zfs_acl_access(struct znode *, int, cred_t *); -int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); -int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); -int zfs_zaccess_rename(struct znode *, struct znode *, - struct znode *, struct znode *, cred_t *cr); -void zfs_acl_free(zfs_acl_t *); -int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *, - struct zfs_fuid_info **, zfs_acl_t **); -int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); -uint64_t zfs_external_acl(struct znode *); -int zfs_znode_acl_version(struct znode *); -int zfs_acl_size(struct znode *, int *); -zfs_acl_t *zfs_acl_alloc(int); -zfs_acl_node_t *zfs_acl_node_alloc(size_t); -void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *); -void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *); -uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *, - uint64_t *, uint64_t, uint64_t); -int zfs_acl_chown_setattr(struct znode *); - -#endif - -#ifdef __cplusplus -} -#endif -#endif /* _SYS_FS_ZFS_ACL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZFS_CONTEXT_H -#define _SYS_ZFS_CONTEXT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef illumos -#include -#endif -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define boot_ncpus (mp_ncpus) - -#define CPU_SEQID (curcpu) - -#define tsd_create(keyp, destructor) do { \ - *(keyp) = osd_thread_register((destructor)); \ - KASSERT(*(keyp) > 0, ("cannot register OSD")); \ -} while (0) -#define tsd_destroy(keyp) osd_thread_deregister(*(keyp)) -#define tsd_get(key) osd_thread_get(curthread, (key)) -#define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) - -#ifdef __cplusplus -} -#endif - -extern int zfs_debug_level; -extern struct mtx zfs_debug_mtx; -#define ZFS_LOG(lvl, ...) do { \ - if (((lvl) & 0xff) <= zfs_debug_level) { \ - mtx_lock(&zfs_debug_mtx); \ - printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - if ((lvl) & 0x100) \ - kdb_backtrace(); \ - mtx_unlock(&zfs_debug_mtx); \ - } \ -} while (0) - -#define sys_shutdown rebooting - -#define noinline __attribute__((noinline)) -#define likely(x) __builtin_expect((x), 1) - -#endif /* _SYS_ZFS_CONTEXT_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _ZFS_CTLDIR_H -#define _ZFS_CTLDIR_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZFS_CTLDIR_NAME ".zfs" - -#define zfs_has_ctldir(zdp) \ - ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ - ((zdp)->z_zfsvfs->z_ctldir != NULL)) -#define zfs_show_ctldir(zdp) \ - (zfs_has_ctldir(zdp) && \ - ((zdp)->z_zfsvfs->z_show_ctldir)) - -void zfsctl_create(zfsvfs_t *); -void zfsctl_destroy(zfsvfs_t *); -int zfsctl_root(zfsvfs_t *, int, vnode_t **); -void zfsctl_init(void); -void zfsctl_fini(void); -boolean_t zfsctl_is_node(vnode_t *); - -int zfsctl_rename_snapshot(const char *from, const char *to); -int zfsctl_destroy_snapshot(const char *snapname, int force); -int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); - -int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); - -#define ZFSCTL_INO_ROOT 0x1 -#define ZFSCTL_INO_SNAPDIR 0x2 - -#ifdef __cplusplus -} -#endif - -#endif /* _ZFS_CTLDIR_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZFS_DEBUG_H -#define _SYS_ZFS_DEBUG_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - -#ifndef FALSE -#define FALSE 0 -#endif - -/* - * ZFS debugging - */ - -#if defined(DEBUG) || !defined(_KERNEL) -#if !defined(ZFS_DEBUG) -#define ZFS_DEBUG -#endif -#endif - -extern int zfs_flags; -extern boolean_t zfs_recover; -extern boolean_t zfs_free_leak_on_eio; - -#define ZFS_DEBUG_DPRINTF (1 << 0) -#define ZFS_DEBUG_DBUF_VERIFY (1 << 1) -#define ZFS_DEBUG_DNODE_VERIFY (1 << 2) -#define ZFS_DEBUG_SNAPNAMES (1 << 3) -#define ZFS_DEBUG_MODIFY (1 << 4) -/* 1<<5 was previously used, try not to reuse */ -#define ZFS_DEBUG_ZIO_FREE (1 << 6) -#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) -#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) -#define ZFS_DEBUG_INDIRECT_REMAP (1 << 9) - -#ifdef ZFS_DEBUG -extern void __dprintf(const char *file, const char *func, - int line, const char *fmt, ...); -#define dprintf(...) 
\ - if (zfs_flags & ZFS_DEBUG_DPRINTF) \ - __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) -#else -#define dprintf(...) ((void)0) -#endif /* ZFS_DEBUG */ - -extern void zfs_panic_recover(const char *fmt, ...); - -typedef struct zfs_dbgmsg { - list_node_t zdm_node; - time_t zdm_timestamp; - char zdm_msg[1]; /* variable length allocation */ -} zfs_dbgmsg_t; - -extern void zfs_dbgmsg_init(void); -extern void zfs_dbgmsg_fini(void); -extern void zfs_dbgmsg(const char *fmt, ...); -extern void zfs_dbgmsg_print(const char *tag); - -#ifdef illumos -#ifndef _KERNEL -extern int dprintf_find_string(const char *string); -#endif -#endif /* illumos */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_DEBUG_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_FS_ZFS_DIR_H -#define _SYS_FS_ZFS_DIR_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* zfs_dirent_lock() flags */ -#define ZNEW 0x0001 /* entry should not exist */ -#define ZEXISTS 0x0002 /* entry should exist */ -#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ -#define ZXATTR 0x0008 /* we want the xattr dir */ -#define ZRENAMING 0x0010 /* znode is being renamed */ -#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ -#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ -#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ - -/* mknode flags */ -#define IS_ROOT_NODE 0x01 /* create a root node */ -#define IS_XATTR 0x02 /* create an extended attribute node */ - -extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int); -extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int); -extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int, - boolean_t *); -#if 0 -extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int); -#else -extern int zfs_dirlook(znode_t *, const char *name, znode_t **); -#endif -extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, - uint_t, znode_t **, zfs_acl_ids_t *); -extern void zfs_rmnode(znode_t *); -extern boolean_t zfs_dirempty(znode_t *); -extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); -extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); -extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); -extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int); -extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_DIR_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_FS_ZFS_FUID_H -#define _SYS_FS_ZFS_FUID_H - -#include -#ifdef _KERNEL -#include -#include -#include -#endif -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum { - ZFS_OWNER, - ZFS_GROUP, - ZFS_ACE_USER, - ZFS_ACE_GROUP -} zfs_fuid_type_t; - -/* - * Estimate space needed for one more fuid table entry. 
- * for now assume its current size + 1K - */ -#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) - -#define FUID_INDEX(x) ((x) >> 32) -#define FUID_RID(x) ((x) & 0xffffffff) -#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid)) -/* - * FUIDs cause problems for the intent log - * we need to replay the creation of the FUID, - * but we can't count on the idmapper to be around - * and during replay the FUID index may be different than - * before. Also, if an ACL has 100 ACEs and 12 different - * domains we don't want to log 100 domain strings, but rather - * just the unique 12. - */ - -/* - * The FUIDs in the log will index into - * domain string table and the bottom half will be the rid. - * Used for mapping ephemeral uid/gid during ACL setting to FUIDs - */ -typedef struct zfs_fuid { - list_node_t z_next; - uint64_t z_id; /* uid/gid being converted to fuid */ - uint64_t z_domidx; /* index in AVL domain table */ - uint64_t z_logfuid; /* index for domain in log */ -} zfs_fuid_t; - -/* list of unique domains */ -typedef struct zfs_fuid_domain { - list_node_t z_next; - uint64_t z_domidx; /* AVL tree idx */ - const char *z_domain; /* domain string */ -} zfs_fuid_domain_t; - -/* - * FUID information necessary for logging create, setattr, and setacl. - */ -typedef struct zfs_fuid_info { - list_t z_fuids; - list_t z_domains; - uint64_t z_fuid_owner; - uint64_t z_fuid_group; - char **z_domain_table; /* Used during replay */ - uint32_t z_fuid_cnt; /* How many fuids in z_fuids */ - uint32_t z_domain_cnt; /* How many domains */ - size_t z_domain_str_sz; /* len of domain strings z_domain list */ -} zfs_fuid_info_t; - -#ifdef _KERNEL -struct znode; -extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); -extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t, - uint64_t, uint64_t, zfs_fuid_type_t); -extern void zfs_fuid_destroy(zfsvfs_t *); -extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, - cred_t *, zfs_fuid_info_t **); -extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, - zfs_fuid_info_t **); -extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, - uid_t *uid, uid_t *gid); -extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); -extern void zfs_fuid_info_free(zfs_fuid_info_t *); -extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); -void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *); -extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain, - char **retdomain, boolean_t addok); -extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx); -extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx); -#endif - -char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); -void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *); -uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); -void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_FUID_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ /dev/null @@ -1,466 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright 2016 RackTop Systems. - * Copyright (c) 2014 Integros [integros.com] - */ - -#ifndef _SYS_ZFS_IOCTL_H -#define _SYS_ZFS_IOCTL_H - -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#endif /* _KERNEL */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The structures in this file are passed between userland and the - * kernel. Userland may be running a 32-bit process, while the kernel - * is 64-bit. Therefore, these structures need to compile the same in - * 32-bit and 64-bit. This means not using type "long", and adding - * explicit padding so that the 32-bit structure will not be packed more - * tightly than the 64-bit structure (which requires 64-bit alignment). - */ - -/* - * Property values for snapdir - */ -#define ZFS_SNAPDIR_HIDDEN 0 -#define ZFS_SNAPDIR_VISIBLE 1 - -/* - * Field manipulation macros for the drr_versioninfo field of the - * send stream header. - */ - -/* - * Header types for zfs send streams. - */ -typedef enum drr_headertype { - DMU_SUBSTREAM = 0x1, - DMU_COMPOUNDSTREAM = 0x2 -} drr_headertype_t; - -#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) -#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) - -#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) -#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) - -/* - * Feature flags for zfs send streams (flags in drr_versioninfo) - */ - -#define DMU_BACKUP_FEATURE_DEDUP (1 << 0) -#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) -#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) -/* flags #3 - #15 are reserved for incompatible closed-source implementations */ -#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) -#define DMU_BACKUP_FEATURE_LZ4 (1 << 17) -/* flag #18 is reserved for a Delphix feature */ -#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) -#define DMU_BACKUP_FEATURE_RESUMING (1 << 20) -/* flag #21 is reserved for a Delphix feature */ -#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) -#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) -/* flag #24 is reserved for the raw send feature */ -/* flag #25 is reserved for the ZSTD compression feature */ - -/* - * Mask of all supported backup features - */ -#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ - DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ - DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ - DMU_BACKUP_FEATURE_RESUMING | \ - DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE | \ - DMU_BACKUP_FEATURE_COMPRESSED) - -/* Are all features in the given flag word currently supported? 
*/ -#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) - -typedef enum dmu_send_resume_token_version { - ZFS_SEND_RESUME_TOKEN_VERSION = 1 -} dmu_send_resume_token_version_t; - -/* - * The drr_versioninfo field of the dmu_replay_record has the - * following layout: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | reserved | feature-flags |C|S| - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * The low order two bits indicate the header type: SUBSTREAM (0x1) - * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: - * this field used to be a version number, where the two version types - * were 1 and 2. Using two bits for this allows earlier versions of - * the code to be able to recognize send streams that don't use any - * of the features indicated by feature flags. - */ - -#define DMU_BACKUP_MAGIC 0x2F5bacbacULL - -/* - * Send stream flags. Bits 24-31 are reserved for vendor-specific - * implementations and should not be used. - */ -#define DRR_FLAG_CLONE (1<<0) -#define DRR_FLAG_CI_DATA (1<<1) -/* - * This send stream, if it is a full send, includes the FREE and FREEOBJECT - * records that are created by the sending process. This means that the send - * stream can be received as a clone, even though it is not an incremental. - * This is not implemented as a feature flag, because the receiving side does - * not need to have implemented it to receive this stream; it is fully backwards - * compatible. We need a flag, though, because full send streams without it - * cannot necessarily be received as a clone correctly. - */ -#define DRR_FLAG_FREERECORDS (1<<2) - -/* - * flags in the drr_checksumflags field in the DRR_WRITE and - * DRR_WRITE_BYREF blocks - */ -#define DRR_CHECKSUM_DEDUP (1<<0) - -#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) - -/* deal with compressed drr_write replay records */ -#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) -#define DRR_WRITE_PAYLOAD_SIZE(drrw) \ - (DRR_WRITE_COMPRESSED(drrw) ? 
(drrw)->drr_compressed_size : \ - (drrw)->drr_logical_size) - -typedef struct dmu_replay_record { - enum { - DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, - DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, - DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES - } drr_type; - uint32_t drr_payloadlen; - union { - struct drr_begin { - uint64_t drr_magic; - uint64_t drr_versioninfo; /* was drr_version */ - uint64_t drr_creation_time; - dmu_objset_type_t drr_type; - uint32_t drr_flags; - uint64_t drr_toguid; - uint64_t drr_fromguid; - char drr_toname[MAXNAMELEN]; - } drr_begin; - struct drr_end { - zio_cksum_t drr_checksum; - uint64_t drr_toguid; - } drr_end; - struct drr_object { - uint64_t drr_object; - dmu_object_type_t drr_type; - dmu_object_type_t drr_bonustype; - uint32_t drr_blksz; - uint32_t drr_bonuslen; - uint8_t drr_checksumtype; - uint8_t drr_compress; - uint8_t drr_dn_slots; - uint8_t drr_pad[5]; - uint64_t drr_toguid; - /* bonus content follows */ - } drr_object; - struct drr_freeobjects { - uint64_t drr_firstobj; - uint64_t drr_numobjs; - uint64_t drr_toguid; - } drr_freeobjects; - struct drr_write { - uint64_t drr_object; - dmu_object_type_t drr_type; - uint32_t drr_pad; - uint64_t drr_offset; - uint64_t drr_logical_size; - uint64_t drr_toguid; - uint8_t drr_checksumtype; - uint8_t drr_checksumflags; - uint8_t drr_compressiontype; - uint8_t drr_pad2[5]; - /* deduplication key */ - ddt_key_t drr_key; - /* only nonzero if drr_compressiontype is not 0 */ - uint64_t drr_compressed_size; - /* content follows */ - } drr_write; - struct drr_free { - uint64_t drr_object; - uint64_t drr_offset; - uint64_t drr_length; - uint64_t drr_toguid; - } drr_free; - struct drr_write_byref { - /* where to put the data */ - uint64_t drr_object; - uint64_t drr_offset; - uint64_t drr_length; - uint64_t drr_toguid; - /* where to find the prior copy of the data */ - uint64_t drr_refguid; - uint64_t drr_refobject; - uint64_t drr_refoffset; - /* properties of the data */ - uint8_t drr_checksumtype; - uint8_t drr_checksumflags; - uint8_t drr_pad2[6]; - ddt_key_t drr_key; /* deduplication key */ - } drr_write_byref; - struct drr_spill { - uint64_t drr_object; - uint64_t drr_length; - uint64_t drr_toguid; - uint64_t drr_pad[4]; /* needed for crypto */ - /* spill data follows */ - } drr_spill; - struct drr_write_embedded { - uint64_t drr_object; - uint64_t drr_offset; - /* logical length, should equal blocksize */ - uint64_t drr_length; - uint64_t drr_toguid; - uint8_t drr_compression; - uint8_t drr_etype; - uint8_t drr_pad[6]; - uint32_t drr_lsize; /* uncompressed size of payload */ - uint32_t drr_psize; /* compr. (real) size of payload */ - /* (possibly compressed) content follows */ - } drr_write_embedded; - - /* - * Nore: drr_checksum is overlaid with all record types - * except DRR_BEGIN. Therefore its (non-pad) members - * must not overlap with members from the other structs. - * We accomplish this by putting its members at the very - * end of the struct. - */ - struct drr_checksum { - uint64_t drr_pad[34]; - /* - * fletcher-4 checksum of everything preceding the - * checksum. - */ - zio_cksum_t drr_checksum; - } drr_checksum; - } drr_u; -} dmu_replay_record_t; - -/* diff record range types */ -typedef enum diff_type { - DDR_NONE = 0x1, - DDR_INUSE = 0x2, - DDR_FREE = 0x4 -} diff_type_t; - -/* - * The diff reports back ranges of free or in-use objects. 
- */ -typedef struct dmu_diff_record { - uint64_t ddr_type; - uint64_t ddr_first; - uint64_t ddr_last; -} dmu_diff_record_t; - -typedef struct zinject_record { - uint64_t zi_objset; - uint64_t zi_object; - uint64_t zi_start; - uint64_t zi_end; - uint64_t zi_guid; - uint32_t zi_level; - uint32_t zi_error; - uint64_t zi_type; - uint32_t zi_freq; - uint32_t zi_failfast; - char zi_func[MAXNAMELEN]; - uint32_t zi_iotype; - int32_t zi_duration; - uint64_t zi_timer; - uint64_t zi_nlanes; - uint32_t zi_cmd; - uint32_t zi_pad; -} zinject_record_t; - -#define ZINJECT_NULL 0x1 -#define ZINJECT_FLUSH_ARC 0x2 -#define ZINJECT_UNLOAD_SPA 0x4 - -typedef enum zinject_type { - ZINJECT_UNINITIALIZED, - ZINJECT_DATA_FAULT, - ZINJECT_DEVICE_FAULT, - ZINJECT_LABEL_FAULT, - ZINJECT_IGNORED_WRITES, - ZINJECT_PANIC, - ZINJECT_DELAY_IO, -} zinject_type_t; - -typedef struct zfs_share { - uint64_t z_exportdata; - uint64_t z_sharedata; - uint64_t z_sharetype; /* 0 = share, 1 = unshare */ - uint64_t z_sharemax; /* max length of share string */ -} zfs_share_t; - -/* - * ZFS file systems may behave the usual, POSIX-compliant way, where - * name lookups are case-sensitive. They may also be set up so that - * all the name lookups are case-insensitive, or so that only some - * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. - */ -typedef enum zfs_case { - ZFS_CASE_SENSITIVE, - ZFS_CASE_INSENSITIVE, - ZFS_CASE_MIXED -} zfs_case_t; - -/* - * Note: this struct must have the same layout in 32-bit and 64-bit, so - * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit - * kernel. Therefore, we add padding to it so that no "hidden" padding - * is automatically added on 64-bit (but not on 32-bit). - */ -typedef struct zfs_cmd { - char zc_name[MAXPATHLEN]; /* name of pool or dataset */ - uint64_t zc_nvlist_src; /* really (char *) */ - uint64_t zc_nvlist_src_size; - uint64_t zc_nvlist_dst; /* really (char *) */ - uint64_t zc_nvlist_dst_size; - boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ - int zc_pad2; - - /* - * The following members are for legacy ioctls which haven't been - * converted to the new method. 
- */ - uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; - uint64_t zc_guid; - uint64_t zc_nvlist_conf; /* really (char *) */ - uint64_t zc_nvlist_conf_size; - uint64_t zc_cookie; - uint64_t zc_objset_type; - uint64_t zc_perm_action; - uint64_t zc_history_len; - uint64_t zc_history_offset; - uint64_t zc_obj; - uint64_t zc_iflags; /* internal to zfs(7fs) */ - zfs_share_t zc_share; - uint64_t zc_jailid; - dmu_objset_stats_t zc_objset_stats; - dmu_replay_record_t zc_begin_record; - zinject_record_t zc_inject_record; - uint32_t zc_defer_destroy; - uint32_t zc_flags; - uint64_t zc_action_handle; - int zc_cleanup_fd; - uint8_t zc_simple; - uint8_t zc_pad3[3]; - boolean_t zc_resumable; - uint32_t zc_pad4; - uint64_t zc_sendobj; - uint64_t zc_fromobj; - uint64_t zc_createtxg; - zfs_stat_t zc_stat; -} zfs_cmd_t; - -typedef struct zfs_useracct { - char zu_domain[256]; - uid_t zu_rid; - uint32_t zu_pad; - uint64_t zu_space; -} zfs_useracct_t; - -#define ZFSDEV_MAX_MINOR (1 << 16) -#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) - -#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 - -#ifdef _KERNEL -struct objset; -struct zfsvfs; - -typedef struct zfs_creat { - nvlist_t *zct_zplprops; - nvlist_t *zct_props; -} zfs_creat_t; - -extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *); -extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *); -extern int zfs_secpolicy_destroy_perms(const char *, cred_t *); -extern int zfs_busy(void); -extern void zfs_unmount_snap(const char *); -extern void zfs_destroy_unmount_origin(const char *); -#ifdef illumos -extern int getzfsvfs_impl(struct objset *, struct zfsvfs **); -#else -extern int getzfsvfs_impl(struct objset *, vfs_t **); -#endif -extern int getzfsvfs(const char *, struct zfsvfs **); - -/* - * ZFS minor numbers can refer to either a control device instance or - * a zvol. Depending on the value of zss_type, zss_data points to either - * a zvol_state_t or a zfs_onexit_t. - */ -enum zfs_soft_state_type { - ZSST_ZVOL, - ZSST_CTLDEV -}; - -typedef struct zfs_soft_state { - enum zfs_soft_state_type zss_type; - void *zss_data; -} zfs_soft_state_t; - -extern void *zfsdev_get_soft_state(minor_t minor, - enum zfs_soft_state_type which); -extern minor_t zfsdev_minor_alloc(void); - -extern void *zfsdev_state; - -#endif /* _KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_IOCTL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _SYS_ZFS_ONEXIT_H -#define _SYS_ZFS_ONEXIT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL - -typedef struct zfs_onexit { - kmutex_t zo_lock; - list_t zo_actions; -} zfs_onexit_t; - -typedef struct zfs_onexit_action_node { - list_node_t za_link; - void (*za_func)(void *); - void *za_data; -} zfs_onexit_action_node_t; - -extern void zfs_onexit_init(zfs_onexit_t **zo); -extern void zfs_onexit_destroy(zfs_onexit_t *zo); - -#endif - -extern int zfs_onexit_fd_hold(int fd, minor_t *minorp); -extern void zfs_onexit_fd_rele(int fd); -extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - uint64_t *action_handle); -extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, - boolean_t fire); -extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, - void **data); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_ONEXIT_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2018 by Delphix. All rights reserved. 
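The zfs_onexit interface above queues cleanup actions (a function pointer plus argument) against a control-device handle and either fires or discards them when the handle goes away. A hedged user-space sketch of that shape, with illustrative names and no locking or error handling:

#include <stdio.h>
#include <stdlib.h>

typedef struct demo_action {
	void			(*da_func)(void *);
	void			*da_data;
	struct demo_action	*da_next;
} demo_action_t;

typedef struct demo_onexit {
	demo_action_t	*zo_actions;
} demo_onexit_t;

static void
demo_onexit_add_cb(demo_onexit_t *zo, void (*func)(void *), void *data)
{
	demo_action_t *a = malloc(sizeof (*a));

	a->da_func = func;
	a->da_data = data;
	a->da_next = zo->zo_actions;
	zo->zo_actions = a;
}

static void
demo_onexit_destroy(demo_onexit_t *zo, int fire)
{
	demo_action_t *a, *next;

	for (a = zo->zo_actions; a != NULL; a = next) {
		next = a->da_next;
		if (fire)
			a->da_func(a->da_data);
		free(a);
	}
	zo->zo_actions = NULL;
}

static void
say(void *msg)
{
	puts(msg);
}

int
main(void)
{
	demo_onexit_t zo = { NULL };

	demo_onexit_add_cb(&zo, say, "cleanup action");
	demo_onexit_destroy(&zo, 1);	/* analogue of fire == B_TRUE */
	return (0);
}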
- */ - -#ifndef _SYS_FS_ZFS_RLOCK_H -#define _SYS_FS_ZFS_RLOCK_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef __FreeBSD__ -#define rangelock_init zfs_rangelock_init -#define rangelock_fini zfs_rangelock_fini -#endif - -typedef enum { - RL_READER, - RL_WRITER, - RL_APPEND -} rangelock_type_t; - -struct locked_range; - -typedef void (rangelock_cb_t)(struct locked_range *, void *); - -#ifdef __FreeBSD__ -typedef struct zfs_rangelock { -#else -typedef struct rangelock { -#endif - avl_tree_t rl_tree; /* contains locked_range_t */ - kmutex_t rl_lock; - rangelock_cb_t *rl_cb; - void *rl_arg; -} rangelock_t; - -typedef struct locked_range { - rangelock_t *lr_rangelock; /* rangelock that this lock applies to */ - avl_node_t lr_node; /* avl node link */ - uint64_t lr_offset; /* file range offset */ - uint64_t lr_length; /* file range length */ - uint_t lr_count; /* range reference count in tree */ - rangelock_type_t lr_type; /* range type */ - kcondvar_t lr_write_cv; /* cv for waiting writers */ - kcondvar_t lr_read_cv; /* cv for waiting readers */ - uint8_t lr_proxy; /* acting for original range */ - uint8_t lr_write_wanted; /* writer wants to lock this range */ - uint8_t lr_read_wanted; /* reader wants to lock this range */ -} locked_range_t; - -void rangelock_init(rangelock_t *, rangelock_cb_t *, void *); -void rangelock_fini(rangelock_t *); - -locked_range_t *rangelock_enter(rangelock_t *, - uint64_t, uint64_t, rangelock_type_t); -locked_range_t *rangelock_tryenter(rangelock_t *, - uint64_t, uint64_t, rangelock_type_t); -void rangelock_exit(locked_range_t *); -void rangelock_reduce(locked_range_t *, uint64_t, uint64_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_RLOCK_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ZFS_SA_H -#define _SYS_ZFS_SA_H - -#ifdef _KERNEL -#include -#include -#include -#include -#include -#include - - -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This is the list of known attributes - * to the ZPL. The values of the actual - * attributes are not defined by the order - * the enums. It is controlled by the attribute - * registration mechanism. Two different file system - * could have different numeric values for the same - * attributes. this list is only used for dereferencing - * into the table that will hold the actual numeric value. 
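The core question the range locks above answer is whether two byte ranges on the same file may proceed concurrently. A minimal sketch of the conflict rule (two ranges conflict when they overlap and at least one is a writer); proxy ranges and RL_APPEND handling are omitted:

#include <stdint.h>
#include <stdio.h>

typedef enum { DEMO_READER, DEMO_WRITER } demo_rl_type_t;

typedef struct {
	uint64_t	r_off;
	uint64_t	r_len;
	demo_rl_type_t	r_type;
} demo_range_t;

static int
demo_ranges_conflict(const demo_range_t *a, const demo_range_t *b)
{
	int overlap = a->r_off < b->r_off + b->r_len &&
	    b->r_off < a->r_off + a->r_len;

	return (overlap &&
	    (a->r_type == DEMO_WRITER || b->r_type == DEMO_WRITER));
}

int
main(void)
{
	demo_range_t rd = { 0, 4096, DEMO_READER };
	demo_range_t wr = { 2048, 4096, DEMO_WRITER };

	printf("conflict: %d\n", demo_ranges_conflict(&rd, &wr));	/* 1 */
	return (0);
}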
- */ -typedef enum zpl_attr { - ZPL_ATIME, - ZPL_MTIME, - ZPL_CTIME, - ZPL_CRTIME, - ZPL_GEN, - ZPL_MODE, - ZPL_SIZE, - ZPL_PARENT, - ZPL_LINKS, - ZPL_XATTR, - ZPL_RDEV, - ZPL_FLAGS, - ZPL_UID, - ZPL_GID, - ZPL_PAD, - ZPL_ZNODE_ACL, - ZPL_DACL_COUNT, - ZPL_SYMLINK, - ZPL_SCANSTAMP, - ZPL_DACL_ACES, - ZPL_END -} zpl_attr_t; - -#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108 -#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \ - sizeof (zfs_acl_phys_t)) - -#define SA_MODE_OFFSET 0 -#define SA_SIZE_OFFSET 8 -#define SA_GEN_OFFSET 16 -#define SA_UID_OFFSET 24 -#define SA_GID_OFFSET 32 -#define SA_PARENT_OFFSET 40 - -extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1]; -extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1]; - -/* - * This is a deprecated data structure that only exists for - * dealing with file systems create prior to ZPL version 5. - */ -typedef struct znode_phys { - uint64_t zp_atime[2]; /* 0 - last file access time */ - uint64_t zp_mtime[2]; /* 16 - last file modification time */ - uint64_t zp_ctime[2]; /* 32 - last file change time */ - uint64_t zp_crtime[2]; /* 48 - creation time */ - uint64_t zp_gen; /* 64 - generation (txg of creation) */ - uint64_t zp_mode; /* 72 - file mode bits */ - uint64_t zp_size; /* 80 - size of file */ - uint64_t zp_parent; /* 88 - directory parent (`..') */ - uint64_t zp_links; /* 96 - number of links to file */ - uint64_t zp_xattr; /* 104 - DMU object for xattrs */ - uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ - uint64_t zp_flags; /* 120 - persistent flags */ - uint64_t zp_uid; /* 128 - file owner */ - uint64_t zp_gid; /* 136 - owning group */ - uint64_t zp_zap; /* 144 - extra attributes */ - uint64_t zp_pad[3]; /* 152 - future */ - zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ - /* - * Data may pad out any remaining bytes in the znode buffer, eg: - * - * |<---------------------- dnode_phys (512) ------------------------>| - * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| - * |<---- znode (264) ---->|<---- data (56) ---->| - * - * At present, we use this space for the following: - * - symbolic links - * - 32-byte anti-virus scanstamp (regular files only) - */ -} znode_phys_t; - -#ifdef _KERNEL -int zfs_sa_readlink(struct znode *, uio_t *); -void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); -void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); -void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); -void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); -void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *); -void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *); -void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZFS_SA_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
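The znode_phys_t above stores each timestamp as a pair of 64-bit words (seconds in word 0, nanoseconds in word 1). A small standalone sketch of that round trip, assuming a standard struct timespec; helper names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <assert.h>

static void
demo_time_encode(const struct timespec *tp, uint64_t stmp[2])
{
	stmp[0] = (uint64_t)tp->tv_sec;
	stmp[1] = (uint64_t)tp->tv_nsec;
}

static void
demo_time_decode(struct timespec *tp, const uint64_t stmp[2])
{
	tp->tv_sec = (time_t)stmp[0];
	tp->tv_nsec = (long)stmp[1];
}

int
main(void)
{
	struct timespec now, back;
	uint64_t stmp[2];

	clock_gettime(CLOCK_REALTIME, &now);
	demo_time_encode(&now, stmp);
	demo_time_decode(&back, stmp);
	assert(back.tv_sec == now.tv_sec && back.tv_nsec == now.tv_nsec);
	printf("%llu.%09llu\n", (unsigned long long)stmp[0],
	    (unsigned long long)stmp[1]);
	return (0);
}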
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#ifndef _SYS_FS_ZFS_STAT_H -#define _SYS_FS_ZFS_STAT_H - -#ifdef _KERNEL -#include -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * A limited number of zpl level stats are retrievable - * with an ioctl. zfs diff is the current consumer. - */ -typedef struct zfs_stat { - uint64_t zs_gen; - uint64_t zs_mode; - uint64_t zs_links; - uint64_t zs_ctime[2]; -} zfs_stat_t; - -extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, - char *buf, int len); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_STAT_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ /dev/null @@ -1,192 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek . - * All rights reserved. - */ - -#ifndef _SYS_FS_ZFS_VFSOPS_H -#define _SYS_FS_ZFS_VFSOPS_H - -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct zfsvfs zfsvfs_t; -struct znode; - -struct zfsvfs { - vfs_t *z_vfs; /* generic fs struct */ - zfsvfs_t *z_parent; /* parent fs */ - objset_t *z_os; /* objset reference */ - uint64_t z_root; /* id of root znode */ - uint64_t z_unlinkedobj; /* id of unlinked zapobj */ - uint64_t z_max_blksz; /* maximum block size for files */ - uint64_t z_fuid_obj; /* fuid table object number */ - uint64_t z_fuid_size; /* fuid table size */ - avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ - avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ - krwlock_t z_fuid_lock; /* fuid lock */ - boolean_t z_fuid_loaded; /* fuid tables are loaded */ - boolean_t z_fuid_dirty; /* need to sync fuid table ? 
*/ - struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ - zilog_t *z_log; /* intent log pointer */ - uint_t z_acl_mode; /* acl chmod/mode behavior */ - uint_t z_acl_inherit; /* acl inheritance behavior */ - zfs_case_t z_case; /* case-sense */ - boolean_t z_utf8; /* utf8-only */ - int z_norm; /* normalization flags */ - boolean_t z_atime; /* enable atimes mount option */ - boolean_t z_unmounted; /* unmounted */ - rrmlock_t z_teardown_lock; - struct rmslock z_teardown_inactive_lock; - list_t z_all_znodes; /* all vnodes in the fs */ - kmutex_t z_znodes_lock; /* lock for z_all_znodes */ - struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ - boolean_t z_show_ctldir; /* expose .zfs in the root dir */ - boolean_t z_issnap; /* true if this is a snapshot */ - boolean_t z_vscan; /* virus scan on/off */ - boolean_t z_use_fuids; /* version allows fuids */ - boolean_t z_replay; /* set during ZIL replay */ - boolean_t z_use_sa; /* version allow system attributes */ - boolean_t z_use_namecache;/* make use of FreeBSD name cache */ - uint64_t z_version; /* ZPL version */ - uint64_t z_shares_dir; /* hidden shares dir */ - kmutex_t z_lock; - uint64_t z_userquota_obj; - uint64_t z_groupquota_obj; - uint64_t z_replay_eof; /* New end of file - replay only */ - sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ -#define ZFS_OBJ_MTX_SZ 64 - kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ -#if defined(__FreeBSD__) - struct task z_unlinked_drain_task; -#endif -}; - -#define ZFS_TRYRLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_try_rlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_rlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_runlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_wlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs) \ - rms_wunlock(&(zfsvfs)->z_teardown_inactive_lock) - -#define ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs) \ - rms_wowned(&(zfsvfs)->z_teardown_inactive_lock) - -/* - * Normal filesystems (those not under .zfs/snapshot) have a total - * file ID size limited to 12 bytes (including the length field) due to - * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical - * reasons, this same limit is being imposed by the Solaris NFSv3 implementation - * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It - * is not possible to expand beyond 12 bytes without abandoning support - * of NFSv2. - * - * For normal filesystems, we partition up the available space as follows: - * 2 bytes fid length (required) - * 6 bytes object number (48 bits) - * 4 bytes generation number (32 bits) - * - * We reserve only 48 bits for the object number, as this is the limit - * currently defined and imposed by the DMU. - */ -typedef struct zfid_short { - uint16_t zf_len; - uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ - uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ -} zfid_short_t; - -/* - * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes - * (including the length field). This makes files under .zfs/snapshot - * accessible by NFSv3 and NFSv4, but not NFSv2. 
- * - * For files under .zfs/snapshot, we partition up the available space - * as follows: - * 2 bytes fid length (required) - * 6 bytes object number (48 bits) - * 4 bytes generation number (32 bits) - * 6 bytes objset id (48 bits) - * 4 bytes[**] currently just zero (32 bits) - * - * We reserve only 48 bits for the object number and objset id, as these are - * the limits currently defined and imposed by the DMU. - * - * [*] 20 bytes on FreeBSD to fit into the size of struct fid. - * [**] 2 bytes on FreeBSD for the above reason. - */ -typedef struct zfid_long { - zfid_short_t z_fid; - uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ - uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ -} zfid_long_t; - -#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) -#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) - -extern uint_t zfs_fsyncer_key; -extern int zfs_super_owner; - -extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); -extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); -extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t *valuep); -extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); -extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t quota); -extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *, - boolean_t isgroup); -extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, - uint64_t fuid); -extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); -extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp); -extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); -extern void zfsvfs_free(zfsvfs_t *zfsvfs); -extern int zfs_check_global_label(const char *dsname, const char *hexsl); - -#ifdef _KERNEL -extern void zfsvfs_update_fromname(const char *oldname, const char *newname); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_VFSOPS_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ /dev/null @@ -1,374 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
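The zf_object/zf_setid byte arrays above are filled with obj[i] = obj >> (8 * i), i.e. a 48-bit value split little-endian into six bytes. A minimal sketch of that packing and its inverse; helper names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

static void
demo_pack48(uint8_t out[6], uint64_t v)
{
	for (int i = 0; i < 6; i++)
		out[i] = (uint8_t)(v >> (8 * i));
}

static uint64_t
demo_unpack48(const uint8_t in[6])
{
	uint64_t v = 0;

	for (int i = 0; i < 6; i++)
		v |= (uint64_t)in[i] << (8 * i);
	return (v);
}

int
main(void)
{
	uint8_t buf[6];
	uint64_t obj = 0x123456789abcULL;	/* fits in 48 bits */

	demo_pack48(buf, obj);
	assert(demo_unpack48(buf) == obj);
	printf("roundtrip ok: %#llx\n", (unsigned long long)demo_unpack48(buf));
	return (0);
}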
- */ - -#ifndef _SYS_FS_ZFS_ZNODE_H -#define _SYS_FS_ZFS_ZNODE_H - -#ifdef _KERNEL -#include -#include -#include -#include -#include -#include -#include -#include -#endif -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Additional file level attributes, that are stored - * in the upper half of zp_flags - */ -#define ZFS_READONLY 0x0000000100000000 -#define ZFS_HIDDEN 0x0000000200000000 -#define ZFS_SYSTEM 0x0000000400000000 -#define ZFS_ARCHIVE 0x0000000800000000 -#define ZFS_IMMUTABLE 0x0000001000000000 -#define ZFS_NOUNLINK 0x0000002000000000 -#define ZFS_APPENDONLY 0x0000004000000000 -#define ZFS_NODUMP 0x0000008000000000 -#define ZFS_OPAQUE 0x0000010000000000 -#define ZFS_AV_QUARANTINED 0x0000020000000000 -#define ZFS_AV_MODIFIED 0x0000040000000000 -#define ZFS_REPARSE 0x0000080000000000 -#define ZFS_OFFLINE 0x0000100000000000 -#define ZFS_SPARSE 0x0000200000000000 - -#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ -{ \ - if (value) \ - pflags |= attr; \ - else \ - pflags &= ~attr; \ - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \ - &pflags, sizeof (pflags), tx)); \ -} - -/* - * Define special zfs pflags - */ -#define ZFS_XATTR 0x1 /* is an extended attribute */ -#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ -#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ -#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ -#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ -#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ -#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ -#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ -#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ - -#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME] -#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME] -#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME] -#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME] -#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN] -#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES] -#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR] -#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK] -#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV] -#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP] -#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID] -#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID] -#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT] -#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS] -#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE] -#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT] -#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] -#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] -#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] -#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] - -/* - * Is ID ephemeral? - */ -#define IS_EPHEMERAL(x) (x > MAXUID) - -/* - * Should we use FUIDs? - */ -#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \ - spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) -#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \ - spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA) - -#define MASTER_NODE_OBJ 1 - -/* - * Special attributes for master node. - * "userquota@" and "groupquota@" are also valid (from - * zfs_userquota_prop_prefixes[]). 
- */ -#define ZFS_FSID "FSID" -#define ZFS_UNLINKED_SET "DELETE_QUEUE" -#define ZFS_ROOT_OBJ "ROOT" -#define ZPL_VERSION_STR "VERSION" -#define ZFS_FUID_TABLES "FUID" -#define ZFS_SHARES_DIR "SHARES" -#define ZFS_SA_ATTRS "SA_ATTRS" - -/* - * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in - * the directory entries. - */ -#ifndef IFTODT -#define IFTODT(mode) (((mode) & S_IFMT) >> 12) -#endif - -/* - * The directory entry has the type (currently unused on Solaris) in the - * top 4 bits, and the object number in the low 48 bits. The "middle" - * 12 bits are unused. - */ -#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) -#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) - -/* - * Directory entry locks control access to directory entries. - * They are used to protect creates, deletes, and renames. - * Each directory znode has a mutex and a list of locked names. - */ -#ifdef _KERNEL -typedef struct zfs_dirlock { - char *dl_name; /* directory entry being locked */ - uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ - uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */ - uint16_t dl_namesize; /* set if dl_name was allocated */ - kcondvar_t dl_cv; /* wait for entry to be unlocked */ - struct znode *dl_dzp; /* directory znode */ - struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ -} zfs_dirlock_t; - -typedef struct znode { - struct zfsvfs *z_zfsvfs; - vnode_t *z_vnode; - uint64_t z_id; /* object ID for this znode */ -#ifdef illumos - kmutex_t z_lock; /* znode modification lock */ - krwlock_t z_parent_lock; /* parent lock for directories */ - krwlock_t z_name_lock; /* "master" lock for dirent locks */ - zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ -#endif - rangelock_t z_rangelock; /* file range locks */ - uint8_t z_unlinked; /* file has been unlinked */ - uint8_t z_atime_dirty; /* atime needs to be synced */ - uint8_t z_zn_prefetch; /* Prefetch znodes? */ - uint8_t z_moved; /* Has this znode been moved? */ - uint_t z_blksz; /* block size in bytes */ - uint_t z_seq; /* modification sequence number */ - uint64_t z_mapcnt; /* number of pages mapped to file */ - uint64_t z_dnodesize; /* dnode size */ - uint64_t z_gen; /* generation (cached) */ - uint64_t z_size; /* file size (cached) */ - uint64_t z_atime[2]; /* atime (cached) */ - uint64_t z_links; /* file links (cached) */ - uint64_t z_pflags; /* pflags (cached) */ - uint64_t z_uid; /* uid fuid (cached) */ - uint64_t z_gid; /* gid fuid (cached) */ - mode_t z_mode; /* mode (cached) */ - uint32_t z_sync_cnt; /* synchronous open count */ - kmutex_t z_acl_lock; /* acl data lock */ - zfs_acl_t *z_acl_cached; /* cached acl */ - list_node_t z_link_node; /* all znodes in fs link */ - sa_handle_t *z_sa_hdl; /* handle to sa data */ - boolean_t z_is_sa; /* are we native sa? */ -} znode_t; - -#define ZFS_LINK_MAX UINT64_MAX - -/* - * Range locking rules - * -------------------- - * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole - * file range needs to be locked as RL_WRITER. Only then can the pages be - * freed etc and zp_size reset. zp_size must be set within range lock. - * 2. For writes and punching holes (zfs_write & zfs_space) just the range - * being written or freed needs to be locked as RL_WRITER. - * Multiple writes at the end of the file must coordinate zp_size updates - * to ensure data isn't lost. A compare and swap loop is currently used - * to ensure the file size is at least the offset last written. - * 3. 
For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being - * read needs to be locked as RL_READER. A check against zp_size can then - * be made for reading beyond end of file. - */ - -/* - * Convert between znode pointers and vnode pointers - */ -#ifdef DEBUG -static __inline vnode_t * -ZTOV(znode_t *zp) -{ - vnode_t *vp = zp->z_vnode; - - ASSERT(vp != NULL && vp->v_data == zp); - return (vp); -} -static __inline znode_t * -VTOZ(vnode_t *vp) -{ - znode_t *zp = (znode_t *)vp->v_data; - - ASSERT(zp != NULL && zp->z_vnode == vp); - return (zp); -} -#else -#define ZTOV(ZP) ((ZP)->z_vnode) -#define VTOZ(VP) ((znode_t *)(VP)->v_data) -#endif - -#define VTOZ_SMR(VP) ((znode_t *)vn_load_v_data_smr(VP)) - -/* Called on entry to each ZFS vnode and vfs operation */ -#define ZFS_ENTER(zfsvfs) \ - { \ - rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ - if ((zfsvfs)->z_unmounted) { \ - ZFS_EXIT(zfsvfs); \ - return (EIO); \ - } \ - } - -/* Must be called before exiting the vop */ -#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG) - -/* Verifies the znode is valid */ -#define ZFS_VERIFY_ZP(zp) \ - if ((zp)->z_sa_hdl == NULL) { \ - ZFS_EXIT((zp)->z_zfsvfs); \ - return (EIO); \ - } \ - -/* - * Macros for dealing with dmu_buf_hold - */ -#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) -#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ - (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) -#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ - mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) -#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ - mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) -#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ - mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) - -/* Encode ZFS stored time values from a struct timespec */ -#define ZFS_TIME_ENCODE(tp, stmp) \ -{ \ - (stmp)[0] = (uint64_t)(tp)->tv_sec; \ - (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ -} - -/* Decode ZFS stored time values to a struct timespec */ -#define ZFS_TIME_DECODE(tp, stmp) \ -{ \ - (tp)->tv_sec = (time_t)(stmp)[0]; \ - (tp)->tv_nsec = (long)(stmp)[1]; \ -} - -/* - * Timestamp defines - */ -#define ACCESSED (AT_ATIME) -#define STATE_CHANGED (AT_CTIME) -#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) - -#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ - if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ - zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE); - -extern int zfs_init_fs(zfsvfs_t *, znode_t **); -extern void zfs_set_dataprop(objset_t *); -extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, - dmu_tx_t *tx); -extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], - uint64_t [2], boolean_t); -extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); -extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); -extern void zfs_znode_init(void); -extern void zfs_znode_fini(void); -extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); -extern int zfs_rezget(znode_t *); -extern void zfs_zinactive(znode_t *); -extern void zfs_znode_delete(znode_t *, dmu_tx_t *); -extern void zfs_znode_free(znode_t *); -extern void zfs_remove_op_tables(); -extern int zfs_create_op_tables(); -extern dev_t zfs_cmpldev(uint64_t); -extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); -extern int zfs_get_stats(objset_t *os, nvlist_t *nv); -extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); -extern void zfs_znode_dmu_fini(znode_t *); - -extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t 
*dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, - vattr_t *vap); -extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, - vattr_t *vap); -extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid); -#define ZFS_NO_OBJECT 0 /* no object id */ -extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name); -extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, char *link); -extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); -extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag); -extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len); -extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); -#ifndef ZFS_NO_ACL -extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, - vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); -#endif -extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); -extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); -extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); - -extern zil_get_data_t zfs_get_data; -extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; -extern int zfsfstype; - -extern int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf); - -#endif /* _KERNEL */ - -extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_FS_ZFS_ZNODE_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ /dev/null @@ -1,464 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#ifndef _SYS_ZIL_H -#define _SYS_ZIL_H - -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct dsl_pool; -struct dsl_dataset; -struct lwb; - -/* - * Intent log format: - * - * Each objset has its own intent log. The log header (zil_header_t) - * for objset N's intent log is kept in the Nth object of the SPA's - * intent_log objset. 
The log header points to a chain of log blocks, - * each of which contains log records (i.e., transactions) followed by - * a log block trailer (zil_trailer_t). The format of a log record - * depends on the record (or transaction) type, but all records begin - * with a common structure that defines the type, length, and txg. - */ - -/* - * Intent log header - this on disk structure holds fields to manage - * the log. All fields are 64 bit to easily handle cross architectures. - */ -typedef struct zil_header { - uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ - uint64_t zh_replay_seq; /* highest replayed sequence number */ - blkptr_t zh_log; /* log chain */ - uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */ - uint64_t zh_flags; /* header flags */ - uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */ - uint64_t zh_pad[3]; -} zil_header_t; - -/* - * zh_flags bit settings - */ -#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ -#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */ - -/* - * Log block chaining. - * - * Log blocks are chained together. Originally they were chained at the - * end of the block. For performance reasons the chain was moved to the - * beginning of the block which allows writes for only the data being used. - * The older position is supported for backwards compatability. - * - * The zio_eck_t contains a zec_cksum which for the intent log is - * the sequence number of this log block. A seq of 0 is invalid. - * The zec_cksum is checked by the SPA against the sequence - * number passed in the blk_cksum field of the blkptr_t - */ -typedef struct zil_chain { - uint64_t zc_pad; - blkptr_t zc_next_blk; /* next block in chain */ - uint64_t zc_nused; /* bytes in log block used */ - zio_eck_t zc_eck; /* block trailer */ -} zil_chain_t; - -#define ZIL_MIN_BLKSZ 4096ULL - -/* - * ziltest is by and large an ugly hack, but very useful in - * checking replay without tedious work. - * When running ziltest we want to keep all itx's and so maintain - * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG - * We subtract TXG_CONCURRENT_STATES to allow for common code. - */ -#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES) - -/* - * The words of a log block checksum. - */ -#define ZIL_ZC_GUID_0 0 -#define ZIL_ZC_GUID_1 1 -#define ZIL_ZC_OBJSET 2 -#define ZIL_ZC_SEQ 3 - -typedef enum zil_create { - Z_FILE, - Z_DIR, - Z_XATTRDIR, -} zil_create_t; - -/* - * size of xvattr log section. - * its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps - * for create time and a single 64 bit integer for all of the attributes, - * and 4 64 bit integers (32 bytes) for the scanstamp. - * - */ - -#define ZIL_XVAT_SIZE(mapsize) \ - sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \ - (sizeof (uint64_t) * 7) - -/* - * Size of ACL in log. The ACE data is padded out to properly align - * on 8 byte boundary. 
- */ - -#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t))) - -/* - * Intent log transaction types and record structures - */ -#define TX_COMMIT 0 /* Commit marker (no on-disk state) */ -#define TX_CREATE 1 /* Create file */ -#define TX_MKDIR 2 /* Make directory */ -#define TX_MKXATTR 3 /* Make XATTR directory */ -#define TX_SYMLINK 4 /* Create symbolic link to a file */ -#define TX_REMOVE 5 /* Remove file */ -#define TX_RMDIR 6 /* Remove directory */ -#define TX_LINK 7 /* Create hard link to a file */ -#define TX_RENAME 8 /* Rename a file */ -#define TX_WRITE 9 /* File write */ -#define TX_TRUNCATE 10 /* Truncate a file */ -#define TX_SETATTR 11 /* Set file attributes */ -#define TX_ACL_V0 12 /* Set old formatted ACL */ -#define TX_ACL 13 /* Set ACL */ -#define TX_CREATE_ACL 14 /* create with ACL */ -#define TX_CREATE_ATTR 15 /* create + attrs */ -#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ -#define TX_MKDIR_ACL 17 /* mkdir with ACL */ -#define TX_MKDIR_ATTR 18 /* mkdir with attr */ -#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ -#define TX_WRITE2 20 /* dmu_sync EALREADY write */ -#define TX_MAX_TYPE 21 /* Max transaction type */ - -/* - * The transactions for mkdir, symlink, remove, rmdir, link, and rename - * may have the following bit set, indicating the original request - * specified case-insensitive handling of names. - */ -#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ - -/* - * Transactions for write, truncate, setattr, acl_v0, and acl can be logged - * out of order. For convenience in the code, all such records must have - * lr_foid at the same offset. - */ -#define TX_OOO(txtype) \ - ((txtype) == TX_WRITE || \ - (txtype) == TX_TRUNCATE || \ - (txtype) == TX_SETATTR || \ - (txtype) == TX_ACL_V0 || \ - (txtype) == TX_ACL || \ - (txtype) == TX_WRITE2) - -/* - * The number of dnode slots consumed by the object is stored in the 8 - * unused upper bits of the object ID. We subtract 1 from the value - * stored on disk for compatibility with implementations that don't - * support large dnodes. The slot count for a single-slot dnode will - * contain 0 for those bits to preserve the log record format for - * "small" dnodes. - */ -#define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1) -#define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1) -#define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT) -#define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x)) - -/* - * Format of log records. - * The fields are carefully defined to allow them to be aligned - * and sized the same on sparc & intel architectures. - * Each log record has a common structure at the beginning. - * - * The log record on disk (lrc_seq) holds the sequence number of all log - * records which is used to ensure we don't replay the same record. - */ -typedef struct { /* common log record header */ - uint64_t lrc_txtype; /* intent log transaction type */ - uint64_t lrc_reclen; /* transaction record length */ - uint64_t lrc_txg; /* dmu transaction group number */ - uint64_t lrc_seq; /* see comment above */ -} lr_t; - -/* - * Common start of all out-of-order record types (TX_OOO() above). - */ -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* object id */ -} lr_ooo_t; - -/* - * Handle option extended vattr attributes. 
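The LR_FOID_* macros above keep the dnode slot count, minus one, in the top 8 bits of the 64-bit object id and the object number in the low bits. A hedged sketch of that encoding; the 48-bit object width used here is an assumption made for the example:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define	DEMO_OBJ_BITS	48

static uint64_t
demo_foid_pack(uint64_t obj, unsigned slots)
{
	return ((obj & ((1ULL << DEMO_OBJ_BITS) - 1)) |
	    ((uint64_t)(slots - 1) << 56));
}

static uint64_t
demo_foid_obj(uint64_t foid)
{
	return (foid & ((1ULL << DEMO_OBJ_BITS) - 1));
}

static unsigned
demo_foid_slots(uint64_t foid)
{
	return ((unsigned)(foid >> 56) + 1);	/* 0 on disk means one slot */
}

int
main(void)
{
	uint64_t foid = demo_foid_pack(12345, 2);

	assert(demo_foid_obj(foid) == 12345);
	assert(demo_foid_slots(foid) == 2);
	printf("obj %llu uses %u slots\n",
	    (unsigned long long)demo_foid_obj(foid), demo_foid_slots(foid));
	return (0);
}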
- * - * Whenever new attributes are added the version number - * will need to be updated as will code in - * zfs_log.c and zfs_replay.c - */ -typedef struct { - uint32_t lr_attr_masksize; /* number of elements in array */ - uint32_t lr_attr_bitmap; /* First entry of array */ - /* remainder of array and any additional fields */ -} lr_attr_t; - -/* - * log record for creates without optional ACL. - * This log record does support optional xvattr_t attributes. - */ -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_doid; /* object id of directory */ - uint64_t lr_foid; /* object id of created file object */ - uint64_t lr_mode; /* mode of object */ - uint64_t lr_uid; /* uid of object */ - uint64_t lr_gid; /* gid of object */ - uint64_t lr_gen; /* generation (txg of creation) */ - uint64_t lr_crtime[2]; /* creation time */ - uint64_t lr_rdev; /* rdev of object to create */ - /* name of object to create follows this */ - /* for symlinks, link content follows name */ - /* for creates with xvattr data, the name follows the xvattr info */ -} lr_create_t; - -/* - * FUID ACL record will be an array of ACEs from the original ACL. - * If this array includes ephemeral IDs, the record will also include - * an array of log-specific FUIDs to replace the ephemeral IDs. - * Only one copy of each unique domain will be present, so the log-specific - * FUIDs will use an index into a compressed domain table. On replay this - * information will be used to construct real FUIDs (and bypass idmap, - * since it may not be available). - */ - -/* - * Log record for creates with optional ACL - * This log record is also used for recording any FUID - * information needed for replaying the create. If the - * file doesn't have any actual ACEs then the lr_aclcnt - * would be zero. - * - * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's. - * If create is also setting xvattr's, then acl data follows xvattr. - * If ACE FUIDs are needed then they will follow the xvattr_t. Following - * the FUIDs will be the domain table information. The FUIDs for the owner - * and group will be in lr_create. Name follows ACL data. 
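As the lr_create_t comment above notes, the name (and, for symlinks, the link target) follows the fixed part of the record. A simplified sketch of building such a variable-length record; the struct here is a cut-down stand-in, not the real lr_create_t, and the record length field covers the fixed part plus the name:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct demo_lr_create {
	uint64_t	lrc_reclen;	/* total record length, name included */
	uint64_t	lr_doid;	/* directory object id */
	uint64_t	lr_foid;	/* created object id */
	/* NUL-terminated name follows */
} demo_lr_create_t;

static demo_lr_create_t *
demo_make_create(uint64_t doid, uint64_t foid, const char *name)
{
	size_t len = sizeof (demo_lr_create_t) + strlen(name) + 1;
	demo_lr_create_t *lr = calloc(1, len);

	lr->lrc_reclen = len;
	lr->lr_doid = doid;
	lr->lr_foid = foid;
	memcpy((char *)(lr + 1), name, strlen(name) + 1);
	return (lr);
}

int
main(void)
{
	demo_lr_create_t *lr = demo_make_create(4, 128, "newfile");

	printf("record of %llu bytes, name \"%s\"\n",
	    (unsigned long long)lr->lrc_reclen, (const char *)(lr + 1));
	free(lr);
	return (0);
}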
- */ -typedef struct { - lr_create_t lr_create; /* common create portion */ - uint64_t lr_aclcnt; /* number of ACEs in ACL */ - uint64_t lr_domcnt; /* number of unique domains */ - uint64_t lr_fuidcnt; /* number of real fuids */ - uint64_t lr_acl_bytes; /* number of bytes in ACL */ - uint64_t lr_acl_flags; /* ACL flags */ -} lr_acl_create_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_doid; /* obj id of directory */ - /* name of object to remove follows this */ -} lr_remove_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_doid; /* obj id of directory */ - uint64_t lr_link_obj; /* obj id of link */ - /* name of object to link follows this */ -} lr_link_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_sdoid; /* obj id of source directory */ - uint64_t lr_tdoid; /* obj id of target directory */ - /* 2 strings: names of source and destination follow this */ -} lr_rename_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* file object to write */ - uint64_t lr_offset; /* offset to write to */ - uint64_t lr_length; /* user data length to write */ - uint64_t lr_blkoff; /* no longer used */ - blkptr_t lr_blkptr; /* spa block pointer for replay */ - /* write data will follow for small writes */ -} lr_write_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* object id of file to truncate */ - uint64_t lr_offset; /* offset to truncate from */ - uint64_t lr_length; /* length to truncate */ -} lr_truncate_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* file object to change attributes */ - uint64_t lr_mask; /* mask of attributes to set */ - uint64_t lr_mode; /* mode to set */ - uint64_t lr_uid; /* uid to set */ - uint64_t lr_gid; /* gid to set */ - uint64_t lr_size; /* size to set */ - uint64_t lr_atime[2]; /* access time */ - uint64_t lr_mtime[2]; /* modification time */ - /* optional attribute lr_attr_t may be here */ -} lr_setattr_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* obj id of file */ - uint64_t lr_aclcnt; /* number of acl entries */ - /* lr_aclcnt number of ace_t entries follow this */ -} lr_acl_v0_t; - -typedef struct { - lr_t lr_common; /* common portion of log record */ - uint64_t lr_foid; /* obj id of file */ - uint64_t lr_aclcnt; /* number of ACEs in ACL */ - uint64_t lr_domcnt; /* number of unique domains */ - uint64_t lr_fuidcnt; /* number of real fuids */ - uint64_t lr_acl_bytes; /* number of bytes in ACL */ - uint64_t lr_acl_flags; /* ACL flags */ - /* lr_acl_bytes number of variable sized ace's follows */ -} lr_acl_t; - -/* - * ZIL structure definitions, interface function prototype and globals. - */ - -/* - * Writes are handled in three different ways: - * - * WR_INDIRECT: - * In this mode, if we need to commit the write later, then the block - * is immediately written into the file system (using dmu_sync), - * and a pointer to the block is put into the log record. - * When the txg commits the block is linked in. - * This saves additionally writing the data into the log record. 
- * There are a few requirements for this to occur: - * - write is greater than zfs/zvol_immediate_write_sz - * - not using slogs (as slogs are assumed to always be faster - * than writing into the main pool) - * - the write occupies only one block - * WR_COPIED: - * If we know we'll immediately be committing the - * transaction (FSYNC or FDSYNC), the we allocate a larger - * log record here for the data and copy the data in. - * WR_NEED_COPY: - * Otherwise we don't allocate a buffer, and *if* we need to - * flush the write later then a buffer is allocated and - * we retrieve the data using the dmu. - */ -typedef enum { - WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ - /* and put blkptr in log, rather than actual data) */ - WR_COPIED, /* immediate - data is copied into lr_write_t */ - WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ - WR_NUM_STATES /* number of states */ -} itx_wr_state_t; - -typedef struct itx { - list_node_t itx_node; /* linkage on zl_itx_list */ - void *itx_private; /* type-specific opaque data */ - itx_wr_state_t itx_wr_state; /* write state */ - uint8_t itx_sync; /* synchronous transaction */ - uint64_t itx_oid; /* object id */ - lr_t itx_lr; /* common part of log record */ - /* followed by type-specific part of lr_xx_t and its immediate data */ -} itx_t; - -typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, - uint64_t txg); -typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, - uint64_t txg); -typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap); -typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, - struct lwb *lwb, zio_t *zio); - -extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); - -extern void zil_init(void); -extern void zil_fini(void); - -extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); -extern void zil_free(zilog_t *zilog); - -extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); -extern void zil_close(zilog_t *zilog); - -extern void zil_replay(objset_t *os, void *arg, - zil_replay_func_t *replay_func[TX_MAX_TYPE]); -extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); -extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); - -extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); -extern void zil_itx_destroy(itx_t *itx); -extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); - -extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); -extern void zil_commit(zilog_t *zilog, uint64_t oid); -extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); - -extern int zil_reset(const char *osname, void *txarg); -extern int zil_claim(struct dsl_pool *dp, - struct dsl_dataset *ds, void *txarg); -extern int zil_check_log_chain(struct dsl_pool *dp, - struct dsl_dataset *ds, void *tx); -extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); - -extern int zil_suspend(const char *osname, void **cookiep); -extern void zil_resume(void *cookie); - -extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp); -extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg); -extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); - -extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); - -extern void 
zil_set_logbias(zilog_t *zilog, uint64_t slogval); - -extern uint64_t zil_max_copied_data(zilog_t *zilog); -extern uint64_t zil_max_log_data(zilog_t *zilog); - -extern int zil_replay_disable; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h +++ /dev/null @@ -1,229 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#ifndef _SYS_ZIL_IMPL_H -#define _SYS_ZIL_IMPL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Possbile states for a given lwb structure. - * - * An lwb will start out in the "closed" state, and then transition to - * the "opened" state via a call to zil_lwb_write_open(). When - * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" - * must be held. - * - * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must - * be held when making this transition. - * - * After the lwb's write zio completes, it transitions into the "write - * done" state via zil_lwb_write_done(); and then into the "flush done" - * state via zil_lwb_flush_vdevs_done(). When transitioning from - * "issued" to "write done", and then from "write done" to "flush done", - * the zilog's "zl_lock" must be held, *not* the "zl_issuer_lock". - * - * The zilog's "zl_issuer_lock" can become heavily contended in certain - * workloads, so we specifically avoid acquiring that lock when - * transitioning an lwb from "issued" to "done". This allows us to avoid - * having to acquire the "zl_issuer_lock" for each lwb ZIO completion, - * which would have added more lock contention on an already heavily - * contended lock. - * - * Additionally, correctness when reading an lwb's state is often - * acheived by exploiting the fact that these state transitions occur in - * this specific order; i.e. "closed" to "opened" to "issued" to "done". - * - * Thus, if an lwb is in the "closed" or "opened" state, holding the - * "zl_issuer_lock" will prevent a concurrent thread from transitioning - * that lwb to the "issued" state. Likewise, if an lwb is already in the - * "issued" state, holding the "zl_lock" will prevent a concurrent - * thread from transitioning that lwb to the "write done" state. 
- */ -typedef enum { - LWB_STATE_CLOSED, - LWB_STATE_OPENED, - LWB_STATE_ISSUED, - LWB_STATE_WRITE_DONE, - LWB_STATE_FLUSH_DONE, - LWB_NUM_STATES -} lwb_state_t; - -/* - * Log write block (lwb) - * - * Prior to an lwb being issued to disk via zil_lwb_write_issue(), it - * will be protected by the zilog's "zl_issuer_lock". Basically, prior - * to it being issued, it will only be accessed by the thread that's - * holding the "zl_issuer_lock". After the lwb is issued, the zilog's - * "zl_lock" is used to protect the lwb against concurrent access. - */ -typedef struct lwb { - zilog_t *lwb_zilog; /* back pointer to log struct */ - blkptr_t lwb_blk; /* on disk address of this log blk */ - boolean_t lwb_slog; /* lwb_blk is on SLOG device */ - int lwb_nused; /* # used bytes in buffer */ - int lwb_sz; /* size of block and buffer */ - lwb_state_t lwb_state; /* the state of this lwb */ - char *lwb_buf; /* log write buffer */ - zio_t *lwb_write_zio; /* zio for the lwb buffer */ - zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ - dmu_tx_t *lwb_tx; /* tx for log block allocation */ - uint64_t lwb_max_txg; /* highest txg in this lwb */ - list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ - list_t lwb_waiters; /* list of zil_commit_waiter's */ - avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ - kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ - hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ -} lwb_t; - -/* - * ZIL commit waiter. - * - * This structure is allocated each time zil_commit() is called, and is - * used by zil_commit() to communicate with other parts of the ZIL, such - * that zil_commit() can know when it safe for it return. For more - * details, see the comment above zil_commit(). - * - * The "zcw_lock" field is used to protect the commit waiter against - * concurrent access. This lock is often acquired while already holding - * the zilog's "zl_issuer_lock" or "zl_lock"; see the functions - * zil_process_commit_list() and zil_lwb_flush_vdevs_done() as examples - * of this. Thus, one must be careful not to acquire the - * "zl_issuer_lock" or "zl_lock" when already holding the "zcw_lock"; - * e.g. see the zil_commit_waiter_timeout() function. - */ -typedef struct zil_commit_waiter { - kcondvar_t zcw_cv; /* signalled when "done" */ - kmutex_t zcw_lock; /* protects fields of this struct */ - list_node_t zcw_node; /* linkage in lwb_t:lwb_waiter list */ - lwb_t *zcw_lwb; /* back pointer to lwb when linked */ - boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */ - int zcw_zio_error; /* contains the zio io_error value */ -} zil_commit_waiter_t; - -/* - * Intent log transaction lists - */ -typedef struct itxs { - list_t i_sync_list; /* list of synchronous itxs */ - avl_tree_t i_async_tree; /* tree of foids for async itxs */ -} itxs_t; - -typedef struct itxg { - kmutex_t itxg_lock; /* lock for this structure */ - uint64_t itxg_txg; /* txg for this chain */ - itxs_t *itxg_itxs; /* sync and async itxs */ -} itxg_t; - -/* for async nodes we build up an AVL tree of lists of async itxs per file */ -typedef struct itx_async_node { - uint64_t ia_foid; /* file object id */ - list_t ia_list; /* list of async itxs for this foid */ - avl_node_t ia_node; /* AVL tree linkage */ -} itx_async_node_t; - -/* - * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs - * we've touched so we know which ones need a write cache flush at the end. 
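The vdev-flushing comment above boils down to remembering each vdev touched during a commit exactly once and flushing that set at the end. A hedged sketch in which a linear scan over a small array stands in for the AVL tree used by the real code:

#include <stdint.h>
#include <stdio.h>

#define	DEMO_MAX_VDEVS	16

typedef struct {
	uint64_t	vs_ids[DEMO_MAX_VDEVS];
	int		vs_n;
} demo_vdev_set_t;

static void
demo_vdev_note(demo_vdev_set_t *s, uint64_t vdev)
{
	for (int i = 0; i < s->vs_n; i++)
		if (s->vs_ids[i] == vdev)
			return;			/* already recorded */
	if (s->vs_n < DEMO_MAX_VDEVS)
		s->vs_ids[s->vs_n++] = vdev;
}

int
main(void)
{
	demo_vdev_set_t s = { .vs_n = 0 };
	uint64_t touched[] = { 0, 2, 2, 1, 0 };

	for (int i = 0; i < 5; i++)
		demo_vdev_note(&s, touched[i]);
	for (int i = 0; i < s.vs_n; i++)
		printf("flush vdev %llu\n", (unsigned long long)s.vs_ids[i]);
	return (0);
}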
- */ -typedef struct zil_vdev_node { - uint64_t zv_vdev; /* vdev to be flushed */ - avl_node_t zv_node; /* AVL tree linkage */ -} zil_vdev_node_t; - -#define ZIL_PREV_BLKS 16 - -/* - * Stable storage intent log management structure. One per dataset. - */ -struct zilog { - kmutex_t zl_lock; /* protects most zilog_t fields */ - struct dsl_pool *zl_dmu_pool; /* DSL pool */ - spa_t *zl_spa; /* handle for read/write log */ - const zil_header_t *zl_header; /* log header buffer */ - objset_t *zl_os; /* object set we're logging */ - zil_get_data_t *zl_get_data; /* callback to get object content */ - lwb_t *zl_last_lwb_opened; /* most recent lwb opened */ - hrtime_t zl_last_lwb_latency; /* zio latency of last lwb done */ - uint64_t zl_lr_seq; /* on-disk log record sequence number */ - uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ - uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ - uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ - uint64_t zl_replaying_seq; /* current replay seq number */ - uint32_t zl_suspend; /* log suspend count */ - kcondvar_t zl_cv_suspend; /* log suspend completion */ - uint8_t zl_suspending; /* log is currently suspending */ - uint8_t zl_keep_first; /* keep first log block in destroy */ - uint8_t zl_replay; /* replaying records while set */ - uint8_t zl_stop_sync; /* for debugging */ - kmutex_t zl_issuer_lock; /* single writer, per ZIL, at a time */ - uint8_t zl_logbias; /* latency or throughput */ - uint8_t zl_sync; /* synchronous or asynchronous */ - int zl_parse_error; /* last zil_parse() error */ - uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */ - uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ - uint64_t zl_parse_blk_count; /* number of blocks parsed */ - uint64_t zl_parse_lr_count; /* number of log records parsed */ - itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ - list_t zl_itx_commit_list; /* itx list to be committed */ - uint64_t zl_cur_used; /* current commit log size used */ - list_t zl_lwb_list; /* in-flight log write list */ - avl_tree_t zl_bp_tree; /* track bps during log parse */ - clock_t zl_replay_time; /* lbolt of when replay started */ - uint64_t zl_replay_blks; /* number of log blocks replayed */ - zil_header_t zl_old_header; /* debugging aid */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ - uint_t zl_prev_rotor; /* rotor for zl_prev[] */ - txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ - uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ - /* - * Max block size for this ZIL. Note that this can not be changed - * while the ZIL is in use because consumers (ZPL/zvol) need to take - * this into account when deciding between WR_COPIED and WR_NEED_COPY - * (see zil_max_copied_data()). - */ - uint64_t zl_max_block_size; -}; - -typedef struct zil_bp_node { - dva_t zn_dva; - avl_node_t zn_node; -} zil_bp_node_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIL_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ /dev/null @@ -1,675 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright 2016 Toomas Soome - */ - -#ifndef _ZIO_H -#define _ZIO_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Embedded checksum - */ -#define ZEC_MAGIC 0x210da7ab10c7a11ULL - -typedef struct zio_eck { - uint64_t zec_magic; /* for validation, endianness */ - zio_cksum_t zec_cksum; /* 256-bit checksum */ -} zio_eck_t; - -/* - * Gang block headers are self-checksumming and contain an array - * of block pointers. - */ -#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE -#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t)) / sizeof (blkptr_t)) -#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t) - \ - (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ - sizeof (uint64_t)) - -typedef struct zio_gbh { - blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; - uint64_t zg_filler[SPA_GBH_FILLER]; - zio_eck_t zg_tail; -} zio_gbh_phys_t; - -enum zio_checksum { - ZIO_CHECKSUM_INHERIT = 0, - ZIO_CHECKSUM_ON, - ZIO_CHECKSUM_OFF, - ZIO_CHECKSUM_LABEL, - ZIO_CHECKSUM_GANG_HEADER, - ZIO_CHECKSUM_ZILOG, - ZIO_CHECKSUM_FLETCHER_2, - ZIO_CHECKSUM_FLETCHER_4, - ZIO_CHECKSUM_SHA256, - ZIO_CHECKSUM_ZILOG2, - ZIO_CHECKSUM_NOPARITY, - ZIO_CHECKSUM_SHA512, - ZIO_CHECKSUM_SKEIN, -#ifdef illumos - ZIO_CHECKSUM_EDONR, -#endif - ZIO_CHECKSUM_FUNCTIONS -}; - -/* - * The number of "legacy" compression functions which can be set on individual - * objects. - */ -#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 - -#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 -#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON - -#define ZIO_CHECKSUM_MASK 0xffULL -#define ZIO_CHECKSUM_VERIFY (1 << 8) - -#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 -#define ZIO_DEDUPDITTO_MIN 100 - -/* - * The number of "legacy" compression functions which can be set on individual - * objects. - */ -#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 - -/* - * The meaning of "compress = on" selected by the compression features enabled - * on a given pool. 
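/*
 * Worked example (illustrative, not part of this change): assuming the
 * usual SPA_MINBLOCKSIZE of 512, a 128-byte blkptr_t and a 40-byte
 * zio_eck_t (8-byte magic plus a 256-bit checksum), the gang header
 * definitions above give SPA_GBH_NBLKPTRS == (512 - 40) / 128 == 3 and
 * SPA_GBH_FILLER == (512 - 40 - 3 * 128) / 8 == 11, so
 * sizeof (zio_gbh_phys_t) == 3 * 128 + 11 * 8 + 40 == 512, exactly one
 * SPA_GANGBLOCKSIZE block.
 */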
- */ -#define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB -#define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 - -#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF - -#define BOOTFS_COMPRESS_VALID(compress) \ - ((compress) == ZIO_COMPRESS_LZJB || \ - (compress) == ZIO_COMPRESS_LZ4 || \ - (compress) == ZIO_COMPRESS_ON || \ - (compress) == ZIO_COMPRESS_OFF) - -#define ZIO_FAILURE_MODE_WAIT 0 -#define ZIO_FAILURE_MODE_CONTINUE 1 -#define ZIO_FAILURE_MODE_PANIC 2 - -typedef enum zio_suspend_reason { - ZIO_SUSPEND_NONE = 0, - ZIO_SUSPEND_IOERR, - ZIO_SUSPEND_MMP, -} zio_suspend_reason_t; - -enum zio_flag { - /* - * Flags inherited by gang, ddt, and vdev children, - * and that must be equal for two zios to aggregate - */ - ZIO_FLAG_DONT_AGGREGATE = 1 << 0, - ZIO_FLAG_IO_REPAIR = 1 << 1, - ZIO_FLAG_SELF_HEAL = 1 << 2, - ZIO_FLAG_RESILVER = 1 << 3, - ZIO_FLAG_SCRUB = 1 << 4, - ZIO_FLAG_SCAN_THREAD = 1 << 5, - ZIO_FLAG_PHYSICAL = 1 << 6, - -#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) - - /* - * Flags inherited by ddt, gang, and vdev children. - */ - ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ - ZIO_FLAG_SPECULATIVE = 1 << 8, - ZIO_FLAG_CONFIG_WRITER = 1 << 9, - ZIO_FLAG_DONT_RETRY = 1 << 10, - ZIO_FLAG_DONT_CACHE = 1 << 11, - ZIO_FLAG_NODATA = 1 << 12, - ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, - ZIO_FLAG_IO_ALLOCATING = 1 << 14, - -#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) -#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) - - /* - * Flags inherited by vdev children. - */ - ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1 << 16, - ZIO_FLAG_TRYHARD = 1 << 17, - ZIO_FLAG_OPTIONAL = 1 << 18, - -#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) - - /* - * Flags not inherited by any children. - */ - ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1 << 20, - ZIO_FLAG_IO_BYPASS = 1 << 21, - ZIO_FLAG_IO_REWRITE = 1 << 22, - ZIO_FLAG_RAW = 1 << 23, - ZIO_FLAG_GANG_CHILD = 1 << 24, - ZIO_FLAG_DDT_CHILD = 1 << 25, - ZIO_FLAG_GODFATHER = 1 << 26, - ZIO_FLAG_NOPWRITE = 1 << 27, - ZIO_FLAG_REEXECUTED = 1 << 28, - ZIO_FLAG_DELEGATED = 1 << 29, -}; - -#define ZIO_FLAG_MUSTSUCCEED 0 - -#define ZIO_DDT_CHILD_FLAGS(zio) \ - (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ - ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) - -#define ZIO_GANG_CHILD_FLAGS(zio) \ - (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ - ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) - -#define ZIO_VDEV_CHILD_FLAGS(zio) \ - (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) - -#define ZIO_CHILD_BIT(x) (1 << (x)) -#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x))) - -enum zio_child { - ZIO_CHILD_VDEV = 0, - ZIO_CHILD_GANG, - ZIO_CHILD_DDT, - ZIO_CHILD_LOGICAL, - ZIO_CHILD_TYPES -}; - -#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV) -#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG) -#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) -#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) -#define ZIO_CHILD_ALL_BITS \ - (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ - ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) - -enum zio_wait_type { - ZIO_WAIT_READY = 0, - ZIO_WAIT_DONE, - ZIO_WAIT_TYPES -}; - -/* - * These are bespoke errnos used in ZFS. We map them to their closest FreeBSD - * equivalents. This gives us more useful error messages from strerror(3). 
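/*
 * Aside (illustrative, not part of this change): the zio flags are
 * consecutive powers of two, so an expression such as
 * (ZIO_FLAG_CANFAIL - 1) is a mask of every flag declared before
 * ZIO_FLAG_CANFAIL; that is why the first flag of each inheritance group
 * above is annotated "must be first for INHERIT".  ZIO_VDEV_CHILD_FLAGS()
 * then composes a child's flags as
 *
 *	(pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
 *	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL
 *
 * keeping only the inheritable low-order flags of the parent.
 */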
- */ -#define ECKSUM EINTEGRITY -#define EFRAGS ENOSPC - -typedef void zio_done_func_t(zio_t *zio); - -extern boolean_t zio_dva_throttle_enabled; -extern const char *zio_type_name[ZIO_TYPES]; - -/* - * A bookmark is a four-tuple that uniquely - * identifies any block in the pool. By convention, the meta-objset (MOS) - * is objset 0, and the meta-dnode is object 0. This covers all blocks - * except root blocks and ZIL blocks, which are defined as follows: - * - * Root blocks (objset_phys_t) are object 0, level -1: . - * ZIL blocks are bookmarked . - * dmu_sync()ed ZIL data blocks are bookmarked . - * dnode visit bookmarks are . - * - * Note: this structure is called a bookmark because its original purpose - * was to remember where to resume a pool-wide traverse. - * - * Note: this structure is passed between userland and the kernel, and is - * stored on disk (by virtue of being incorporated into other on-disk - * structures, e.g. dsl_scan_phys_t). - */ -typedef struct zbookmark_phys { - uint64_t zb_objset; - uint64_t zb_object; - int64_t zb_level; - uint64_t zb_blkid; -} zbookmark_phys_t; - -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - -#define ZB_DESTROYED_OBJSET (-1ULL) - -#define ZB_ROOT_OBJECT (0ULL) -#define ZB_ROOT_LEVEL (-1LL) -#define ZB_ROOT_BLKID (0ULL) - -#define ZB_ZIL_OBJECT (0ULL) -#define ZB_ZIL_LEVEL (-2LL) - -#define ZB_DNODE_LEVEL (-3LL) -#define ZB_DNODE_BLKID (0ULL) - -#define ZB_IS_ZERO(zb) \ - ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ - (zb)->zb_level == 0 && (zb)->zb_blkid == 0) -#define ZB_IS_ROOT(zb) \ - ((zb)->zb_object == ZB_ROOT_OBJECT && \ - (zb)->zb_level == ZB_ROOT_LEVEL && \ - (zb)->zb_blkid == ZB_ROOT_BLKID) - -typedef struct zio_prop { - enum zio_checksum zp_checksum; - enum zio_compress zp_compress; - dmu_object_type_t zp_type; - uint8_t zp_level; - uint8_t zp_copies; - boolean_t zp_dedup; - boolean_t zp_dedup_verify; - boolean_t zp_nopwrite; - uint32_t zp_zpl_smallblk; -} zio_prop_t; - -typedef struct zio_cksum_report zio_cksum_report_t; - -typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, - const void *good_data); -typedef void zio_cksum_free_f(void *cbdata, size_t size); - -struct zio_bad_cksum; /* defined in zio_checksum.h */ -struct dnode_phys; -struct abd; - -struct zio_cksum_report { - struct zio_cksum_report *zcr_next; - nvlist_t *zcr_ereport; - nvlist_t *zcr_detector; - void *zcr_cbdata; - size_t zcr_cbinfo; /* passed to zcr_free() */ - uint64_t zcr_align; - uint64_t zcr_length; - zio_cksum_finish_f *zcr_finish; - zio_cksum_free_f *zcr_free; - - /* internal use only */ - struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ -}; - -typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, - void *arg); - -zio_vsd_cksum_report_f zio_vsd_default_cksum_report; - -typedef struct zio_vsd_ops { - zio_done_func_t *vsd_free; - zio_vsd_cksum_report_f *vsd_cksum_report; -} zio_vsd_ops_t; - -typedef struct zio_gang_node { - zio_gbh_phys_t *gn_gbh; - struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; -} zio_gang_node_t; - -typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, - zio_gang_node_t *gn, struct abd *data, uint64_t offset); - -typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); - -typedef struct zio_transform { - struct abd *zt_orig_abd; - uint64_t zt_orig_size; - uint64_t zt_bufsize; - zio_transform_func_t *zt_transform; - struct 
zio_transform *zt_next; -} zio_transform_t; - -typedef zio_t *zio_pipe_stage_t(zio_t *zio); - -/* - * The io_reexecute flags are distinct from io_flags because the child must - * be able to propagate them to the parent. The normal io_flags are local - * to the zio, not protected by any lock, and not modifiable by children; - * the reexecute flags are protected by io_lock, modifiable by children, - * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. - */ -#define ZIO_REEXECUTE_NOW 0x01 -#define ZIO_REEXECUTE_SUSPEND 0x02 - -typedef struct zio_alloc_list { - list_t zal_list; - uint64_t zal_size; -} zio_alloc_list_t; - -typedef struct zio_link { - zio_t *zl_parent; - zio_t *zl_child; - list_node_t zl_parent_node; - list_node_t zl_child_node; -} zio_link_t; - -/* - * Used for TRIM kstat. - */ -typedef struct zio_trim_stats { - /* - * Number of bytes successfully TRIMmed. - */ - kstat_named_t bytes; - - /* - * Number of successful TRIM requests. - */ - kstat_named_t success; - - /* - * Number of TRIM requests that failed because TRIM is not - * supported. - */ - kstat_named_t unsupported; - - /* - * Number of TRIM requests that failed for other reasons. - */ - kstat_named_t failed; -} zio_trim_stats_t; - -extern zio_trim_stats_t zio_trim_stats; - -#define ZIO_TRIM_STAT_INCR(stat, val) \ - atomic_add_64(&zio_trim_stats.stat.value.ui64, (val)); -#define ZIO_TRIM_STAT_BUMP(stat) \ - ZIO_TRIM_STAT_INCR(stat, 1); - -struct zio { - /* Core information about this I/O */ - zbookmark_phys_t io_bookmark; - zio_prop_t io_prop; - zio_type_t io_type; - enum zio_child io_child_type; - int io_cmd; - zio_priority_t io_priority; - uint8_t io_reexecute; - uint8_t io_state[ZIO_WAIT_TYPES]; - uint64_t io_txg; - spa_t *io_spa; - blkptr_t *io_bp; - blkptr_t *io_bp_override; - blkptr_t io_bp_copy; - list_t io_parent_list; - list_t io_child_list; - zio_t *io_logical; - zio_transform_t *io_transform_stack; - - /* Callback info */ - zio_done_func_t *io_ready; - zio_done_func_t *io_children_ready; - zio_done_func_t *io_physdone; - zio_done_func_t *io_done; - void *io_private; - int64_t io_prev_space_delta; /* DMU private */ - blkptr_t io_bp_orig; - - /* Data represented by this I/O */ - struct abd *io_abd; - struct abd *io_orig_abd; - uint64_t io_size; - uint64_t io_orig_size; - /* io_lsize != io_orig_size iff this is a raw write */ - uint64_t io_lsize; - - /* Stuff for the vdev stack */ - vdev_t *io_vd; - void *io_vsd; - const zio_vsd_ops_t *io_vsd_ops; - metaslab_class_t *io_metaslab_class; /* dva throttle class */ - - uint64_t io_offset; - hrtime_t io_timestamp; - hrtime_t io_queued_timestamp; - hrtime_t io_target_timestamp; - avl_node_t io_queue_node; - avl_node_t io_offset_node; - avl_node_t io_alloc_node; - zio_alloc_list_t io_alloc_list; - -#ifdef __FreeBSD__ - struct bio *io_bio; -#ifdef _KERNEL - struct callout io_timer; -#endif -#endif - - /* Internal pipeline state */ - enum zio_flag io_flags; - enum zio_stage io_stage; - enum zio_stage io_pipeline; - enum zio_flag io_orig_flags; - enum zio_stage io_orig_stage; - enum zio_stage io_orig_pipeline; - enum zio_stage io_pipeline_trace; - int io_error; - int io_child_error[ZIO_CHILD_TYPES]; - uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; - uint64_t io_child_count; - uint64_t io_phys_children; - uint64_t io_parent_count; - uint64_t *io_stall; - zio_t *io_gang_leader; - zio_gang_node_t *io_gang_tree; - void *io_executor; - void *io_waiter; - kmutex_t io_lock; - kcondvar_t io_cv; - int io_allocator; - - /* FMA state */ - zio_cksum_report_t 
*io_cksum_report; - uint64_t io_ena; - - /* Taskq dispatching state */ - taskq_ent_t io_tqent; - - avl_node_t io_trim_node; - list_node_t io_trim_link; -}; - -extern int zio_bookmark_compare(const void *, const void *); - -extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, - zio_done_func_t *done, void *priv, enum zio_flag flags); - -extern zio_t *zio_root(spa_t *spa, - zio_done_func_t *done, void *priv, enum zio_flag flags); - -extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, - zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); - -extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, - zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *priv, zio_priority_t priority, enum zio_flag flags, - const zbookmark_phys_t *zb); - -extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, - zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); - -extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, - boolean_t nopwrite); - -extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); - -extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, - const blkptr_t *bp, - zio_done_func_t *done, void *priv, enum zio_flag flags); - -extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, - zio_priority_t priority, enum zio_flag flags); - -extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, struct abd *data, int checksum, - zio_done_func_t *done, void *priv, zio_priority_t priority, - enum zio_flag flags, boolean_t labels); - -extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, struct abd *data, int checksum, - zio_done_func_t *done, void *priv, zio_priority_t priority, - enum zio_flag flags, boolean_t labels); - -extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, - const blkptr_t *bp, uint64_t size, enum zio_flag flags); - -extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, - blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog); -extern void zio_flush(zio_t *zio, vdev_t *vd); -extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, - uint64_t size); -extern void zio_shrink(zio_t *zio, uint64_t size); - -extern int zio_wait(zio_t *zio); -extern void zio_nowait(zio_t *zio); -extern void zio_execute(zio_t *zio); -extern void zio_interrupt(zio_t *zio); -extern void zio_delay_init(zio_t *zio); -extern void zio_delay_interrupt(zio_t *zio); - -extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); -extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); -extern zio_t *zio_unique_parent(zio_t *cio); -extern void zio_add_child(zio_t *pio, zio_t *cio); - -extern void *zio_buf_alloc(size_t size); -extern void zio_buf_free(void *buf, size_t size); -extern void *zio_data_buf_alloc(size_t size); -extern void zio_data_buf_free(void *buf, size_t size); - -extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, - uint64_t bufsize, zio_transform_func_t *transform); -extern void zio_pop_transforms(zio_t *zio); - -extern void zio_resubmit_stage_async(void *); - -extern zio_t 
*zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, - uint64_t offset, struct abd *data, uint64_t size, int type, - zio_priority_t priority, enum zio_flag flags, - zio_done_func_t *done, void *priv); - -extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *priv); - -extern void zio_vdev_io_bypass(zio_t *zio); -extern void zio_vdev_io_reissue(zio_t *zio); -extern void zio_vdev_io_redone(zio_t *zio); - -extern void zio_change_priority(zio_t *pio, zio_priority_t priority); - -extern void zio_checksum_verified(zio_t *zio); -extern int zio_worst_error(int e1, int e2); - -extern enum zio_checksum zio_checksum_select(enum zio_checksum child, - enum zio_checksum parent); -extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, - enum zio_checksum child, enum zio_checksum parent); -extern enum zio_compress zio_compress_select(spa_t *spa, - enum zio_compress child, enum zio_compress parent); - -extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); -extern int zio_resume(spa_t *spa); -extern void zio_resume_wait(spa_t *spa); - -/* - * Initial setup and teardown. - */ -extern void zio_init(void); -extern void zio_fini(void); - -/* - * Fault injection - */ -struct zinject_record; -extern uint32_t zio_injection_enabled; -extern int zio_inject_fault(char *name, int flags, int *id, - struct zinject_record *record); -extern int zio_inject_list_next(int *id, char *name, size_t buflen, - struct zinject_record *record); -extern int zio_clear_fault(int id); -extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); -extern int zio_handle_fault_injection(zio_t *zio, int error); -extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); -extern int zio_handle_label_injection(zio_t *zio, int error); -extern void zio_handle_ignored_writes(zio_t *zio); -extern hrtime_t zio_handle_io_delay(zio_t *zio); - -/* - * Checksum ereport functions - */ -extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, - uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); -extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, - const void *good_data, const void *bad_data, boolean_t drop_if_identical); - -extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); -extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); - -/* If we have the good data in hand, this function can be used */ -extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t offset, uint64_t length, - const void *good_data, const void *bad_data, struct zio_bad_cksum *info); - -/* Called from spa_sync(), but primarily an injection handler */ -extern void spa_handle_ignored_writes(spa_t *spa); - -/* zbookmark_phys functions */ -boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, - const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); -int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, - uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZIO_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents 
of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - * Copyright Saso Kiselkov 2013, All rights reserved. - */ - -#ifndef _SYS_ZIO_CHECKSUM_H -#define _SYS_ZIO_CHECKSUM_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -struct abd; - -/* - * Signature for checksum functions. - */ -typedef void zio_checksum_t(struct abd *, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp); -typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); -typedef void zio_checksum_tmpl_free_t(void *ctx_template); - -typedef enum zio_checksum_flags { - /* Strong enough for metadata? */ - ZCHECKSUM_FLAG_METADATA = (1 << 1), - /* ZIO embedded checksum */ - ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), - /* Strong enough for dedup (without verification)? */ - ZCHECKSUM_FLAG_DEDUP = (1 << 3), - /* Uses salt value */ - ZCHECKSUM_FLAG_SALTED = (1 << 4), - /* Strong enough for nopwrite? */ - ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) -} zio_checksum_flags_t; - -/* - * Information about each checksum function. - */ -typedef struct zio_checksum_info { - /* checksum function for each byteorder */ - zio_checksum_t *ci_func[2]; - zio_checksum_tmpl_init_t *ci_tmpl_init; - zio_checksum_tmpl_free_t *ci_tmpl_free; - zio_checksum_flags_t ci_flags; - char *ci_name; /* descriptive name */ -} zio_checksum_info_t; - -typedef struct zio_bad_cksum { - zio_cksum_t zbc_expected; - zio_cksum_t zbc_actual; - const char *zbc_checksum_name; - uint8_t zbc_byteswapped; - uint8_t zbc_injected; - uint8_t zbc_has_cksum; /* expected/actual valid */ -} zio_bad_cksum_t; - -extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; - -/* - * Checksum routines. 
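/*
 * Illustrative sketch (hypothetical helper, not part of this change) of how
 * the checksum table above is consumed: salted algorithms get a context
 * template from ci_tmpl_init() before the byteorder-specific ci_func[] is
 * invoked.  Treating ci_func[0] as the native-byteorder entry is an
 * assumption here.
 */
static void
checksum_compute_sketch(enum zio_checksum c, const zio_cksum_salt_t *salt,
    struct abd *abd, uint64_t size, zio_cksum_t *zcp)
{
	zio_checksum_info_t *ci = &zio_checksum_table[c];
	void *tmpl = NULL;

	if ((ci->ci_flags & ZCHECKSUM_FLAG_SALTED) && ci->ci_tmpl_init != NULL)
		tmpl = ci->ci_tmpl_init(salt);
	ci->ci_func[0](abd, size, tmpl, zcp);	/* assumed native byteorder */
	if (tmpl != NULL)
		ci->ci_tmpl_free(tmpl);
}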
- */ -extern zio_checksum_t abd_checksum_SHA256; -extern zio_checksum_t abd_checksum_SHA512_native; -extern zio_checksum_t abd_checksum_SHA512_byteswap; - -/* Skein */ -extern zio_checksum_t abd_checksum_skein_native; -extern zio_checksum_t abd_checksum_skein_byteswap; -extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init; -extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free; - -#ifdef illumos -/* Edon-R */ -extern zio_checksum_t abd_checksum_edonr_native; -extern zio_checksum_t abd_checksum_edonr_byteswap; -extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; -extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; -#endif - -extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, - void *, uint64_t, uint64_t, zio_bad_cksum_t *); -extern void zio_checksum_compute(zio_t *, enum zio_checksum, - struct abd *, uint64_t); -extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, - struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); -extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); -extern enum zio_checksum spa_dedup_checksum(spa_t *spa); -extern void zio_checksum_templates_free(spa_t *spa); -extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIO_CHECKSUM_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2015, 2016 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZIO_COMPRESS_H -#define _SYS_ZIO_COMPRESS_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -enum zio_compress { - ZIO_COMPRESS_INHERIT = 0, - ZIO_COMPRESS_ON, - ZIO_COMPRESS_OFF, - ZIO_COMPRESS_LZJB, - ZIO_COMPRESS_EMPTY, - ZIO_COMPRESS_GZIP_1, - ZIO_COMPRESS_GZIP_2, - ZIO_COMPRESS_GZIP_3, - ZIO_COMPRESS_GZIP_4, - ZIO_COMPRESS_GZIP_5, - ZIO_COMPRESS_GZIP_6, - ZIO_COMPRESS_GZIP_7, - ZIO_COMPRESS_GZIP_8, - ZIO_COMPRESS_GZIP_9, - ZIO_COMPRESS_ZLE, - ZIO_COMPRESS_LZ4, - ZIO_COMPRESS_FUNCTIONS -}; - -/* Common signature for all zio compress functions. */ -typedef size_t zio_compress_func_t(void *src, void *dst, - size_t s_len, size_t d_len, int); -/* Common signature for all zio decompress functions. 
*/ -typedef int zio_decompress_func_t(void *src, void *dst, - size_t s_len, size_t d_len, int); -/* - * Common signature for all zio decompress functions using an ABD as input. - * This is helpful if you have both compressed ARC and scatter ABDs enabled, - * but is not a requirement for all compression algorithms. - */ -typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, - size_t s_len, size_t d_len, int); - -/* - * Information about each compression function. - */ -typedef struct zio_compress_info { - char *ci_name; - int ci_level; - zio_compress_func_t *ci_compress; - zio_decompress_func_t *ci_decompress; -} zio_compress_info_t; - -extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; - -/* - * Compression routines. - */ -extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern void lz4_init(void); -extern void lz4_fini(void); -extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); - -/* - * Compress and decompress data if necessary. - */ -extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len); -extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len); -extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, - size_t s_len, size_t d_len); - -/* - * Module lifetime management. - */ -extern void zio_compress_init(void); -extern void zio_compress_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZIO_COMPRESS_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h +++ /dev/null @@ -1,256 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - */ - -#ifndef _ZIO_IMPL_H -#define _ZIO_IMPL_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * XXX -- Describe ZFS I/O pipeline here. Fill in as needed. 
- * - * The ZFS I/O pipeline is comprised of various stages which are defined - * in the zio_stage enum below. The individual stages are used to construct - * these basic I/O operations: Read, Write, Free, Claim, and Ioctl. - * - * I/O operations: (XXX - provide detail for each of the operations) - * - * Read: - * Write: - * Free: - * Claim: - * Ioctl: - * - * Although the most common pipeline are used by the basic I/O operations - * above, there are some helper pipelines (one could consider them - * sub-pipelines) which are used internally by the ZIO module and are - * explained below: - * - * Interlock Pipeline: - * The interlock pipeline is the most basic pipeline and is used by all - * of the I/O operations. The interlock pipeline does not perform any I/O - * and is used to coordinate the dependencies between I/Os that are being - * issued (i.e. the parent/child relationship). - * - * Vdev child Pipeline: - * The vdev child pipeline is responsible for performing the physical I/O. - * It is in this pipeline where the I/O are queued and possibly cached. - * - * In addition to performing I/O, the pipeline is also responsible for - * data transformations. The transformations performed are based on the - * specific properties that user may have selected and modify the - * behavior of the pipeline. Examples of supported transformations are - * compression, dedup, and nop writes. Transformations will either modify - * the data or the pipeline. This list below further describes each of - * the supported transformations: - * - * Compression: - * ZFS supports three different flavors of compression -- gzip, lzjb, and - * zle. Compression occurs as part of the write pipeline and is performed - * in the ZIO_STAGE_WRITE_BP_INIT stage. - * - * Dedup: - * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and - * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing - * read pipeline if the dedup bit is set on the block pointer. - * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage - * and added to a write pipeline if a user has enabled dedup on that - * particular dataset. - * - * NOP Write: - * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage - * and is added to an existing write pipeline if a crypographically - * secure checksum (i.e. SHA256) is enabled and compression is turned on. - * The NOP write stage will compare the checksums of the current data - * on-disk (level-0 blocks only) and the data that is currently being written. - * If the checksum values are identical then the pipeline is converted to - * an interlock pipeline skipping block allocation and bypassing the - * physical I/O. The nop write feature can handle writes in either - * syncing or open context (i.e. zil writes) and as a result is mutually - * exclusive with dedup. 
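/*
 * Illustrative sketch (not part of this change) of the compression step
 * described above for ZIO_STAGE_WRITE_COMPRESS: compress into a scratch
 * buffer and keep the result only if it is actually smaller than the
 * logical size.  The helper is hypothetical and simplified (the real stage
 * applies a stricter savings threshold); it only uses zio_compress_data()
 * as declared earlier.
 */
static size_t
write_compress_sketch(enum zio_compress c, abd_t *src, void *dst, size_t lsize)
{
	size_t psize;

	if (c == ZIO_COMPRESS_OFF)
		return (lsize);			/* stored as-is */
	psize = zio_compress_data(c, src, dst, lsize);
	return (psize < lsize ? psize : lsize);	/* no gain: store uncompressed */
}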
- */ - -/* - * zio pipeline stage definitions - */ -enum zio_stage { - ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ - - ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ - ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */ - ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */ - ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ - ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ - - ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */ - - ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */ - - ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */ - - ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */ - - ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */ - - ZIO_STAGE_READY = 1 << 18, /* RWFCI */ - - ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RWF-I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RWF-I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RWF-I */ - - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */ - - ZIO_STAGE_DONE = 1 << 23 /* RWFCI */ -}; - -#define ZIO_INTERLOCK_STAGES \ - (ZIO_STAGE_READY | \ - ZIO_STAGE_DONE) - -#define ZIO_INTERLOCK_PIPELINE \ - ZIO_INTERLOCK_STAGES - -#define ZIO_VDEV_IO_STAGES \ - (ZIO_STAGE_VDEV_IO_START | \ - ZIO_STAGE_VDEV_IO_DONE | \ - ZIO_STAGE_VDEV_IO_ASSESS) - -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_VDEV_IO_STAGES | \ - ZIO_STAGE_DONE) - -#define ZIO_READ_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - ZIO_STAGE_CHECKSUM_VERIFY) - -#define ZIO_READ_PHYS_PIPELINE \ - ZIO_READ_COMMON_STAGES - -#define ZIO_READ_PIPELINE \ - (ZIO_READ_COMMON_STAGES | \ - ZIO_STAGE_READ_BP_INIT) - -#define ZIO_DDT_CHILD_READ_PIPELINE \ - ZIO_READ_COMMON_STAGES - -#define ZIO_DDT_READ_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_READ_BP_INIT | \ - ZIO_STAGE_DDT_READ_START | \ - ZIO_STAGE_DDT_READ_DONE) - -#define ZIO_WRITE_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - ZIO_STAGE_ISSUE_ASYNC | \ - ZIO_STAGE_CHECKSUM_GENERATE) - -#define ZIO_WRITE_PHYS_PIPELINE \ - ZIO_WRITE_COMMON_STAGES - -#define ZIO_REWRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - ZIO_STAGE_WRITE_COMPRESS | \ - ZIO_STAGE_WRITE_BP_INIT) - -#define ZIO_WRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - ZIO_STAGE_WRITE_BP_INIT | \ - ZIO_STAGE_WRITE_COMPRESS | \ - ZIO_STAGE_DVA_THROTTLE | \ - ZIO_STAGE_DVA_ALLOCATE) - -#define ZIO_DDT_CHILD_WRITE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - ZIO_STAGE_DVA_THROTTLE | \ - ZIO_STAGE_DVA_ALLOCATE) - -#define ZIO_DDT_WRITE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_WRITE_BP_INIT | \ - ZIO_STAGE_ISSUE_ASYNC | \ - ZIO_STAGE_WRITE_COMPRESS | \ - ZIO_STAGE_CHECKSUM_GENERATE | \ - ZIO_STAGE_DDT_WRITE) - -#define ZIO_GANG_STAGES \ - (ZIO_STAGE_GANG_ASSEMBLE | \ - ZIO_STAGE_GANG_ISSUE) - -#define ZIO_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_FREE_BP_INIT | \ - ZIO_STAGE_DVA_FREE) - -#define ZIO_FREE_PHYS_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES) - -#define ZIO_DDT_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_FREE_BP_INIT | \ - ZIO_STAGE_ISSUE_ASYNC | \ - ZIO_STAGE_DDT_FREE) - -#define ZIO_CLAIM_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_DVA_CLAIM) - -#define ZIO_IOCTL_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_VDEV_IO_START | \ - ZIO_STAGE_VDEV_IO_ASSESS) - -#define ZIO_BLOCKING_STAGES 
\ - (ZIO_STAGE_DVA_ALLOCATE | \ - ZIO_STAGE_DVA_CLAIM | \ - ZIO_STAGE_VDEV_IO_START) - -extern void zio_inject_init(void); -extern void zio_inject_fini(void); - -#ifdef __cplusplus -} -#endif - -#endif /* _ZIO_IMPL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - */ -#ifndef _ZIO_PRIORITY_H -#define _ZIO_PRIORITY_H - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum zio_priority { - ZIO_PRIORITY_SYNC_READ, - ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ - ZIO_PRIORITY_ASYNC_READ, /* prefetch */ - ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ - ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ - ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ - ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ - ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ - ZIO_PRIORITY_NUM_QUEUEABLE, - - ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ -} zio_priority_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _ZIO_PRIORITY_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. 
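/*
 * Aside (illustrative, not part of this change): each ZIO_*_PIPELINE macro
 * earlier is just a bitmask of stages, and a zio advances through the set
 * bits in ascending order (that ordering is an assumption of this sketch).
 * A hypothetical "advance to the next stage" helper looks like:
 */
static enum zio_stage
next_stage_sketch(enum zio_stage cur, enum zio_stage pipeline)
{
	uint32_t stage = (uint32_t)cur << 1;

	while (stage != 0 && (stage & (uint32_t)pipeline) == 0)
		stage <<= 1;
	return ((enum zio_stage)stage);	/* every pipeline ends in ZIO_STAGE_DONE */
}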
- */ - -#ifndef _SYS_ZRLOCK_H -#define _SYS_ZRLOCK_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct zrlock { - kmutex_t zr_mtx; - volatile int32_t zr_refcount; - kcondvar_t zr_cv; - uint16_t zr_pad; -#ifdef ZFS_DEBUG - kthread_t *zr_owner; - const char *zr_caller; -#endif -} zrlock_t; - -extern void zrl_init(zrlock_t *); -extern void zrl_destroy(zrlock_t *); -#define zrl_add(_z) zrl_add_impl((_z), __func__) -extern void zrl_add_impl(zrlock_t *, const char *); -extern void zrl_remove(zrlock_t *); -extern int zrl_tryenter(zrlock_t *); -extern void zrl_exit(zrlock_t *); -extern int zrl_is_zero(zrlock_t *); -extern int zrl_is_locked(zrlock_t *); -#ifdef ZFS_DEBUG -extern kthread_t *zrl_owner(zrlock_t *); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZRLOCK_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zthr.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2017, 2018 by Delphix. All rights reserved. - */ - -#ifndef _SYS_ZTHR_H -#define _SYS_ZTHR_H - -typedef struct zthr zthr_t; -typedef void (zthr_func_t)(void *, zthr_t *); -typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *); - -extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc, - zthr_func_t *func, void *arg); -extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc, - zthr_func_t *func, void *arg, hrtime_t nano_wait); -extern void zthr_destroy(zthr_t *t); - -extern void zthr_wakeup(zthr_t *t); -extern void zthr_cancel(zthr_t *t); -extern void zthr_resume(zthr_t *t); - -extern boolean_t zthr_iscancelled(zthr_t *t); - -#endif /* _SYS_ZTHR_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
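/*
 * Illustrative zthr usage sketch (hypothetical consumer, not part of this
 * change): per the prototypes above, the check function reports whether
 * there is work, and the worker polls zthr_iscancelled() so that
 * zthr_cancel() can stop it promptly.  All names below are invented.
 */
typedef struct my_state { boolean_t ms_work_ready; } my_state_t;

static boolean_t
my_zthr_check(void *arg, zthr_t *t)
{
	return (((my_state_t *)arg)->ms_work_ready);
}

static void
my_zthr_work(void *arg, zthr_t *t)
{
	my_state_t *ms = arg;

	while (!zthr_iscancelled(t) && ms->ms_work_ready) {
		/* ... perform one unit of work ... */
		ms->ms_work_ready = B_FALSE;
	}
}

/* Created with: zthr_t *t = zthr_create(my_zthr_check, my_zthr_work, &state); */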
- */ - -#ifndef _SYS_ZVOL_H -#define _SYS_ZVOL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ZVOL_OBJ 1ULL -#define ZVOL_ZAP_OBJ 2ULL - -#ifdef _KERNEL -extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); -extern int zvol_check_volblocksize(uint64_t volblocksize); -extern int zvol_get_stats(objset_t *os, nvlist_t *nv); -extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_set_volsize(const char *, uint64_t); - -#ifdef illumos -extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); -extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); -extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); -extern int zvol_strategy(buf_t *bp); -extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); -extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); -extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); -extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); -#endif /* illumos */ -extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, - int *rvalp); -extern int zvol_busy(void); -extern void zvol_init(void); -extern void zvol_fini(void); - -#ifdef illumos -extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize, - uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, - void **rl_hdl, void **bonus_hdl); -extern uint64_t zvol_get_volume_size(void *minor_hdl); -extern int zvol_get_volume_wce(void *minor_hdl); -extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, - ssize_t resid, boolean_t sync); -#endif /* illumos */ - -#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) -extern void zvol_create_minors(spa_t *spa, const char *name); -extern void zvol_remove_minors(spa_t *spa, const char *name); -extern void zvol_rename_minors(spa_t *spa, const char *oldname, - const char *newname); -#endif - -#endif /* _KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ZVOL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c +++ /dev/null @@ -1,634 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2012 Pawel Jakub Dawidek . - * All rights reserved. - */ - -#include -#include -#include -#include -#include - -/* - * Calculate the zio end, upgrading based on ashift which would be - * done by zio_vdev_io_start. - * - * This makes free range consolidation much more effective - * than it would otherwise be as well as ensuring that entire - * blocks are invalidated by writes. 
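/*
 * Worked example (illustrative, not part of this change): assuming
 * vdev_ashift == 12 (4 KiB sectors), the macro below computes
 * offset + P2ROUNDUP(size, 4096); a 1000-byte free at offset 8192 therefore
 * yields an end of 8192 + 4096 == 12288, covering the whole physical sector
 * so it can later merge with an adjacent free.
 */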
- */ -#define TRIM_ZIO_END(vd, offset, size) (offset + \ - P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) - -/* Maximal segment size for ATA TRIM. */ -#define TRIM_MAP_SIZE_FACTOR (512 << 16) - -#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR) - -#define TRIM_MAP_ADD(tm, ts) do { \ - list_insert_tail(&(tm)->tm_head, (ts)); \ - (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ -} while (0) - -#define TRIM_MAP_REM(tm, ts) do { \ - list_remove(&(tm)->tm_head, (ts)); \ - (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ -} while (0) - -typedef struct trim_map { - list_t tm_head; /* List of segments sorted by txg. */ - avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ - avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ - avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ - list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ - kmutex_t tm_lock; - uint64_t tm_pending; /* Count of pending TRIMs. */ -} trim_map_t; - -typedef struct trim_seg { - avl_node_t ts_node; /* AVL node. */ - list_node_t ts_next; /* List element. */ - uint64_t ts_start; /* Starting offset of this segment. */ - uint64_t ts_end; /* Ending offset (non-inclusive). */ - uint64_t ts_txg; /* Segment creation txg. */ - hrtime_t ts_time; /* Segment creation time. */ -} trim_seg_t; - -extern boolean_t zfs_trim_enabled; - -static u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXG */ -static u_int trim_timeout = 30; /* Keep deleted data up to 30s */ -static u_int trim_max_interval = 1; /* 1s delays between TRIMs */ -static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */ - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "ZFS TRIM"); - -SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay, - 0, "Delay TRIMs by up to this many TXGs"); -SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0, - "Delay TRIMs by up to this many seconds"); -SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN, - &trim_max_interval, 0, - "Maximum interval between TRIM queue processing (seconds)"); - -SYSCTL_DECL(_vfs_zfs_vdev); -SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN, - &trim_vdev_max_pending, 0, - "Maximum pending TRIM segments for a vdev"); - -static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); - -static int -trim_map_seg_compare(const void *x1, const void *x2) -{ - const trim_seg_t *s1 = x1; - const trim_seg_t *s2 = x2; - - if (s1->ts_start < s2->ts_start) { - if (s1->ts_end > s2->ts_start) - return (0); - return (-1); - } - if (s1->ts_start > s2->ts_start) { - if (s1->ts_start < s2->ts_end) - return (0); - return (1); - } - return (0); -} - -static int -trim_map_zio_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = x1; - const zio_t *z2 = x2; - - if (z1->io_offset < z2->io_offset) { - if (z1->io_offset + z1->io_size > z2->io_offset) - return (0); - return (-1); - } - if (z1->io_offset > z2->io_offset) { - if (z1->io_offset < z2->io_offset + z2->io_size) - return (0); - return (1); - } - return (0); -} - -void -trim_map_create(vdev_t *vd) -{ - trim_map_t *tm; - - ASSERT(zfs_trim_enabled && !vd->vdev_notrim && - vd->vdev_ops->vdev_op_leaf); - - tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); - mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&tm->tm_head, sizeof (trim_seg_t), - offsetof(trim_seg_t, ts_next)); - 
list_create(&tm->tm_pending_writes, sizeof (zio_t), - offsetof(zio_t, io_trim_link)); - avl_create(&tm->tm_queued_frees, trim_map_seg_compare, - sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); - avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, - sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); - avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, - sizeof (zio_t), offsetof(zio_t, io_trim_node)); - vd->vdev_trimmap = tm; -} - -void -trim_map_destroy(vdev_t *vd) -{ - trim_map_t *tm; - trim_seg_t *ts; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - if (!zfs_trim_enabled) - return; - - tm = vd->vdev_trimmap; - if (tm == NULL) - return; - - /* - * We may have been called before trim_map_vdev_commit_done() - * had a chance to run, so do it now to prune the remaining - * inflight frees. - */ - trim_map_vdev_commit_done(vd->vdev_spa, vd); - - mutex_enter(&tm->tm_lock); - while ((ts = list_head(&tm->tm_head)) != NULL) { - avl_remove(&tm->tm_queued_frees, ts); - TRIM_MAP_REM(tm, ts); - kmem_free(ts, sizeof (*ts)); - } - mutex_exit(&tm->tm_lock); - - avl_destroy(&tm->tm_queued_frees); - avl_destroy(&tm->tm_inflight_frees); - avl_destroy(&tm->tm_inflight_writes); - list_destroy(&tm->tm_pending_writes); - list_destroy(&tm->tm_head); - mutex_destroy(&tm->tm_lock); - kmem_free(tm, sizeof (*tm)); - vd->vdev_trimmap = NULL; -} - -static void -trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) -{ - avl_index_t where; - trim_seg_t tsearch, *ts_before, *ts_after, *ts; - boolean_t merge_before, merge_after; - hrtime_t time; - - ASSERT(MUTEX_HELD(&tm->tm_lock)); - VERIFY(start < end); - - time = gethrtime(); - tsearch.ts_start = start; - tsearch.ts_end = end; - - ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); - if (ts != NULL) { - if (start < ts->ts_start) - trim_map_segment_add(tm, start, ts->ts_start, txg); - if (end > ts->ts_end) - trim_map_segment_add(tm, ts->ts_end, end, txg); - return; - } - - ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); - ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); - - merge_before = (ts_before != NULL && ts_before->ts_end == start); - merge_after = (ts_after != NULL && ts_after->ts_start == end); - - if (merge_before && merge_after) { - avl_remove(&tm->tm_queued_frees, ts_before); - TRIM_MAP_REM(tm, ts_before); - TRIM_MAP_REM(tm, ts_after); - ts_after->ts_start = ts_before->ts_start; - ts_after->ts_txg = txg; - ts_after->ts_time = time; - TRIM_MAP_ADD(tm, ts_after); - kmem_free(ts_before, sizeof (*ts_before)); - } else if (merge_before) { - TRIM_MAP_REM(tm, ts_before); - ts_before->ts_end = end; - ts_before->ts_txg = txg; - ts_before->ts_time = time; - TRIM_MAP_ADD(tm, ts_before); - } else if (merge_after) { - TRIM_MAP_REM(tm, ts_after); - ts_after->ts_start = start; - ts_after->ts_txg = txg; - ts_after->ts_time = time; - TRIM_MAP_ADD(tm, ts_after); - } else { - ts = kmem_alloc(sizeof (*ts), KM_SLEEP); - ts->ts_start = start; - ts->ts_end = end; - ts->ts_txg = txg; - ts->ts_time = time; - avl_insert(&tm->tm_queued_frees, ts, where); - TRIM_MAP_ADD(tm, ts); - } -} - -static void -trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, - uint64_t end) -{ - trim_seg_t *nts; - boolean_t left_over, right_over; - - ASSERT(MUTEX_HELD(&tm->tm_lock)); - - left_over = (ts->ts_start < start); - right_over = (ts->ts_end > end); - - TRIM_MAP_REM(tm, ts); - if (left_over && right_over) { - nts = kmem_alloc(sizeof (*nts), KM_SLEEP); - nts->ts_start = end; - nts->ts_end = ts->ts_end; - 
nts->ts_txg = ts->ts_txg; - nts->ts_time = ts->ts_time; - ts->ts_end = start; - avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); - TRIM_MAP_ADD(tm, ts); - TRIM_MAP_ADD(tm, nts); - } else if (left_over) { - ts->ts_end = start; - TRIM_MAP_ADD(tm, ts); - } else if (right_over) { - ts->ts_start = end; - TRIM_MAP_ADD(tm, ts); - } else { - avl_remove(&tm->tm_queued_frees, ts); - kmem_free(ts, sizeof (*ts)); - } -} - -static void -trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) -{ - zio_t zsearch, *zs; - - ASSERT(MUTEX_HELD(&tm->tm_lock)); - - zsearch.io_offset = start; - zsearch.io_size = end - start; - - zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); - if (zs == NULL) { - trim_map_segment_add(tm, start, end, txg); - return; - } - if (start < zs->io_offset) - trim_map_free_locked(tm, start, zs->io_offset, txg); - if (zs->io_offset + zs->io_size < end) - trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); -} - -void -trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) -{ - trim_map_t *tm = vd->vdev_trimmap; - - if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) - return; - - mutex_enter(&tm->tm_lock); - trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); - mutex_exit(&tm->tm_lock); -} - -boolean_t -trim_map_write_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - trim_map_t *tm = vd->vdev_trimmap; - trim_seg_t tsearch, *ts; - boolean_t left_over, right_over; - uint64_t start, end; - - if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) - return (B_TRUE); - - start = zio->io_offset; - end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); - tsearch.ts_start = start; - tsearch.ts_end = end; - - mutex_enter(&tm->tm_lock); - - /* - * Checking for colliding in-flight frees. - */ - ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); - if (ts != NULL) { - list_insert_tail(&tm->tm_pending_writes, zio); - mutex_exit(&tm->tm_lock); - return (B_FALSE); - } - - /* - * Loop until all overlapping segments are removed. - */ - while ((ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL)) != NULL) { - trim_map_segment_remove(tm, ts, start, end); - } - - avl_add(&tm->tm_inflight_writes, zio); - - mutex_exit(&tm->tm_lock); - - return (B_TRUE); -} - -void -trim_map_write_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - trim_map_t *tm = vd->vdev_trimmap; - - /* - * Don't check for vdev_notrim, since the write could have - * started before vdev_notrim was set. - */ - if (!zfs_trim_enabled || tm == NULL) - return; - - mutex_enter(&tm->tm_lock); - /* - * Don't fail if the write isn't in the tree, since the write - * could have started after vdev_notrim was set. - */ - if (zio->io_trim_node.avl_child[0] || - zio->io_trim_node.avl_child[1] || - AVL_XPARENT(&zio->io_trim_node) || - tm->tm_inflight_writes.avl_root == &zio->io_trim_node) - avl_remove(&tm->tm_inflight_writes, zio); - mutex_exit(&tm->tm_lock); -} - -/* - * Return the oldest segment (the one with the lowest txg / time) or NULL if: - * 1. The list is empty - * 2. The first element's txg is greater than txgsafe - * 3. 
The first element's txg is not greater than the txg argument and the - * the first element's time is not greater than time argument - */ -static trim_seg_t * -trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time, - boolean_t force) -{ - trim_seg_t *ts; - - ASSERT(MUTEX_HELD(&tm->tm_lock)); - VERIFY(txgsafe >= txg); - - ts = list_head(&tm->tm_head); - if (ts != NULL && ts->ts_txg <= txgsafe && - (ts->ts_txg <= txg || ts->ts_time <= time || force)) - return (ts); - return (NULL); -} - -static void -trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) -{ - trim_map_t *tm = vd->vdev_trimmap; - trim_seg_t *ts; - uint64_t size, offset, txgtarget, txgsafe; - int64_t hard, soft; - hrtime_t timelimit; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - if (tm == NULL) - return; - - timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC; - if (vd->vdev_isl2cache) { - txgsafe = UINT64_MAX; - txgtarget = UINT64_MAX; - } else { - txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)); - if (txgsafe > trim_txg_delay) - txgtarget = txgsafe - trim_txg_delay; - else - txgtarget = 0; - } - - mutex_enter(&tm->tm_lock); - hard = 0; - if (tm->tm_pending > trim_vdev_max_pending) - hard = (tm->tm_pending - trim_vdev_max_pending) / 4; - soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64); - /* Loop until we have sent all outstanding free's */ - while (soft > 0 && - (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0)) - != NULL) { - TRIM_MAP_REM(tm, ts); - avl_remove(&tm->tm_queued_frees, ts); - avl_add(&tm->tm_inflight_frees, ts); - size = ts->ts_end - ts->ts_start; - offset = ts->ts_start; - /* - * We drop the lock while we call zio_nowait as the IO - * scheduler can result in a different IO being run e.g. - * a write which would result in a recursive lock. 
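/*
 * Worked example (illustrative, using the default tunables above): with
 * tm_pending == 12000 queued segments, trim_vdev_max_pending == 10000 and
 * trim_timeout == 30, the code above computes hard == (12000 - 10000) / 4
 * == 500 and soft == P2ROUNDUP(500 + 12000 / 30 + 1, 64) == 960, so this
 * pass issues TRIMs for up to 960 segments' worth of work and, while "hard"
 * remains positive, takes segments even before they reach their txg or time
 * target (still bounded by txgsafe).
 */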
- */ - mutex_exit(&tm->tm_lock); - - zio_nowait(zio_trim(zio, spa, vd, offset, size)); - - soft -= TRIM_MAP_SEGS(size); - hard -= TRIM_MAP_SEGS(size); - mutex_enter(&tm->tm_lock); - } - mutex_exit(&tm->tm_lock); -} - -static void -trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) -{ - trim_map_t *tm = vd->vdev_trimmap; - trim_seg_t *ts; - list_t pending_writes; - zio_t *zio; - uint64_t start, size; - void *cookie; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - if (tm == NULL) - return; - - mutex_enter(&tm->tm_lock); - if (!avl_is_empty(&tm->tm_inflight_frees)) { - cookie = NULL; - while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, - &cookie)) != NULL) { - kmem_free(ts, sizeof (*ts)); - } - } - list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, - io_trim_link)); - list_move_tail(&pending_writes, &tm->tm_pending_writes); - mutex_exit(&tm->tm_lock); - - while ((zio = list_remove_head(&pending_writes)) != NULL) { - zio_vdev_io_reissue(zio); - zio_execute(zio); - } - list_destroy(&pending_writes); -} - -static void -trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) -{ - int c; - - if (vd == NULL) - return; - - if (vd->vdev_ops->vdev_op_leaf) { - trim_map_vdev_commit(spa, zio, vd); - } else { - for (c = 0; c < vd->vdev_children; c++) - trim_map_commit(spa, zio, vd->vdev_child[c]); - } -} - -static void -trim_map_commit_done(spa_t *spa, vdev_t *vd) -{ - int c; - - if (vd == NULL) - return; - - if (vd->vdev_ops->vdev_op_leaf) { - trim_map_vdev_commit_done(spa, vd); - } else { - for (c = 0; c < vd->vdev_children; c++) - trim_map_commit_done(spa, vd->vdev_child[c]); - } -} - -static void -trim_thread(void *arg) -{ - spa_t *spa = arg; - zio_t *zio; - -#ifdef _KERNEL - (void) snprintf(curthread->td_name, sizeof(curthread->td_name), - "trim %s", spa_name(spa)); -#endif - - for (;;) { - mutex_enter(&spa->spa_trim_lock); - if (spa->spa_trim_thread == NULL) { - spa->spa_trim_thread = curthread; - cv_signal(&spa->spa_trim_cv); - mutex_exit(&spa->spa_trim_lock); - thread_exit(); - } - - (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock, - hz * trim_max_interval); - mutex_exit(&spa->spa_trim_lock); - - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - trim_map_commit(spa, zio, spa->spa_root_vdev); - (void) zio_wait(zio); - trim_map_commit_done(spa, spa->spa_root_vdev); - spa_config_exit(spa, SCL_STATE, FTAG); - } -} - -void -trim_thread_create(spa_t *spa) -{ - - if (!zfs_trim_enabled) - return; - - mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); - mutex_enter(&spa->spa_trim_lock); - spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, - TS_RUN, minclsyspri); - mutex_exit(&spa->spa_trim_lock); -} - -void -trim_thread_destroy(spa_t *spa) -{ - - if (!zfs_trim_enabled) - return; - if (spa->spa_trim_thread == NULL) - return; - - mutex_enter(&spa->spa_trim_lock); - /* Setting spa_trim_thread to NULL tells the thread to stop. */ - spa->spa_trim_thread = NULL; - cv_signal(&spa->spa_trim_cv); - /* The thread will set it back to != NULL on exit. 
*/ - while (spa->spa_trim_thread == NULL) - cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); - spa->spa_trim_thread = NULL; - mutex_exit(&spa->spa_trim_lock); - - cv_destroy(&spa->spa_trim_cv); - mutex_destroy(&spa->spa_trim_lock); -} - -void -trim_thread_wakeup(spa_t *spa) -{ - - if (!zfs_trim_enabled) - return; - if (spa->spa_trim_thread == NULL) - return; - - mutex_enter(&spa->spa_trim_lock); - cv_signal(&spa->spa_trim_cv); - mutex_exit(&spa->spa_trim_lock); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ /dev/null @@ -1,977 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright 2011 Martin Matuska - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ZFS Transaction Groups - * ---------------------- - * - * ZFS transaction groups are, as the name implies, groups of transactions - * that act on persistent state. ZFS asserts consistency at the granularity of - * these transaction groups. Each successive transaction group (txg) is - * assigned a 64-bit consecutive identifier. There are three active - * transaction group states: open, quiescing, or syncing. At any given time, - * there may be an active txg associated with each state; each active txg may - * either be processing, or blocked waiting to enter the next state. There may - * be up to three active txgs, and there is always a txg in the open state - * (though it may be blocked waiting to enter the quiescing state). In broad - * strokes, transactions -- operations that change in-memory structures -- are - * accepted into the txg in the open state, and are completed while the txg is - * in the open or quiescing states. The accumulated changes are written to - * disk in the syncing state. - * - * Open - * - * When a new txg becomes active, it first enters the open state. New - * transactions -- updates to in-memory structures -- are assigned to the - * currently open txg. There is always a txg in the open state so that ZFS can - * accept new changes (though the txg may refuse new changes if it has hit - * some limit). ZFS advances the open txg to the next state for a variety of - * reasons such as it hitting a time or size threshold, or the execution of an - * administrative action that must be completed in the syncing state. - * - * Quiescing - * - * After a txg exits the open state, it enters the quiescing state. 
The - * quiescing state is intended to provide a buffer between accepting new - * transactions in the open state and writing them out to stable storage in - * the syncing state. While quiescing, transactions can continue their - * operation without delaying either of the other states. Typically, a txg is - * in the quiescing state very briefly since the operations are bounded by - * software latencies rather than, say, slower I/O latencies. After all - * transactions complete, the txg is ready to enter the next state. - * - * Syncing - * - * In the syncing state, the in-memory state built up during the open and (to - * a lesser degree) the quiescing states is written to stable storage. The - * process of writing out modified data can, in turn modify more data. For - * example when we write new blocks, we need to allocate space for them; those - * allocations modify metadata (space maps)... which themselves must be - * written to stable storage. During the sync state, ZFS iterates, writing out - * data until it converges and all in-memory changes have been written out. - * The first such pass is the largest as it encompasses all the modified user - * data (as opposed to filesystem metadata). Subsequent passes typically have - * far less data to write as they consist exclusively of filesystem metadata. - * - * To ensure convergence, after a certain number of passes ZFS begins - * overwriting locations on stable storage that had been allocated earlier in - * the syncing state (and subsequently freed). ZFS usually allocates new - * blocks to optimize for large, continuous, writes. For the syncing state to - * converge however it must complete a pass where no new blocks are allocated - * since each allocation requires a modification of persistent metadata. - * Further, to hasten convergence, after a prescribed number of passes, ZFS - * also defers frees, and stops compressing. - * - * In addition to writing out user data, we must also execute synctasks during - * the syncing context. A synctask is the mechanism by which some - * administrative activities work such as creating and destroying snapshots or - * datasets. Note that when a synctask is initiated it enters the open txg, - * and ZFS then pushes that txg as quickly as possible to completion of the - * syncing state in order to reduce the latency of the administrative - * activity. To complete the syncing state, ZFS writes out a new uberblock, - * the root of the tree of blocks that comprise all state stored on the ZFS - * pool. Finally, if there is a quiesced txg waiting, we signal that it can - * now transition to the syncing state. - */ - -static void txg_sync_thread(void *arg); -static void txg_quiesce_thread(void *arg); - -int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS TXG"); -SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0, - "Maximum seconds worth of delta per txg"); - -/* - * Prepare the txg subsystem. 
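For illustration only (this sketch is not part of the patch): the lifecycle comment above describes how a modification moves through the open, quiescing and syncing states. A minimal, hedged sketch of a consumer driving that flow with the interfaces defined in the deleted txg.c follows; do_in_memory_update() is a hypothetical helper, and real callers such as the DMU add throttling and error handling omitted here.

/*
 * Illustrative sketch, assuming the usual ZFS kernel context; not part of
 * this change. do_in_memory_update() is hypothetical.
 */
static void
example_txg_consumer(dsl_pool_t *dp)
{
        txg_handle_t th;
        uint64_t txg;

        txg = txg_hold_open(dp, &th);   /* join the currently open txg */
        txg_rele_to_quiesce(&th);       /* drop tc_open_lock; quiescing may begin */

        do_in_memory_update(dp, txg);   /* hypothetical in-memory modification */

        txg_rele_to_sync(&th);          /* release our hold; quiescing can complete */
        txg_wait_synced(dp, txg);       /* block until the txg reaches stable storage */
}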
- */ -void -txg_init(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - int c; - bzero(tx, sizeof (tx_state_t)); - - tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); - - for (c = 0; c < max_ncpus; c++) { - int i; - - mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT, - NULL); - for (i = 0; i < TXG_SIZE; i++) { - cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, - NULL); - list_create(&tx->tx_cpu[c].tc_callbacks[i], - sizeof (dmu_tx_callback_t), - offsetof(dmu_tx_callback_t, dcb_node)); - } - } - - mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); - - cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); - cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); - - tx->tx_open_txg = txg; -} - -/* - * Close down the txg subsystem. - */ -void -txg_fini(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - int c; - - ASSERT0(tx->tx_threads); - - mutex_destroy(&tx->tx_sync_lock); - - cv_destroy(&tx->tx_sync_more_cv); - cv_destroy(&tx->tx_sync_done_cv); - cv_destroy(&tx->tx_quiesce_more_cv); - cv_destroy(&tx->tx_quiesce_done_cv); - cv_destroy(&tx->tx_exit_cv); - - for (c = 0; c < max_ncpus; c++) { - int i; - - mutex_destroy(&tx->tx_cpu[c].tc_open_lock); - mutex_destroy(&tx->tx_cpu[c].tc_lock); - for (i = 0; i < TXG_SIZE; i++) { - cv_destroy(&tx->tx_cpu[c].tc_cv[i]); - list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); - } - } - - if (tx->tx_commit_cb_taskq != NULL) - taskq_destroy(tx->tx_commit_cb_taskq); - - kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); - - bzero(tx, sizeof (tx_state_t)); -} - -/* - * Start syncing transaction groups. - */ -void -txg_sync_start(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - mutex_enter(&tx->tx_sync_lock); - - dprintf("pool %p\n", dp); - - ASSERT0(tx->tx_threads); - - tx->tx_threads = 2; - - tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, - dp, 0, spa_proc(dp->dp_spa), TS_RUN, minclsyspri); - - /* - * The sync thread can need a larger-than-default stack size on - * 32-bit x86. This is due in part to nested pools and - * scrub_visitbp() recursion. - */ - tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, - dp, 0, spa_proc(dp->dp_spa), TS_RUN, minclsyspri); - - mutex_exit(&tx->tx_sync_lock); -} - -static void -txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) -{ - CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); - mutex_enter(&tx->tx_sync_lock); -} - -static void -txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) -{ - ASSERT(*tpp != NULL); - *tpp = NULL; - tx->tx_threads--; - cv_broadcast(&tx->tx_exit_cv); - CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ - thread_exit(); -} - -static void -txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) -{ - CALLB_CPR_SAFE_BEGIN(cpr); - - if (time) - (void) cv_timedwait(cv, &tx->tx_sync_lock, time); - else - cv_wait(cv, &tx->tx_sync_lock); - - CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); -} - -/* - * Stop syncing transaction groups. - */ -void -txg_sync_stop(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - dprintf("pool %p\n", dp); - /* - * Finish off any work in progress. - */ - ASSERT3U(tx->tx_threads, ==, 2); - - /* - * We need to ensure that we've vacated the deferred space_maps. 
- */ - txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); - - /* - * Wake all sync threads and wait for them to die. - */ - mutex_enter(&tx->tx_sync_lock); - - ASSERT3U(tx->tx_threads, ==, 2); - - tx->tx_exiting = 1; - - cv_broadcast(&tx->tx_quiesce_more_cv); - cv_broadcast(&tx->tx_quiesce_done_cv); - cv_broadcast(&tx->tx_sync_more_cv); - - while (tx->tx_threads != 0) - cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); - - tx->tx_exiting = 0; - - mutex_exit(&tx->tx_sync_lock); -} - -uint64_t -txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) -{ - tx_state_t *tx = &dp->dp_tx; - tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; - uint64_t txg; - - mutex_enter(&tc->tc_open_lock); - txg = tx->tx_open_txg; - - mutex_enter(&tc->tc_lock); - tc->tc_count[txg & TXG_MASK]++; - mutex_exit(&tc->tc_lock); - - th->th_cpu = tc; - th->th_txg = txg; - - return (txg); -} - -void -txg_rele_to_quiesce(txg_handle_t *th) -{ - tx_cpu_t *tc = th->th_cpu; - - ASSERT(!MUTEX_HELD(&tc->tc_lock)); - mutex_exit(&tc->tc_open_lock); -} - -void -txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) -{ - tx_cpu_t *tc = th->th_cpu; - int g = th->th_txg & TXG_MASK; - - mutex_enter(&tc->tc_lock); - list_move_tail(&tc->tc_callbacks[g], tx_callbacks); - mutex_exit(&tc->tc_lock); -} - -void -txg_rele_to_sync(txg_handle_t *th) -{ - tx_cpu_t *tc = th->th_cpu; - int g = th->th_txg & TXG_MASK; - - mutex_enter(&tc->tc_lock); - ASSERT(tc->tc_count[g] != 0); - if (--tc->tc_count[g] == 0) - cv_broadcast(&tc->tc_cv[g]); - mutex_exit(&tc->tc_lock); - - th->th_cpu = NULL; /* defensive */ -} - -/* - * Blocks until all transactions in the group are committed. - * - * On return, the transaction group has reached a stable state in which it can - * then be passed off to the syncing context. - */ -static __noinline void -txg_quiesce(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - int g = txg & TXG_MASK; - int c; - - /* - * Grab all tc_open_locks so nobody else can get into this txg. - */ - for (c = 0; c < max_ncpus; c++) - mutex_enter(&tx->tx_cpu[c].tc_open_lock); - - ASSERT(txg == tx->tx_open_txg); - tx->tx_open_txg++; - tx->tx_open_time = gethrtime(); - - DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); - DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); - - /* - * Now that we've incremented tx_open_txg, we can let threads - * enter the next transaction group. - */ - for (c = 0; c < max_ncpus; c++) - mutex_exit(&tx->tx_cpu[c].tc_open_lock); - - /* - * Quiesce the transaction group by waiting for everyone to txg_exit(). - */ - for (c = 0; c < max_ncpus; c++) { - tx_cpu_t *tc = &tx->tx_cpu[c]; - mutex_enter(&tc->tc_lock); - while (tc->tc_count[g] != 0) - cv_wait(&tc->tc_cv[g], &tc->tc_lock); - mutex_exit(&tc->tc_lock); - } -} - -static void -txg_do_callbacks(void *arg) -{ - list_t *cb_list = arg; - - dmu_tx_do_callbacks(cb_list, 0); - - list_destroy(cb_list); - - kmem_free(cb_list, sizeof (list_t)); -} - -/* - * Dispatch the commit callbacks registered on this txg to worker threads. - * - * If no callbacks are registered for a given TXG, nothing happens. - * This function creates a taskq for the associated pool, if needed. - */ -static void -txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) -{ - int c; - tx_state_t *tx = &dp->dp_tx; - list_t *cb_list; - - for (c = 0; c < max_ncpus; c++) { - tx_cpu_t *tc = &tx->tx_cpu[c]; - /* - * No need to lock tx_cpu_t at this point, since this can - * only be called once a txg has been synced. 
- */ - - int g = txg & TXG_MASK; - - if (list_is_empty(&tc->tc_callbacks[g])) - continue; - - if (tx->tx_commit_cb_taskq == NULL) { - /* - * Commit callback taskq hasn't been created yet. - */ - tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", - max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, - TASKQ_PREPOPULATE); - } - - cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); - list_create(cb_list, sizeof (dmu_tx_callback_t), - offsetof(dmu_tx_callback_t, dcb_node)); - - list_move_tail(cb_list, &tc->tc_callbacks[g]); - - (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) - txg_do_callbacks, cb_list, TQ_SLEEP); - } -} - -static boolean_t -txg_is_syncing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_syncing_txg != 0); -} - -static boolean_t -txg_is_quiescing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_quiescing_txg != 0); -} - -static boolean_t -txg_has_quiesced_to_sync(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_quiesced_txg != 0); -} - -static void -txg_sync_thread(void *arg) -{ - dsl_pool_t *dp = arg; - spa_t *spa = dp->dp_spa; - tx_state_t *tx = &dp->dp_tx; - callb_cpr_t cpr; - uint64_t start, delta; - - txg_thread_enter(tx, &cpr); - - start = delta = 0; - for (;;) { - uint64_t timeout = zfs_txg_timeout * hz; - uint64_t timer; - uint64_t txg; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_pct / 100; - - /* - * We sync when we're scanning, there's someone waiting - * on us, or the quiesce thread has handed off a txg to - * us, or we have reached our timeout. - */ - timer = (delta >= timeout ? 0 : timeout - delta); - while (!dsl_scan_active(dp->dp_scan) && - !tx->tx_exiting && timer > 0 && - tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < dirty_min_bytes) { - dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", - tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); - txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); - delta = ddi_get_lbolt() - start; - timer = (delta > timeout ? 0 : timeout - delta); - } - - /* - * Wait until the quiesce thread hands off a txg to us, - * prompting it to do so if necessary. - */ - while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { - if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) - tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; - cv_broadcast(&tx->tx_quiesce_more_cv); - txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); - } - - if (tx->tx_exiting) - txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); - - /* - * Consume the quiesced txg which has been handed off to - * us. This may cause the quiescing thread to now be - * able to quiesce another txg, so we must signal it. 
- */ - ASSERT(tx->tx_quiesced_txg != 0); - txg = tx->tx_quiesced_txg; - tx->tx_quiesced_txg = 0; - tx->tx_syncing_txg = txg; - DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_quiesce_more_cv); - - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); - mutex_exit(&tx->tx_sync_lock); - - start = ddi_get_lbolt(); - spa_sync(spa, txg); - delta = ddi_get_lbolt() - start; - - mutex_enter(&tx->tx_sync_lock); - tx->tx_synced_txg = txg; - tx->tx_syncing_txg = 0; - DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_done_cv); - - /* - * Dispatch commit callbacks to worker threads. - */ - txg_dispatch_callbacks(dp, txg); - } -} - -static void -txg_quiesce_thread(void *arg) -{ - dsl_pool_t *dp = arg; - tx_state_t *tx = &dp->dp_tx; - callb_cpr_t cpr; - - txg_thread_enter(tx, &cpr); - - for (;;) { - uint64_t txg; - - /* - * We quiesce when there's someone waiting on us. - * However, we can only have one txg in "quiescing" or - * "quiesced, waiting to sync" state. So we wait until - * the "quiesced, waiting to sync" txg has been consumed - * by the sync thread. - */ - while (!tx->tx_exiting && - (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - txg_has_quiesced_to_sync(dp))) - txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); - - if (tx->tx_exiting) - txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); - - txg = tx->tx_open_txg; - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, - tx->tx_sync_txg_waiting); - tx->tx_quiescing_txg = txg; - - mutex_exit(&tx->tx_sync_lock); - txg_quiesce(dp, txg); - mutex_enter(&tx->tx_sync_lock); - - /* - * Hand this txg off to the sync thread. - */ - dprintf("quiesce done, handing off txg %llu\n", txg); - tx->tx_quiescing_txg = 0; - tx->tx_quiesced_txg = txg; - DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_more_cv); - cv_broadcast(&tx->tx_quiesce_done_cv); - } -} - -/* - * Delay this thread by delay nanoseconds if we are still in the open - * transaction group and there is already a waiting txg quiesing or quiesced. - * Abort the delay if this txg stalls or enters the quiesing state. 
- */ -void -txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) -{ - tx_state_t *tx = &dp->dp_tx; - hrtime_t start = gethrtime(); - - /* don't delay if this txg could transition to quiescing immediately */ - if (tx->tx_open_txg > txg || - tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) - return; - - mutex_enter(&tx->tx_sync_lock); - if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { - mutex_exit(&tx->tx_sync_lock); - return; - } - - while (gethrtime() - start < delay && - tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { - (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, - &tx->tx_sync_lock, delay, resolution, 0); - } - - mutex_exit(&tx->tx_sync_lock); -} - -static boolean_t -txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig) -{ - tx_state_t *tx = &dp->dp_tx; - - ASSERT(!dsl_pool_config_held(dp)); - - mutex_enter(&tx->tx_sync_lock); - ASSERT3U(tx->tx_threads, ==, 2); - if (txg == 0) - txg = tx->tx_open_txg + TXG_DEFER_SIZE; - if (tx->tx_sync_txg_waiting < txg) - tx->tx_sync_txg_waiting = txg; - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); - while (tx->tx_synced_txg < txg) { - dprintf("broadcasting sync more " - "tx_synced=%llu waiting=%llu dp=%p\n", - tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); - cv_broadcast(&tx->tx_sync_more_cv); - if (wait_sig) { - /* - * Condition wait here but stop if the thread receives a - * signal. The caller may call txg_wait_synced*() again - * to resume waiting for this txg. - */ -#ifdef __FreeBSD__ - /* - * FreeBSD returns EINTR or ERESTART if there is - * a pending signal, zero if the conditional variable - * is signaled. illumos returns zero in the former case - * and >0 in the latter. - */ - if (cv_wait_sig(&tx->tx_sync_done_cv, - &tx->tx_sync_lock) != 0) { -#else - if (cv_wait_sig(&tx->tx_sync_done_cv, - &tx->tx_sync_lock) == 0) { -#endif - - mutex_exit(&tx->tx_sync_lock); - return (B_TRUE); - } - } else { - cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); - } - } - mutex_exit(&tx->tx_sync_lock); - return (B_FALSE); -} - -void -txg_wait_synced(dsl_pool_t *dp, uint64_t txg) -{ - VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE)); -} - -/* - * Similar to a txg_wait_synced but it can be interrupted from a signal. - * Returns B_TRUE if the thread was signaled while waiting. - */ -boolean_t -txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg) -{ - return (txg_wait_synced_impl(dp, txg, B_TRUE)); -} - -void -txg_wait_open(dsl_pool_t *dp, uint64_t txg) -{ - tx_state_t *tx = &dp->dp_tx; - - ASSERT(!dsl_pool_config_held(dp)); - - mutex_enter(&tx->tx_sync_lock); - ASSERT3U(tx->tx_threads, ==, 2); - if (txg == 0) - txg = tx->tx_open_txg + 1; - if (tx->tx_quiesce_txg_waiting < txg) - tx->tx_quiesce_txg_waiting = txg; - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", - txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); - while (tx->tx_open_txg < txg) { - cv_broadcast(&tx->tx_quiesce_more_cv); - cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); - } - mutex_exit(&tx->tx_sync_lock); -} - -/* - * If there isn't a txg syncing or in the pipeline, push another txg through - * the pipeline by queiscing the open txg. 
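For illustration only, not part of the change: txg_wait_synced_sig() above returns B_TRUE when the waiting thread is signaled before the txg has synced, and the comment notes that the caller may simply call it again to resume waiting. A short hedged sketch of that pattern; caller_wants_to_abort() is a hypothetical policy hook.

/*
 * Illustrative sketch only: resuming an interruptible wait for a txg.
 * caller_wants_to_abort() is hypothetical.
 */
static void
example_interruptible_wait(dsl_pool_t *dp, uint64_t txg)
{
        while (txg_wait_synced_sig(dp, txg)) {
                /* Signaled before the txg synced. */
                if (caller_wants_to_abort())
                        return;
                /* Otherwise loop and resume waiting on the same txg. */
        }
        /* The txg has now been written to stable storage. */
}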
- */ -void -txg_kick(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - ASSERT(!dsl_pool_config_held(dp)); - - mutex_enter(&tx->tx_sync_lock); - if (!txg_is_syncing(dp) && - !txg_is_quiescing(dp) && - tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && - tx->tx_sync_txg_waiting <= tx->tx_synced_txg && - tx->tx_quiesced_txg <= tx->tx_synced_txg) { - tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; - cv_broadcast(&tx->tx_quiesce_more_cv); - } - mutex_exit(&tx->tx_sync_lock); -} - -boolean_t -txg_stalled(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); -} - -boolean_t -txg_sync_waiting(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - - return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || - tx->tx_quiesced_txg != 0); -} - -/* - * Verify that this txg is active (open, quiescing, syncing). Non-active - * txg's should not be manipulated. - */ -void -txg_verify(spa_t *spa, uint64_t txg) -{ - dsl_pool_t *dp = spa_get_dsl(spa); - if (txg <= TXG_INITIAL || txg == ZILTEST_TXG) - return; - ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); - ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg); - ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES); -} - -/* - * Per-txg object lists. - */ -void -txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset) -{ - int t; - - mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); - - tl->tl_offset = offset; - tl->tl_spa = spa; - - for (t = 0; t < TXG_SIZE; t++) - tl->tl_head[t] = NULL; -} - -void -txg_list_destroy(txg_list_t *tl) -{ - int t; - - for (t = 0; t < TXG_SIZE; t++) - ASSERT(txg_list_empty(tl, t)); - - mutex_destroy(&tl->tl_lock); -} - -boolean_t -txg_list_empty(txg_list_t *tl, uint64_t txg) -{ - txg_verify(tl->tl_spa, txg); - return (tl->tl_head[txg & TXG_MASK] == NULL); -} - -/* - * Returns true if all txg lists are empty. - * - * Warning: this is inherently racy (an item could be added immediately - * after this function returns). We don't bother with the lock because - * it wouldn't change the semantics. - */ -boolean_t -txg_all_lists_empty(txg_list_t *tl) -{ - for (int i = 0; i < TXG_SIZE; i++) { - if (!txg_list_empty(tl, i)) { - return (B_FALSE); - } - } - return (B_TRUE); -} - -/* - * Add an entry to the list (unless it's already on the list). - * Returns B_TRUE if it was actually added. - */ -boolean_t -txg_list_add(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - boolean_t add; - - txg_verify(tl->tl_spa, txg); - mutex_enter(&tl->tl_lock); - add = (tn->tn_member[t] == 0); - if (add) { - tn->tn_member[t] = 1; - tn->tn_next[t] = tl->tl_head[t]; - tl->tl_head[t] = tn; - } - mutex_exit(&tl->tl_lock); - - return (add); -} - -/* - * Add an entry to the end of the list, unless it's already on the list. - * (walks list to find end) - * Returns B_TRUE if it was actually added. - */ -boolean_t -txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - boolean_t add; - - txg_verify(tl->tl_spa, txg); - mutex_enter(&tl->tl_lock); - add = (tn->tn_member[t] == 0); - if (add) { - txg_node_t **tp; - - for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) - continue; - - tn->tn_member[t] = 1; - tn->tn_next[t] = NULL; - *tp = tn; - } - mutex_exit(&tl->tl_lock); - - return (add); -} - -/* - * Remove the head of the list and return it. 
- */ -void * -txg_list_remove(txg_list_t *tl, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn; - void *p = NULL; - - txg_verify(tl->tl_spa, txg); - mutex_enter(&tl->tl_lock); - if ((tn = tl->tl_head[t]) != NULL) { - ASSERT(tn->tn_member[t]); - ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); - p = (char *)tn - tl->tl_offset; - tl->tl_head[t] = tn->tn_next[t]; - tn->tn_next[t] = NULL; - tn->tn_member[t] = 0; - } - mutex_exit(&tl->tl_lock); - - return (p); -} - -/* - * Remove a specific item from the list and return it. - */ -void * -txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn, **tp; - - txg_verify(tl->tl_spa, txg); - mutex_enter(&tl->tl_lock); - - for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { - if ((char *)tn - tl->tl_offset == p) { - *tp = tn->tn_next[t]; - tn->tn_next[t] = NULL; - tn->tn_member[t] = 0; - mutex_exit(&tl->tl_lock); - return (p); - } - } - - mutex_exit(&tl->tl_lock); - - return (NULL); -} - -boolean_t -txg_list_member(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - - txg_verify(tl->tl_spa, txg); - return (tn->tn_member[t] != 0); -} - -/* - * Walk a txg list -- only safe if you know it's not changing. - */ -void * -txg_list_head(txg_list_t *tl, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = tl->tl_head[t]; - - txg_verify(tl->tl_spa, txg); - return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); -} - -void * -txg_list_next(txg_list_t *tl, void *p, uint64_t txg) -{ - int t = txg & TXG_MASK; - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - - txg_verify(tl->tl_spa, txg); - tn = tn->tn_next[t]; - - return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include - -int -uberblock_verify(uberblock_t *ub) -{ - if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) - byteswap_uint64_array(ub, sizeof (uberblock_t)); - - if (ub->ub_magic != UBERBLOCK_MAGIC) - return (SET_ERROR(EINVAL)); - - return (0); -} - -/* - * Update the uberblock and return TRUE if anything changed in this - * transaction group. 
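For illustration only (not part of the patch): the per-txg object lists above bucket dirty objects by txg & TXG_MASK. A hedged sketch of typical usage follows; my_obj_t, mo_txg_node and my_obj_sync() are hypothetical names, while the embedded txg_node_t and the offsetof() convention match real callers removed elsewhere in this change (for example vdev_ms_list / ms_txg_node in vdev.c).

/*
 * Illustrative sketch only. A structure embeds a txg_node_t, the list is
 * created with that member's offset, objects are queued under the txg that
 * dirtied them, and syncing context drains the bucket for that txg.
 */
typedef struct my_obj {
        txg_node_t      mo_txg_node;    /* linkage used by txg_list_*() */
        /* ... object state ... */
} my_obj_t;

static txg_list_t my_dirty_list;

static void
example_txg_list_usage(spa_t *spa, my_obj_t *obj, uint64_t txg)
{
        my_obj_t *dirty;

        txg_list_create(&my_dirty_list, spa,
            offsetof(my_obj_t, mo_txg_node));

        /* Queue the object under the txg in which it was modified. */
        (void) txg_list_add(&my_dirty_list, obj, txg);

        /* Later, in syncing context for that txg, drain the bucket. */
        while ((dirty = txg_list_remove(&my_dirty_list, txg)) != NULL)
                my_obj_sync(dirty, txg);        /* hypothetical sync routine */
}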
- */ -boolean_t -uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay) -{ - ASSERT(ub->ub_txg < txg); - - /* - * We explicitly do not set ub_version here, so that older versions - * continue to be written with the previous uberblock version. - */ - ub->ub_magic = UBERBLOCK_MAGIC; - ub->ub_txg = txg; - ub->ub_guid_sum = rvd->vdev_guid_sum; - ub->ub_timestamp = gethrestime_sec(); - ub->ub_software_version = SPA_VERSION; - ub->ub_mmp_magic = MMP_MAGIC; - if (spa_multihost(rvd->vdev_spa)) { - ub->ub_mmp_delay = mmp_delay; - ub->ub_mmp_config = MMP_SEQ_SET(0) | - MMP_INTERVAL_SET(zfs_multihost_interval) | - MMP_FAIL_INT_SET(zfs_multihost_fail_intervals); - } else { - ub->ub_mmp_delay = 0; - ub->ub_mmp_config = 0; - } - ub->ub_checkpoint_txg = 0; - - return (ub->ub_rootbp.blk_birth == txg); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include -#include - -static avl_tree_t unique_avl; -static kmutex_t unique_mtx; - -typedef struct unique { - avl_node_t un_link; - uint64_t un_value; -} unique_t; - -#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1) - -static int -unique_compare(const void *a, const void *b) -{ - const unique_t *una = (const unique_t *)a; - const unique_t *unb = (const unique_t *)b; - - return (AVL_CMP(una->un_value, unb->un_value)); -} - -void -unique_init(void) -{ - avl_create(&unique_avl, unique_compare, - sizeof (unique_t), offsetof(unique_t, un_link)); - mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL); -} - -void -unique_fini(void) -{ - avl_destroy(&unique_avl); - mutex_destroy(&unique_mtx); -} - -uint64_t -unique_create(void) -{ - uint64_t value = unique_insert(0); - unique_remove(value); - return (value); -} - -uint64_t -unique_insert(uint64_t value) -{ - avl_index_t idx; - unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP); - - un->un_value = value; - - mutex_enter(&unique_mtx); - while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK || - avl_find(&unique_avl, un, &idx)) { - mutex_exit(&unique_mtx); - (void) random_get_pseudo_bytes((void*)&un->un_value, - sizeof (un->un_value)); - un->un_value &= UNIQUE_MASK; - mutex_enter(&unique_mtx); - } - - avl_insert(&unique_avl, un, idx); - mutex_exit(&unique_mtx); - - return (un->un_value); -} - -void -unique_remove(uint64_t value) -{ - unique_t un_tofind; - unique_t *un; - - un_tofind.un_value = value; - mutex_enter(&unique_mtx); - un = avl_find(&unique_avl, &un_tofind, NULL); - if (un != NULL) { - avl_remove(&unique_avl, un); - kmem_free(un, sizeof (unique_t)); - } - mutex_exit(&unique_mtx); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ /dev/null @@ -1,4520 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - * Copyright 2013 Martin Matuska . All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Toomas Soome - * Copyright 2019 Joyent, Inc. - * Copyright (c) 2017, Intel Corporation. 
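For illustration only, not part of the change: unique.c above keeps a global AVL tree of in-use 64-bit values so the identifiers it hands out are non-zero, fit under UNIQUE_MASK and do not collide. A short hedged sketch of the API follows; it assumes unique_init() has already been called, and mirrors what the removed unique_create() does internally.

/*
 * Illustrative sketch only: reserving and releasing a unique 64-bit value
 * with the interfaces from the deleted unique.c.
 */
static uint64_t
example_reserve_unique(void)
{
        uint64_t id;

        /* Passing 0 asks for a fresh random value; a free non-zero hint is kept. */
        id = unique_insert(0);

        /* ... id is now unique among all registered values ... */

        unique_remove(id);      /* release it so it may be handed out again */
        return (id);
}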
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS VDEV"); - -/* - * Virtual device management. - */ - -/* - * The limit for ZFS to automatically increase a top-level vdev's ashift - * from logical ashift to physical ashift. - * - * Example: one or more 512B emulation child vdevs - * child->vdev_ashift = 9 (512 bytes) - * child->vdev_physical_ashift = 12 (4096 bytes) - * zfs_max_auto_ashift = 11 (2048 bytes) - * zfs_min_auto_ashift = 9 (512 bytes) - * - * On pool creation or the addition of a new top-level vdev, ZFS will - * increase the ashift of the top-level vdev to 2048 as limited by - * zfs_max_auto_ashift. - * - * Example: one or more 512B emulation child vdevs - * child->vdev_ashift = 9 (512 bytes) - * child->vdev_physical_ashift = 12 (4096 bytes) - * zfs_max_auto_ashift = 13 (8192 bytes) - * zfs_min_auto_ashift = 9 (512 bytes) - * - * On pool creation or the addition of a new top-level vdev, ZFS will - * increase the ashift of the top-level vdev to 4096 to match the - * max vdev_physical_ashift. - * - * Example: one or more 512B emulation child vdevs - * child->vdev_ashift = 9 (512 bytes) - * child->vdev_physical_ashift = 9 (512 bytes) - * zfs_max_auto_ashift = 13 (8192 bytes) - * zfs_min_auto_ashift = 12 (4096 bytes) - * - * On pool creation or the addition of a new top-level vdev, ZFS will - * increase the ashift of the top-level vdev to 4096 to match the - * zfs_min_auto_ashift. - */ -static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; -static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; - -static int -sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_max_auto_ashift; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) - return (EINVAL); - - zfs_max_auto_ashift = val; - - return (0); -} -SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), - sysctl_vfs_zfs_max_auto_ashift, "QU", - "Max ashift used when optimising for logical -> physical sectors size on " - "new top-level vdevs."); - -static int -sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) -{ - uint64_t val; - int err; - - val = zfs_min_auto_ashift; - err = sysctl_handle_64(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) - return (EINVAL); - - zfs_min_auto_ashift = val; - - return (0); -} -SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), - sysctl_vfs_zfs_min_auto_ashift, "QU", - "Min ashift used when creating new top-level vdevs."); - -static vdev_ops_t *vdev_ops_table[] = { - &vdev_root_ops, - &vdev_raidz_ops, - &vdev_mirror_ops, - &vdev_replacing_ops, - &vdev_spare_ops, -#ifdef _KERNEL - &vdev_geom_ops, -#else - &vdev_disk_ops, -#endif - &vdev_file_ops, - &vdev_missing_ops, - &vdev_hole_ops, - &vdev_indirect_ops, - NULL -}; - - -/* default target for number of metaslabs per top-level vdev */ -int zfs_vdev_default_ms_count = 200; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN, - &zfs_vdev_default_ms_count, 0, - "Target number of metaslabs per top-level 
vdev"); - -/* minimum number of metaslabs per top-level vdev */ -int zfs_vdev_min_ms_count = 16; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN, - &zfs_vdev_min_ms_count, 0, - "Minimum number of metaslabs per top-level vdev"); - -/* practical upper limit of total metaslabs per top-level vdev */ -int zfs_vdev_ms_count_limit = 1ULL << 17; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN, - &zfs_vdev_ms_count_limit, 0, - "Maximum number of metaslabs per top-level vdev"); - -/* lower limit for metaslab size (512M) */ -int zfs_vdev_default_ms_shift = 29; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN, - &zfs_vdev_default_ms_shift, 0, - "Default shift between vdev size and number of metaslabs"); - -/* upper limit for metaslab size (16G) */ -int zfs_vdev_max_ms_shift = 34; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN, - &zfs_vdev_max_ms_shift, 0, - "Maximum shift between vdev size and number of metaslabs"); - -boolean_t vdev_validate_skip = B_FALSE; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, validate_skip, CTLFLAG_RWTUN, - &vdev_validate_skip, 0, - "Bypass vdev validation"); - -/* - * Since the DTL space map of a vdev is not expected to have a lot of - * entries, we default its block size to 4K. - */ -int vdev_dtl_sm_blksz = (1 << 12); -SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, - &vdev_dtl_sm_blksz, 0, - "Block size for DTL space map. Power of 2 and greater than 4096."); - -/* - * vdev-wide space maps that have lots of entries written to them at - * the end of each transaction can benefit from a higher I/O bandwidth - * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. - */ -int vdev_standard_sm_blksz = (1 << 17); -SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, - &vdev_standard_sm_blksz, 0, - "Block size for standard space map. Power of 2 and greater than 4096."); - -/* - * Tunable parameter for debugging or performance analysis. Setting this - * will cause pool corruption on power loss if a volatile out-of-order - * write cache is enabled. - */ -boolean_t zfs_nocacheflush = B_FALSE; -SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RWTUN, - &zfs_nocacheflush, 0, "Disable cache flush"); - -/*PRINTFLIKE2*/ -void -vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) 
-{ - va_list adx; - char buf[256]; - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - if (vd->vdev_path != NULL) { - zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, - vd->vdev_path, buf); - } else { - zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", - vd->vdev_ops->vdev_op_type, - (u_longlong_t)vd->vdev_id, - (u_longlong_t)vd->vdev_guid, buf); - } -} - -void -vdev_dbgmsg_print_tree(vdev_t *vd, int indent) -{ - char state[20]; - - if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { - zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, - vd->vdev_ops->vdev_op_type); - return; - } - - switch (vd->vdev_state) { - case VDEV_STATE_UNKNOWN: - (void) snprintf(state, sizeof (state), "unknown"); - break; - case VDEV_STATE_CLOSED: - (void) snprintf(state, sizeof (state), "closed"); - break; - case VDEV_STATE_OFFLINE: - (void) snprintf(state, sizeof (state), "offline"); - break; - case VDEV_STATE_REMOVED: - (void) snprintf(state, sizeof (state), "removed"); - break; - case VDEV_STATE_CANT_OPEN: - (void) snprintf(state, sizeof (state), "can't open"); - break; - case VDEV_STATE_FAULTED: - (void) snprintf(state, sizeof (state), "faulted"); - break; - case VDEV_STATE_DEGRADED: - (void) snprintf(state, sizeof (state), "degraded"); - break; - case VDEV_STATE_HEALTHY: - (void) snprintf(state, sizeof (state), "healthy"); - break; - default: - (void) snprintf(state, sizeof (state), "", - (uint_t)vd->vdev_state); - } - - zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, - "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, - vd->vdev_islog ? " (log)" : "", - (u_longlong_t)vd->vdev_guid, - vd->vdev_path ? vd->vdev_path : "N/A", state); - - for (uint64_t i = 0; i < vd->vdev_children; i++) - vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); -} - -/* - * Given a vdev type, return the appropriate ops vector. - */ -static vdev_ops_t * -vdev_getops(const char *type) -{ - vdev_ops_t *ops, **opspp; - - for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) - if (strcmp(ops->vdev_op_type, type) == 0) - break; - - return (ops); -} - -/* - * Derive the enumerated alloction bias from string input. - * String origin is either the per-vdev zap or zpool(1M). - */ -static vdev_alloc_bias_t -vdev_derive_alloc_bias(const char *bias) -{ - vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; - - if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) - alloc_bias = VDEV_BIAS_LOG; - else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) - alloc_bias = VDEV_BIAS_SPECIAL; - else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) - alloc_bias = VDEV_BIAS_DEDUP; - - return (alloc_bias); -} - -/* ARGSUSED */ -void -vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) -{ - res->rs_start = in->rs_start; - res->rs_end = in->rs_end; -} - -/* - * Default asize function: return the MAX of psize with the asize of - * all children. This is what's used by anything other than RAID-Z. - */ -uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) -{ - uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); - uint64_t csize; - - for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); - asize = MAX(asize, csize); - } - - return (asize); -} - -/* - * Get the minimum allocatable size. We define the allocatable size as - * the vdev's asize rounded to the nearest metaslab. This allows us to - * replace or attach devices which don't have the same physical size but - * can still satisfy the same number of allocations. 
- */ -uint64_t -vdev_get_min_asize(vdev_t *vd) -{ - vdev_t *pvd = vd->vdev_parent; - - /* - * If our parent is NULL (inactive spare or cache) or is the root, - * just return our own asize. - */ - if (pvd == NULL) - return (vd->vdev_asize); - - /* - * The top-level vdev just returns the allocatable size rounded - * to the nearest metaslab. - */ - if (vd == vd->vdev_top) - return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); - - /* - * The allocatable space for a raidz vdev is N * sizeof(smallest child), - * so each child must provide at least 1/Nth of its asize. - */ - if (pvd->vdev_ops == &vdev_raidz_ops) - return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / - pvd->vdev_children); - - return (pvd->vdev_min_asize); -} - -void -vdev_set_min_asize(vdev_t *vd) -{ - vd->vdev_min_asize = vdev_get_min_asize(vd); - - for (int c = 0; c < vd->vdev_children; c++) - vdev_set_min_asize(vd->vdev_child[c]); -} - -vdev_t * -vdev_lookup_top(spa_t *spa, uint64_t vdev) -{ - vdev_t *rvd = spa->spa_root_vdev; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - if (vdev < rvd->vdev_children) { - ASSERT(rvd->vdev_child[vdev] != NULL); - return (rvd->vdev_child[vdev]); - } - - return (NULL); -} - -vdev_t * -vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) -{ - vdev_t *mvd; - - if (vd->vdev_guid == guid) - return (vd); - - for (int c = 0; c < vd->vdev_children; c++) - if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != - NULL) - return (mvd); - - return (NULL); -} - -static int -vdev_count_leaves_impl(vdev_t *vd) -{ - int n = 0; - - if (vd->vdev_ops->vdev_op_leaf) - return (1); - - for (int c = 0; c < vd->vdev_children; c++) - n += vdev_count_leaves_impl(vd->vdev_child[c]); - - return (n); -} - -int -vdev_count_leaves(spa_t *spa) -{ - return (vdev_count_leaves_impl(spa->spa_root_vdev)); -} - -void -vdev_add_child(vdev_t *pvd, vdev_t *cvd) -{ - size_t oldsize, newsize; - uint64_t id = cvd->vdev_id; - vdev_t **newchild; - spa_t *spa = cvd->vdev_spa; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - ASSERT(cvd->vdev_parent == NULL); - - cvd->vdev_parent = pvd; - - if (pvd == NULL) - return; - - ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); - - oldsize = pvd->vdev_children * sizeof (vdev_t *); - pvd->vdev_children = MAX(pvd->vdev_children, id + 1); - newsize = pvd->vdev_children * sizeof (vdev_t *); - - newchild = kmem_zalloc(newsize, KM_SLEEP); - if (pvd->vdev_child != NULL) { - bcopy(pvd->vdev_child, newchild, oldsize); - kmem_free(pvd->vdev_child, oldsize); - } - - pvd->vdev_child = newchild; - pvd->vdev_child[id] = cvd; - - cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); - ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); - - /* - * Walk up all ancestors to update guid sum. 
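For illustration only, not part of the change: for a raidz child, vdev_get_min_asize() above returns the parent's minimum asize divided by the number of children, rounded up. A worked example under assumed numbers:

/*
 * Illustrative arithmetic only. For a raidz top-level vdev with
 * vdev_min_asize = 10 GiB and vdev_children = 3:
 *
 *      (pvd->vdev_min_asize + pvd->vdev_children - 1) / pvd->vdev_children
 *      = (10 GiB + 2) / 3
 *      ~= 3.34 GiB per child (one third of the parent, rounded up),
 *
 * so a replacement or attached child smaller than that cannot satisfy the
 * same number of allocations.
 */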
- */ - for (; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum += cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) { - list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); - cvd->vdev_spa->spa_leaf_list_gen++; - } -} - -void -vdev_remove_child(vdev_t *pvd, vdev_t *cvd) -{ - int c; - uint_t id = cvd->vdev_id; - - ASSERT(cvd->vdev_parent == pvd); - - if (pvd == NULL) - return; - - ASSERT(id < pvd->vdev_children); - ASSERT(pvd->vdev_child[id] == cvd); - - pvd->vdev_child[id] = NULL; - cvd->vdev_parent = NULL; - - for (c = 0; c < pvd->vdev_children; c++) - if (pvd->vdev_child[c]) - break; - - if (c == pvd->vdev_children) { - kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); - pvd->vdev_child = NULL; - pvd->vdev_children = 0; - } - - if (cvd->vdev_ops->vdev_op_leaf) { - spa_t *spa = cvd->vdev_spa; - list_remove(&spa->spa_leaf_list, cvd); - spa->spa_leaf_list_gen++; - } - - /* - * Walk up all ancestors to update guid sum. - */ - for (; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum -= cvd->vdev_guid_sum; -} - -/* - * Remove any holes in the child array. - */ -void -vdev_compact_children(vdev_t *pvd) -{ - vdev_t **newchild, *cvd; - int oldc = pvd->vdev_children; - int newc; - - ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - if (oldc == 0) - return; - - for (int c = newc = 0; c < oldc; c++) - if (pvd->vdev_child[c]) - newc++; - - if (newc > 0) { - newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); - - for (int c = newc = 0; c < oldc; c++) { - if ((cvd = pvd->vdev_child[c]) != NULL) { - newchild[newc] = cvd; - cvd->vdev_id = newc++; - } - } - } else { - newchild = NULL; - } - - kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); - pvd->vdev_child = newchild; - pvd->vdev_children = newc; -} - -/* - * Allocate and minimally initialize a vdev_t. - */ -vdev_t * -vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) -{ - vdev_t *vd; - vdev_indirect_config_t *vic; - - vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); - vic = &vd->vdev_indirect_config; - - if (spa->spa_root_vdev == NULL) { - ASSERT(ops == &vdev_root_ops); - spa->spa_root_vdev = vd; - spa->spa_load_guid = spa_generate_guid(NULL); - } - - if (guid == 0 && ops != &vdev_hole_ops) { - if (spa->spa_root_vdev == vd) { - /* - * The root vdev's guid will also be the pool guid, - * which must be unique among all pools. - */ - guid = spa_generate_guid(NULL); - } else { - /* - * Any other vdev's guid must be unique within the pool. 
- */ - guid = spa_generate_guid(spa); - } - ASSERT(!spa_guid_exists(spa_guid(spa), guid)); - } - - vd->vdev_spa = spa; - vd->vdev_id = id; - vd->vdev_guid = guid; - vd->vdev_guid_sum = guid; - vd->vdev_ops = ops; - vd->vdev_state = VDEV_STATE_CLOSED; - vd->vdev_ishole = (ops == &vdev_hole_ops); - vic->vic_prev_indirect_vdev = UINT64_MAX; - - rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); - mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); - - list_link_init(&vd->vdev_leaf_node); - mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); - cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); - - for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = range_tree_create(NULL, NULL); - } - txg_list_create(&vd->vdev_ms_list, spa, - offsetof(struct metaslab, ms_txg_node)); - txg_list_create(&vd->vdev_dtl_list, spa, - offsetof(struct vdev, vdev_dtl_node)); - vd->vdev_stat.vs_timestamp = gethrtime(); - vdev_queue_init(vd); - vdev_cache_init(vd); - - return (vd); -} - -/* - * Allocate a new vdev. The 'alloctype' is used to control whether we are - * creating a new vdev or loading an existing one - the behavior is slightly - * different for each case. - */ -int -vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - int alloctype) -{ - vdev_ops_t *ops; - char *type; - uint64_t guid = 0, islog, nparity; - vdev_t *vd; - vdev_indirect_config_t *vic; - vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; - boolean_t top_level = (parent && !parent->vdev_parent); - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) - return (SET_ERROR(EINVAL)); - - if ((ops = vdev_getops(type)) == NULL) - return (SET_ERROR(EINVAL)); - - /* - * If this is a load, get the vdev guid from the nvlist. - * Otherwise, vdev_alloc_common() will generate one for us. - */ - if (alloctype == VDEV_ALLOC_LOAD) { - uint64_t label_id; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || - label_id != id) - return (SET_ERROR(EINVAL)); - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_SPARE) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_L2CACHE) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (SET_ERROR(EINVAL)); - } - - /* - * The first allocated vdev must be of type 'root'. - */ - if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) - return (SET_ERROR(EINVAL)); - - /* - * Determine whether we're a log vdev. 
- */ - islog = 0; - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); - if (islog && spa_version(spa) < SPA_VERSION_SLOGS) - return (SET_ERROR(ENOTSUP)); - - if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) - return (SET_ERROR(ENOTSUP)); - - /* - * Set the nparity property for RAID-Z vdevs. - */ - nparity = -1ULL; - if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &nparity) == 0) { - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (SET_ERROR(EINVAL)); - /* - * Previous versions could only support 1 or 2 parity - * device. - */ - if (nparity > 1 && - spa_version(spa) < SPA_VERSION_RAIDZ2) - return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) - return (SET_ERROR(ENOTSUP)); - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. - */ - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (SET_ERROR(EINVAL)); - /* - * Otherwise, we default to 1 parity device for RAID-Z. - */ - nparity = 1; - } - } else { - nparity = 0; - } - ASSERT(nparity != -1ULL); - - /* - * If creating a top-level vdev, check for allocation classes input - */ - if (top_level && alloctype == VDEV_ALLOC_ADD) { - char *bias; - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, - &bias) == 0) { - alloc_bias = vdev_derive_alloc_bias(bias); - - /* spa_vdev_add() expects feature to be enabled */ - if (alloc_bias != VDEV_BIAS_LOG && - spa->spa_load_state != SPA_LOAD_CREATE && - !spa_feature_is_enabled(spa, - SPA_FEATURE_ALLOCATION_CLASSES)) { - return (SET_ERROR(ENOTSUP)); - } - } - } - - vd = vdev_alloc_common(spa, id, guid, ops); - vic = &vd->vdev_indirect_config; - - vd->vdev_islog = islog; - vd->vdev_nparity = nparity; - if (top_level && alloc_bias != VDEV_BIAS_NONE) - vd->vdev_alloc_bias = alloc_bias; - - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) - vd->vdev_path = spa_strdup(vd->vdev_path); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) - vd->vdev_devid = spa_strdup(vd->vdev_devid); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, - &vd->vdev_physpath) == 0) - vd->vdev_physpath = spa_strdup(vd->vdev_physpath); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) - vd->vdev_fru = spa_strdup(vd->vdev_fru); - - /* - * Set the whole_disk property. If it's not specified, leave the value - * as -1. - */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &vd->vdev_wholedisk) != 0) - vd->vdev_wholedisk = -1ULL; - - ASSERT0(vic->vic_mapping_object); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, - &vic->vic_mapping_object); - ASSERT0(vic->vic_births_object); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, - &vic->vic_births_object); - ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, - &vic->vic_prev_indirect_vdev); - - /* - * Look for the 'not present' flag. This will only be set if the device - * was not present at the time of import. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); - - /* - * Get the alignment requirement. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); - - /* - * Retrieve the vdev creation time. - */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, - &vd->vdev_crtxg); - - /* - * If we're a top-level vdev, try to load the allocation parameters. 
- */ - if (top_level && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, - &vd->vdev_ms_array); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, - &vd->vdev_ms_shift); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, - &vd->vdev_asize); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, - &vd->vdev_removing); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, - &vd->vdev_top_zap); - } else { - ASSERT0(vd->vdev_top_zap); - } - - if (top_level && alloctype != VDEV_ALLOC_ATTACH) { - ASSERT(alloctype == VDEV_ALLOC_LOAD || - alloctype == VDEV_ALLOC_ADD || - alloctype == VDEV_ALLOC_SPLIT || - alloctype == VDEV_ALLOC_ROOTPOOL); - /* Note: metaslab_group_create() is now deferred */ - } - - if (vd->vdev_ops->vdev_op_leaf && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { - (void) nvlist_lookup_uint64(nv, - ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); - } else { - ASSERT0(vd->vdev_leaf_zap); - } - - /* - * If we're a leaf vdev, try to load the DTL object and other state. - */ - - if (vd->vdev_ops->vdev_op_leaf && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || - alloctype == VDEV_ALLOC_ROOTPOOL)) { - if (alloctype == VDEV_ALLOC_LOAD) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl_object); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, - &vd->vdev_unspare); - } - - if (alloctype == VDEV_ALLOC_ROOTPOOL) { - uint64_t spare = 0; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, - &spare) == 0 && spare) - spa_spare_add(vd); - } - - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, - &vd->vdev_offline); - - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, - &vd->vdev_resilver_txg); - - /* - * When importing a pool, we want to ignore the persistent fault - * state, as the diagnosis made on another system may not be - * valid in the current context. Local vdevs will - * remain in the faulted state. - */ - if (spa_load_state(spa) == SPA_LOAD_OPEN) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, - &vd->vdev_faulted); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, - &vd->vdev_degraded); - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, - &vd->vdev_removed); - - if (vd->vdev_faulted || vd->vdev_degraded) { - char *aux; - - vd->vdev_label_aux = - VDEV_AUX_ERR_EXCEEDED; - if (nvlist_lookup_string(nv, - ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && - strcmp(aux, "external") == 0) - vd->vdev_label_aux = VDEV_AUX_EXTERNAL; - } - } - } - - /* - * Add ourselves to the parent's list of children. - */ - vdev_add_child(parent, vd); - - *vdp = vd; - - return (0); -} - -void -vdev_free(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - - /* - * Scan queues are normally destroyed at the end of a scan. If the - * queue exists here, that implies the vdev is being removed while - * the scan is still running. - */ - if (vd->vdev_scan_io_queue != NULL) { - mutex_enter(&vd->vdev_scan_io_queue_lock); - dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); - vd->vdev_scan_io_queue = NULL; - mutex_exit(&vd->vdev_scan_io_queue_lock); - } - - /* - * vdev_free() implies closing the vdev first. This is simpler than - * trying to ensure complicated semantics for all callers. - */ - vdev_close(vd); - - ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); - ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); - - /* - * Free all children. 
- */ - for (int c = 0; c < vd->vdev_children; c++) - vdev_free(vd->vdev_child[c]); - - ASSERT(vd->vdev_child == NULL); - ASSERT(vd->vdev_guid_sum == vd->vdev_guid); - ASSERT(vd->vdev_initialize_thread == NULL); - - /* - * Discard allocation state. - */ - if (vd->vdev_mg != NULL) { - vdev_metaslab_fini(vd); - metaslab_group_destroy(vd->vdev_mg); - } - - ASSERT0(vd->vdev_stat.vs_space); - ASSERT0(vd->vdev_stat.vs_dspace); - ASSERT0(vd->vdev_stat.vs_alloc); - - /* - * Remove this vdev from its parent's child list. - */ - vdev_remove_child(vd->vdev_parent, vd); - - ASSERT(vd->vdev_parent == NULL); - ASSERT(!list_link_active(&vd->vdev_leaf_node)); - - /* - * Clean up vdev structure. - */ - vdev_queue_fini(vd); - vdev_cache_fini(vd); - - if (vd->vdev_path) - spa_strfree(vd->vdev_path); - if (vd->vdev_devid) - spa_strfree(vd->vdev_devid); - if (vd->vdev_physpath) - spa_strfree(vd->vdev_physpath); - if (vd->vdev_fru) - spa_strfree(vd->vdev_fru); - - if (vd->vdev_isspare) - spa_spare_remove(vd); - if (vd->vdev_isl2cache) - spa_l2cache_remove(vd); - - txg_list_destroy(&vd->vdev_ms_list); - txg_list_destroy(&vd->vdev_dtl_list); - - mutex_enter(&vd->vdev_dtl_lock); - space_map_close(vd->vdev_dtl_sm); - for (int t = 0; t < DTL_TYPES; t++) { - range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); - range_tree_destroy(vd->vdev_dtl[t]); - } - mutex_exit(&vd->vdev_dtl_lock); - - EQUIV(vd->vdev_indirect_births != NULL, - vd->vdev_indirect_mapping != NULL); - if (vd->vdev_indirect_births != NULL) { - vdev_indirect_mapping_close(vd->vdev_indirect_mapping); - vdev_indirect_births_close(vd->vdev_indirect_births); - } - - if (vd->vdev_obsolete_sm != NULL) { - ASSERT(vd->vdev_removing || - vd->vdev_ops == &vdev_indirect_ops); - space_map_close(vd->vdev_obsolete_sm); - vd->vdev_obsolete_sm = NULL; - } - range_tree_destroy(vd->vdev_obsolete_segments); - rw_destroy(&vd->vdev_indirect_rwlock); - mutex_destroy(&vd->vdev_obsolete_lock); - - mutex_destroy(&vd->vdev_dtl_lock); - mutex_destroy(&vd->vdev_stat_lock); - mutex_destroy(&vd->vdev_probe_lock); - mutex_destroy(&vd->vdev_scan_io_queue_lock); - mutex_destroy(&vd->vdev_initialize_lock); - mutex_destroy(&vd->vdev_initialize_io_lock); - cv_destroy(&vd->vdev_initialize_io_cv); - cv_destroy(&vd->vdev_initialize_cv); - - if (vd == spa->spa_root_vdev) - spa->spa_root_vdev = NULL; - - kmem_free(vd, sizeof (vdev_t)); -} - -/* - * Transfer top-level vdev state from svd to tvd. 
- */ -static void -vdev_top_transfer(vdev_t *svd, vdev_t *tvd) -{ - spa_t *spa = svd->vdev_spa; - metaslab_t *msp; - vdev_t *vd; - int t; - - ASSERT(tvd == tvd->vdev_top); - - tvd->vdev_ms_array = svd->vdev_ms_array; - tvd->vdev_ms_shift = svd->vdev_ms_shift; - tvd->vdev_ms_count = svd->vdev_ms_count; - tvd->vdev_top_zap = svd->vdev_top_zap; - - svd->vdev_ms_array = 0; - svd->vdev_ms_shift = 0; - svd->vdev_ms_count = 0; - svd->vdev_top_zap = 0; - - if (tvd->vdev_mg) - ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); - tvd->vdev_mg = svd->vdev_mg; - tvd->vdev_ms = svd->vdev_ms; - - svd->vdev_mg = NULL; - svd->vdev_ms = NULL; - - if (tvd->vdev_mg != NULL) - tvd->vdev_mg->mg_vd = tvd; - - tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; - svd->vdev_checkpoint_sm = NULL; - - tvd->vdev_alloc_bias = svd->vdev_alloc_bias; - svd->vdev_alloc_bias = VDEV_BIAS_NONE; - - tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; - tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; - tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; - - svd->vdev_stat.vs_alloc = 0; - svd->vdev_stat.vs_space = 0; - svd->vdev_stat.vs_dspace = 0; - - /* - * State which may be set on a top-level vdev that's in the - * process of being removed. - */ - ASSERT0(tvd->vdev_indirect_config.vic_births_object); - ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); - ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); - ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); - ASSERT3P(tvd->vdev_indirect_births, ==, NULL); - ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); - ASSERT0(tvd->vdev_removing); - tvd->vdev_removing = svd->vdev_removing; - tvd->vdev_indirect_config = svd->vdev_indirect_config; - tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; - tvd->vdev_indirect_births = svd->vdev_indirect_births; - range_tree_swap(&svd->vdev_obsolete_segments, - &tvd->vdev_obsolete_segments); - tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; - svd->vdev_indirect_config.vic_mapping_object = 0; - svd->vdev_indirect_config.vic_births_object = 0; - svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; - svd->vdev_indirect_mapping = NULL; - svd->vdev_indirect_births = NULL; - svd->vdev_obsolete_sm = NULL; - svd->vdev_removing = 0; - - for (t = 0; t < TXG_SIZE; t++) { - while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) - (void) txg_list_add(&tvd->vdev_ms_list, msp, t); - while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) - (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); - if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) - (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); - } - - if (list_link_active(&svd->vdev_config_dirty_node)) { - vdev_config_clean(svd); - vdev_config_dirty(tvd); - } - - if (list_link_active(&svd->vdev_state_dirty_node)) { - vdev_state_clean(svd); - vdev_state_dirty(tvd); - } - - tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; - svd->vdev_deflate_ratio = 0; - - tvd->vdev_islog = svd->vdev_islog; - svd->vdev_islog = 0; - - dsl_scan_io_queue_vdev_xfer(svd, tvd); -} - -static void -vdev_top_update(vdev_t *tvd, vdev_t *vd) -{ - if (vd == NULL) - return; - - vd->vdev_top = tvd; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_top_update(tvd, vd->vdev_child[c]); -} - -/* - * Add a mirror/replacing vdev above an existing vdev. 
- */ -vdev_t * -vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) -{ - spa_t *spa = cvd->vdev_spa; - vdev_t *pvd = cvd->vdev_parent; - vdev_t *mvd; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); - - mvd->vdev_asize = cvd->vdev_asize; - mvd->vdev_min_asize = cvd->vdev_min_asize; - mvd->vdev_max_asize = cvd->vdev_max_asize; - mvd->vdev_psize = cvd->vdev_psize; - mvd->vdev_ashift = cvd->vdev_ashift; - mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; - mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; - mvd->vdev_state = cvd->vdev_state; - mvd->vdev_crtxg = cvd->vdev_crtxg; - - vdev_remove_child(pvd, cvd); - vdev_add_child(pvd, mvd); - cvd->vdev_id = mvd->vdev_children; - vdev_add_child(mvd, cvd); - vdev_top_update(cvd->vdev_top, cvd->vdev_top); - - if (mvd == mvd->vdev_top) - vdev_top_transfer(cvd, mvd); - - return (mvd); -} - -/* - * Remove a 1-way mirror/replacing vdev from the tree. - */ -void -vdev_remove_parent(vdev_t *cvd) -{ - vdev_t *mvd = cvd->vdev_parent; - vdev_t *pvd = mvd->vdev_parent; - - ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - ASSERT(mvd->vdev_children == 1); - ASSERT(mvd->vdev_ops == &vdev_mirror_ops || - mvd->vdev_ops == &vdev_replacing_ops || - mvd->vdev_ops == &vdev_spare_ops); - cvd->vdev_ashift = mvd->vdev_ashift; - cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; - cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; - - vdev_remove_child(mvd, cvd); - vdev_remove_child(pvd, mvd); - - /* - * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. - * Otherwise, we could have detached an offline device, and when we - * go to import the pool we'll think we have two top-level vdevs, - * instead of a different version of the same top-level vdev. - */ - if (mvd->vdev_top == mvd) { - uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; - cvd->vdev_orig_guid = cvd->vdev_guid; - cvd->vdev_guid += guid_delta; - cvd->vdev_guid_sum += guid_delta; - } - cvd->vdev_id = mvd->vdev_id; - vdev_add_child(pvd, cvd); - vdev_top_update(cvd->vdev_top, cvd->vdev_top); - - if (cvd == cvd->vdev_top) - vdev_top_transfer(mvd, cvd); - - ASSERT(mvd->vdev_children == 0); - vdev_free(mvd); -} - -static void -vdev_metaslab_group_create(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - /* - * metaslab_group_create was delayed until allocation bias was available - */ - if (vd->vdev_mg == NULL) { - metaslab_class_t *mc; - - if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) - vd->vdev_alloc_bias = VDEV_BIAS_LOG; - - ASSERT3U(vd->vdev_islog, ==, - (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); - - switch (vd->vdev_alloc_bias) { - case VDEV_BIAS_LOG: - mc = spa_log_class(spa); - break; - case VDEV_BIAS_SPECIAL: - mc = spa_special_class(spa); - break; - case VDEV_BIAS_DEDUP: - mc = spa_dedup_class(spa); - break; - default: - mc = spa_normal_class(spa); - } - - vd->vdev_mg = metaslab_group_create(mc, vd, - spa->spa_alloc_count); - - /* - * The spa ashift values currently only reflect the - * general vdev classes. 
Class destination is late - * binding so ashift checking had to wait until now - */ - if (vd->vdev_top == vd && vd->vdev_ashift != 0 && - mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { - if (vd->vdev_ashift > spa->spa_max_ashift) - spa->spa_max_ashift = vd->vdev_ashift; - if (vd->vdev_ashift < spa->spa_min_ashift) - spa->spa_min_ashift = vd->vdev_ashift; - } - } -} - -int -vdev_metaslab_init(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - uint64_t m; - uint64_t oldc = vd->vdev_ms_count; - uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; - metaslab_t **mspp; - int error; - boolean_t expanding = (oldc != 0); - - ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - - /* - * This vdev is not being allocated from yet or is a hole. - */ - if (vd->vdev_ms_shift == 0) - return (0); - - ASSERT(!vd->vdev_ishole); - - ASSERT(oldc <= newc); - - mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); - - if (expanding) { - bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); - kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); - } - - vd->vdev_ms = mspp; - vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { - uint64_t object = 0; - - /* - * vdev_ms_array may be 0 if we are creating the "fake" - * metaslabs for an indirect vdev for zdb's leak detection. - * See zdb_leak_init(). - */ - if (txg == 0 && vd->vdev_ms_array != 0) { - error = dmu_read(mos, vd->vdev_ms_array, - m * sizeof (uint64_t), sizeof (uint64_t), &object, - DMU_READ_PREFETCH); - if (error != 0) { - vdev_dbgmsg(vd, "unable to read the metaslab " - "array [error=%d]", error); - return (error); - } - } - -#ifndef _KERNEL - /* - * To accomodate zdb_leak_init() fake indirect - * metaslabs, we allocate a metaslab group for - * indirect vdevs which normally don't have one. - */ - if (vd->vdev_mg == NULL) { - ASSERT0(vdev_is_concrete(vd)); - vdev_metaslab_group_create(vd); - } -#endif - error = metaslab_init(vd->vdev_mg, m, object, txg, - &(vd->vdev_ms[m])); - if (error != 0) { - vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", - error); - return (error); - } - } - - if (txg == 0) - spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); - - /* - * If the vdev is being removed we don't activate - * the metaslabs since we want to ensure that no new - * allocations are performed on this device. - */ - if (!expanding && !vd->vdev_removing) { - metaslab_group_activate(vd->vdev_mg); - } - - if (txg == 0) - spa_config_exit(spa, SCL_ALLOC, FTAG); - - return (0); -} - -void -vdev_metaslab_fini(vdev_t *vd) -{ - if (vd->vdev_checkpoint_sm != NULL) { - ASSERT(spa_feature_is_active(vd->vdev_spa, - SPA_FEATURE_POOL_CHECKPOINT)); - space_map_close(vd->vdev_checkpoint_sm); - /* - * Even though we close the space map, we need to set its - * pointer to NULL. The reason is that vdev_metaslab_fini() - * may be called multiple times for certain operations - * (i.e. when destroying a pool) so we need to ensure that - * this clause never executes twice. This logic is similar - * to the one used for the vdev_ms clause below. 
- */ - vd->vdev_checkpoint_sm = NULL; - } - - if (vd->vdev_ms != NULL) { - metaslab_group_t *mg = vd->vdev_mg; - metaslab_group_passivate(mg); - - uint64_t count = vd->vdev_ms_count; - for (uint64_t m = 0; m < count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - if (msp != NULL) - metaslab_fini(msp); - } - kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); - vd->vdev_ms = NULL; - - vd->vdev_ms_count = 0; - - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) - ASSERT0(mg->mg_histogram[i]); - } - ASSERT0(vd->vdev_ms_count); -} - -typedef struct vdev_probe_stats { - boolean_t vps_readable; - boolean_t vps_writeable; - int vps_flags; -} vdev_probe_stats_t; - -static void -vdev_probe_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - vdev_t *vd = zio->io_vd; - vdev_probe_stats_t *vps = zio->io_private; - - ASSERT(vd->vdev_probe_zio != NULL); - - if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_error == 0) - vps->vps_readable = 1; - if (zio->io_error == 0 && spa_writeable(spa)) { - zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, - zio->io_offset, zio->io_size, zio->io_abd, - ZIO_CHECKSUM_OFF, vdev_probe_done, vps, - ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); - } else { - abd_free(zio->io_abd); - } - } else if (zio->io_type == ZIO_TYPE_WRITE) { - if (zio->io_error == 0) - vps->vps_writeable = 1; - abd_free(zio->io_abd); - } else if (zio->io_type == ZIO_TYPE_NULL) { - zio_t *pio; - - vd->vdev_cant_read |= !vps->vps_readable; - vd->vdev_cant_write |= !vps->vps_writeable; - - if (vdev_readable(vd) && - (vdev_writeable(vd) || !spa_writeable(spa))) { - zio->io_error = 0; - } else { - ASSERT(zio->io_error != 0); - vdev_dbgmsg(vd, "failed probe"); - zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - spa, vd, NULL, 0, 0); - zio->io_error = SET_ERROR(ENXIO); - } - - mutex_enter(&vd->vdev_probe_lock); - ASSERT(vd->vdev_probe_zio == zio); - vd->vdev_probe_zio = NULL; - mutex_exit(&vd->vdev_probe_lock); - - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(zio, &zl)) != NULL) - if (!vdev_accessible(vd, pio)) - pio->io_error = SET_ERROR(ENXIO); - - kmem_free(vps, sizeof (*vps)); - } -} - -/* - * Determine whether this device is accessible. - * - * Read and write to several known locations: the pad regions of each - * vdev label but the first, which we leave alone in case it contains - * a VTOC. - */ -zio_t * -vdev_probe(vdev_t *vd, zio_t *zio) -{ - spa_t *spa = vd->vdev_spa; - vdev_probe_stats_t *vps = NULL; - zio_t *pio; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - /* - * Don't probe the probe. - */ - if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) - return (NULL); - - /* - * To prevent 'probe storms' when a device fails, we create - * just one probe i/o at a time. All zios that want to probe - * this vdev will become parents of the probe io. - */ - mutex_enter(&vd->vdev_probe_lock); - - if ((pio = vd->vdev_probe_zio) == NULL) { - vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); - - vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_TRYHARD; - - if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { - /* - * vdev_cant_read and vdev_cant_write can only - * transition from TRUE to FALSE when we have the - * SCL_ZIO lock as writer; otherwise they can only - * transition from FALSE to TRUE. This ensures that - * any zio looking at these values can assume that - * failures persist for the life of the I/O. 
That's - * important because when a device has intermittent - * connectivity problems, we want to ensure that - * they're ascribed to the device (ENXIO) and not - * the zio (EIO). - * - * Since we hold SCL_ZIO as writer here, clear both - * values so the probe can reevaluate from first - * principles. - */ - vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; - } - - vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, - vdev_probe_done, vps, - vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } - } - - if (zio != NULL) - zio_add_child(zio, pio); - - mutex_exit(&vd->vdev_probe_lock); - - if (vps == NULL) { - ASSERT(zio != NULL); - return (NULL); - } - - for (int l = 1; l < VDEV_LABELS; l++) { - zio_nowait(zio_read_phys(pio, vd, - vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, - abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), - ZIO_CHECKSUM_OFF, vdev_probe_done, vps, - ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); - } - - if (zio == NULL) - return (pio); - - zio_nowait(pio); - return (NULL); -} - -static void -vdev_open_child(void *arg) -{ - vdev_t *vd = arg; - - vd->vdev_open_thread = curthread; - vd->vdev_open_error = vdev_open(vd); - vd->vdev_open_thread = NULL; -} - -boolean_t -vdev_uses_zvols(vdev_t *vd) -{ - if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, - strlen(ZVOL_DIR)) == 0) - return (B_TRUE); - for (int c = 0; c < vd->vdev_children; c++) - if (vdev_uses_zvols(vd->vdev_child[c])) - return (B_TRUE); - return (B_FALSE); -} - -void -vdev_open_children(vdev_t *vd) -{ - taskq_t *tq; - int children = vd->vdev_children; - - vd->vdev_nonrot = B_TRUE; - - /* - * in order to handle pools on top of zvols, do the opens - * in a single thread so that the same thread holds the - * spa_namespace_lock - */ - if (B_TRUE || vdev_uses_zvols(vd)) { - for (int c = 0; c < children; c++) { - vd->vdev_child[c]->vdev_open_error = - vdev_open(vd->vdev_child[c]); - vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; - } - return; - } - tq = taskq_create("vdev_open", children, minclsyspri, - children, children, TASKQ_PREPOPULATE); - - for (int c = 0; c < children; c++) - VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], - TQ_SLEEP) != 0); - - taskq_destroy(tq); - - for (int c = 0; c < children; c++) - vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; -} - -/* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. - */ -static void -vdev_set_deflate_ratio(vdev_t *vd) -{ - if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { - vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); - } -} - -/* - * Prepare a virtual device for access. 
- */ -int -vdev_open(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - int error; - uint64_t osize = 0; - uint64_t max_osize = 0; - uint64_t asize, max_asize, psize; - uint64_t logical_ashift = 0; - uint64_t physical_ashift = 0; - - ASSERT(vd->vdev_open_thread == curthread || - spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || - vd->vdev_state == VDEV_STATE_CANT_OPEN || - vd->vdev_state == VDEV_STATE_OFFLINE); - - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; - vd->vdev_notrim = B_FALSE; - vd->vdev_min_asize = vdev_get_min_asize(vd); - - /* - * If this vdev is not removed, check its fault status. If it's - * faulted, bail out of the open. - */ - if (!vd->vdev_removed && vd->vdev_faulted) { - ASSERT(vd->vdev_children == 0); - ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || - vd->vdev_label_aux == VDEV_AUX_EXTERNAL); - vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - vd->vdev_label_aux); - return (SET_ERROR(ENXIO)); - } else if (vd->vdev_offline) { - ASSERT(vd->vdev_children == 0); - vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); - return (SET_ERROR(ENXIO)); - } - - error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, - &logical_ashift, &physical_ashift); - - /* - * Reset the vdev_reopening flag so that we actually close - * the vdev on error. - */ - vd->vdev_reopening = B_FALSE; - if (zio_injection_enabled && error == 0) - error = zio_handle_device_injection(vd, NULL, ENXIO); - - if (error) { - if (vd->vdev_removed && - vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) - vd->vdev_removed = B_FALSE; - - if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, - vd->vdev_stat.vs_aux); - } else { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - vd->vdev_stat.vs_aux); - } - return (error); - } - - vd->vdev_removed = B_FALSE; - - /* - * Recheck the faulted flag now that we have confirmed that - * the vdev is accessible. If we're faulted, bail. - */ - if (vd->vdev_faulted) { - ASSERT(vd->vdev_children == 0); - ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || - vd->vdev_label_aux == VDEV_AUX_EXTERNAL); - vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - vd->vdev_label_aux); - return (SET_ERROR(ENXIO)); - } - - if (vd->vdev_degraded) { - ASSERT(vd->vdev_children == 0); - vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); - } else { - vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); - } - - /* - * For hole or missing vdevs we just return success. 
- */ - if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) - return (0); - - if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) - trim_map_create(vd); - - for (int c = 0; c < vd->vdev_children; c++) { - if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, - VDEV_AUX_NONE); - break; - } - } - - osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); - max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); - - if (vd->vdev_children == 0) { - if (osize < SPA_MINDEVSIZE) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_TOO_SMALL); - return (SET_ERROR(EOVERFLOW)); - } - psize = osize; - asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); - max_asize = max_osize - (VDEV_LABEL_START_SIZE + - VDEV_LABEL_END_SIZE); - } else { - if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_TOO_SMALL); - return (SET_ERROR(EOVERFLOW)); - } - psize = 0; - asize = osize; - max_asize = max_osize; - } - - vd->vdev_psize = psize; - - /* - * Make sure the allocatable size hasn't shrunk too much. - */ - if (asize < vd->vdev_min_asize) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (SET_ERROR(EINVAL)); - } - - vd->vdev_physical_ashift = - MAX(physical_ashift, vd->vdev_physical_ashift); - vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); - vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); - - if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_ASHIFT_TOO_BIG); - return (EINVAL); - } - - if (vd->vdev_asize == 0) { - /* - * This is the first-ever open, so use the computed values. - * For testing purposes, a higher ashift can be requested. - */ - vd->vdev_asize = asize; - vd->vdev_max_asize = max_asize; - } else { - /* - * Make sure the alignment requirement hasn't increased. - */ - if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && - vd->vdev_ops->vdev_op_leaf) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); - } - vd->vdev_max_asize = max_asize; - } - - /* - * If all children are healthy we update asize if either: - * The asize has increased, due to a device expansion caused by dynamic - * LUN growth or vdev replacement, and automatic expansion is enabled; - * making the additional space available. - * - * The asize has decreased, due to a device shrink usually caused by a - * vdev replace with a smaller device. This ensures that calculations - * based of max_asize and asize e.g. esize are always valid. It's safe - * to do this as we've already validated that asize is greater than - * vdev_min_asize. - */ - if (vd->vdev_state == VDEV_STATE_HEALTHY && - ((asize > vd->vdev_asize && - (vd->vdev_expanding || spa->spa_autoexpand)) || - (asize < vd->vdev_asize))) - vd->vdev_asize = asize; - - vdev_set_min_asize(vd); - - /* - * Ensure we can issue some IO before declaring the - * vdev open for business. - */ - if (vd->vdev_ops->vdev_op_leaf && - (error = zio_wait(vdev_probe(vd, NULL))) != 0) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); - return (error); - } - - /* - * Track the min and max ashift values for normal data devices. - * - * DJB - TBD these should perhaps be tracked per allocation class - * (e.g. 
spa_min_ashift is used to round up post compression buffers) - */ - if (vd->vdev_top == vd && vd->vdev_ashift != 0 && - vd->vdev_alloc_bias == VDEV_BIAS_NONE && - vd->vdev_aux == NULL) { - if (vd->vdev_ashift > spa->spa_max_ashift) - spa->spa_max_ashift = vd->vdev_ashift; - if (vd->vdev_ashift < spa->spa_min_ashift) - spa->spa_min_ashift = vd->vdev_ashift; - } - - /* - * If a leaf vdev has a DTL, and seems healthy, then kick off a - * resilver. But don't do this if we are doing a reopen for a scrub, - * since this would just restart the scrub we are already doing. - */ - if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && - vdev_resilver_needed(vd, NULL, NULL)) - spa_async_request(spa, SPA_ASYNC_RESILVER); - - return (0); -} - -/* - * Called once the vdevs are all opened, this routine validates the label - * contents. This needs to be done before vdev_load() so that we don't - * inadvertently do repair I/Os to the wrong device. - * - * This function will only return failure if one of the vdevs indicates that it - * has since been destroyed or exported. This is only possible if - * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state - * will be updated but the function will return 0. - */ -int -vdev_validate(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - nvlist_t *label; - uint64_t guid = 0, aux_guid = 0, top_guid; - uint64_t state; - nvlist_t *nvl; - uint64_t txg; - - if (vdev_validate_skip) - return (0); - - for (uint64_t c = 0; c < vd->vdev_children; c++) - if (vdev_validate(vd->vdev_child[c]) != 0) - return (SET_ERROR(EBADF)); - - /* - * If the device has already failed, or was marked offline, don't do - * any further validation. Otherwise, label I/O will fail and we will - * overwrite the previous state. - */ - if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) - return (0); - - /* - * If we are performing an extreme rewind, we allow for a label that - * was modified at a point after the current txg. - * If config lock is not held do not check for the txg. spa_sync could - * be updating the vdev's label before updating spa_last_synced_txg. - */ - if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || - spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) - txg = UINT64_MAX; - else - txg = spa_last_synced_txg(spa); - - if ((label = vdev_label_read_config(vd, txg)) == NULL) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - vdev_dbgmsg(vd, "vdev_validate: failed reading config for " - "txg %llu", (u_longlong_t)txg); - return (0); - } - - /* - * Determine if this vdev has been split off into another - * pool. If so, then refuse to open it. - */ - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, - &aux_guid) == 0 && aux_guid == spa_guid(spa)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_SPLIT_POOL); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); - return (0); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", - ZPOOL_CONFIG_POOL_GUID); - return (0); - } - - /* - * If config is not trusted then ignore the spa guid check. This is - * necessary because if the machine crashed during a re-guid the new - * guid might have been written to all of the vdev labels, but not the - * cached config. 
The check will be performed again once we have the - * trusted config from the MOS. - */ - if (spa->spa_trust_config && guid != spa_guid(spa)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " - "match config (%llu != %llu)", (u_longlong_t)guid, - (u_longlong_t)spa_guid(spa)); - return (0); - } - - if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) - != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, - &aux_guid) != 0) - aux_guid = 0; - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", - ZPOOL_CONFIG_GUID); - return (0); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) - != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", - ZPOOL_CONFIG_TOP_GUID); - return (0); - } - - /* - * If this vdev just became a top-level vdev because its sibling was - * detached, it will have adopted the parent's vdev guid -- but the - * label may or may not be on disk yet. Fortunately, either version - * of the label will have the same top guid, so if we're a top-level - * vdev, we can safely compare to that instead. - * However, if the config comes from a cachefile that failed to update - * after the detach, a top-level vdev will appear as a non top-level - * vdev in the config. Also relax the constraints if we perform an - * extreme rewind. - * - * If we split this vdev off instead, then we also check the - * original pool's guid. We don't want to consider the vdev - * corrupt if it is partway through a split operation. - */ - if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { - boolean_t mismatch = B_FALSE; - if (spa->spa_trust_config && !spa->spa_extreme_rewind) { - if (vd != vd->vdev_top || vd->vdev_guid != top_guid) - mismatch = B_TRUE; - } else { - if (vd->vdev_guid != top_guid && - vd->vdev_top->vdev_guid != guid) - mismatch = B_TRUE; - } - - if (mismatch) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: config guid " - "doesn't match label guid"); - vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", - (u_longlong_t)vd->vdev_guid, - (u_longlong_t)vd->vdev_top->vdev_guid); - vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " - "aux_guid %llu", (u_longlong_t)guid, - (u_longlong_t)top_guid, (u_longlong_t)aux_guid); - return (0); - } - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, - &state) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", - ZPOOL_CONFIG_POOL_STATE); - return (0); - } - - nvlist_free(label); - - /* - * If this is a verbatim import, no need to check the - * state of the pool. - */ - if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && - spa_load_state(spa) == SPA_LOAD_OPEN && - state != POOL_STATE_ACTIVE) { - vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " - "for spa %s", (u_longlong_t)state, spa->spa_name); - return (SET_ERROR(EBADF)); - } - - /* - * If we were able to open and validate a vdev that was - * previously marked permanently unavailable, clear that state - * now. 
- */ - if (vd->vdev_not_present) - vd->vdev_not_present = 0; - - return (0); -} - -static void -vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) -{ - if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { - if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { - zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " - "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, - dvd->vdev_path, svd->vdev_path); - spa_strfree(dvd->vdev_path); - dvd->vdev_path = spa_strdup(svd->vdev_path); - } - } else if (svd->vdev_path != NULL) { - dvd->vdev_path = spa_strdup(svd->vdev_path); - zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", - (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); - } -} - -/* - * Recursively copy vdev paths from one vdev to another. Source and destination - * vdev trees must have same geometry otherwise return error. Intended to copy - * paths from userland config into MOS config. - */ -int -vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) -{ - if ((svd->vdev_ops == &vdev_missing_ops) || - (svd->vdev_ishole && dvd->vdev_ishole) || - (dvd->vdev_ops == &vdev_indirect_ops)) - return (0); - - if (svd->vdev_ops != dvd->vdev_ops) { - vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", - svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); - return (SET_ERROR(EINVAL)); - } - - if (svd->vdev_guid != dvd->vdev_guid) { - vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " - "%llu)", (u_longlong_t)svd->vdev_guid, - (u_longlong_t)dvd->vdev_guid); - return (SET_ERROR(EINVAL)); - } - - if (svd->vdev_children != dvd->vdev_children) { - vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " - "%llu != %llu", (u_longlong_t)svd->vdev_children, - (u_longlong_t)dvd->vdev_children); - return (SET_ERROR(EINVAL)); - } - - for (uint64_t i = 0; i < svd->vdev_children; i++) { - int error = vdev_copy_path_strict(svd->vdev_child[i], - dvd->vdev_child[i]); - if (error != 0) - return (error); - } - - if (svd->vdev_ops->vdev_op_leaf) - vdev_copy_path_impl(svd, dvd); - - return (0); -} - -static void -vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) -{ - ASSERT(stvd->vdev_top == stvd); - ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); - - for (uint64_t i = 0; i < dvd->vdev_children; i++) { - vdev_copy_path_search(stvd, dvd->vdev_child[i]); - } - - if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) - return; - - /* - * The idea here is that while a vdev can shift positions within - * a top vdev (when replacing, attaching mirror, etc.) it cannot - * step outside of it. - */ - vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); - - if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) - return; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - vdev_copy_path_impl(vd, dvd); -} - -/* - * Recursively copy vdev paths from one root vdev to another. Source and - * destination vdev trees may differ in geometry. For each destination leaf - * vdev, search a vdev with the same guid and top vdev id in the source. - * Intended to copy paths from userland config into MOS config. - */ -void -vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) -{ - uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); - ASSERT(srvd->vdev_ops == &vdev_root_ops); - ASSERT(drvd->vdev_ops == &vdev_root_ops); - - for (uint64_t i = 0; i < children; i++) { - vdev_copy_path_search(srvd->vdev_child[i], - drvd->vdev_child[i]); - } -} - -/* - * Close a virtual device. 
- */ -void -vdev_close(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *pvd = vd->vdev_parent; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - /* - * If our parent is reopening, then we are as well, unless we are - * going offline. - */ - if (pvd != NULL && pvd->vdev_reopening) - vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); - - vd->vdev_ops->vdev_op_close(vd); - - vdev_cache_purge(vd); - - if (vd->vdev_ops->vdev_op_leaf) - trim_map_destroy(vd); - - /* - * We record the previous state before we close it, so that if we are - * doing a reopen(), we don't generate FMA ereports if we notice that - * it's still faulted. - */ - vd->vdev_prevstate = vd->vdev_state; - - if (vd->vdev_offline) - vd->vdev_state = VDEV_STATE_OFFLINE; - else - vd->vdev_state = VDEV_STATE_CLOSED; - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; -} - -void -vdev_hold(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_is_root(spa)); - if (spa->spa_state == POOL_STATE_UNINITIALIZED) - return; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_hold(vd->vdev_child[c]); - - if (vd->vdev_ops->vdev_op_leaf) - vd->vdev_ops->vdev_op_hold(vd); -} - -void -vdev_rele(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_is_root(spa)); - for (int c = 0; c < vd->vdev_children; c++) - vdev_rele(vd->vdev_child[c]); - - if (vd->vdev_ops->vdev_op_leaf) - vd->vdev_ops->vdev_op_rele(vd); -} - -/* - * Reopen all interior vdevs and any unopened leaves. We don't actually - * reopen leaf vdevs which had previously been opened as they might deadlock - * on the spa_config_lock. Instead we only obtain the leaf's physical size. - * If the leaf has never been opened then open it, as usual. - */ -void -vdev_reopen(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - /* set the reopening flag unless we're taking the vdev offline */ - vd->vdev_reopening = !vd->vdev_offline; - vdev_close(vd); - (void) vdev_open(vd); - - /* - * Call vdev_validate() here to make sure we have the same device. - * Otherwise, a device with an invalid label could be successfully - * opened in response to vdev_reopen(). - */ - if (vd->vdev_aux) { - (void) vdev_validate_aux(vd); - if (vdev_readable(vd) && vdev_writeable(vd) && - vd->vdev_aux == &spa->spa_l2cache && - !l2arc_vdev_present(vd)) - l2arc_add_vdev(spa, vd); - } else { - (void) vdev_validate(vd); - } - - /* - * Reassess parent vdev's health. - */ - vdev_propagate_state(vd); -} - -int -vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) -{ - int error; - - /* - * Normally, partial opens (e.g. of a mirror) are allowed. - * For a create, however, we want to fail the request if - * there are any components we can't open. - */ - error = vdev_open(vd); - - if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { - vdev_close(vd); - return (error ? error : ENXIO); - } - - /* - * Recursively load DTLs and initialize all labels. - */ - if ((error = vdev_dtl_load(vd)) != 0 || - (error = vdev_label_init(vd, txg, isreplacing ? - VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { - vdev_close(vd); - return (error); - } - - return (0); -} - -void -vdev_metaslab_set_size(vdev_t *vd) -{ - uint64_t asize = vd->vdev_asize; - uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; - uint64_t ms_shift; - - /* - * There are two dimensions to the metaslab sizing calculation: - * the size of the metaslab and the count of metaslabs per vdev. 
- * - * The default values used below are a good balance between memory - * usage (larger metaslab size means more memory needed for loaded - * metaslabs; more metaslabs means more memory needed for the - * metaslab_t structs), metaslab load time (larger metaslabs take - * longer to load), and metaslab sync time (more metaslabs means - * more time spent syncing all of them). - * - * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. - * The range of the dimensions are as follows: - * - * 2^29 <= ms_size <= 2^34 - * 16 <= ms_count <= 131,072 - * - * On the lower end of vdev sizes, we aim for metaslabs sizes of - * at least 512MB (2^29) to minimize fragmentation effects when - * testing with smaller devices. However, the count constraint - * of at least 16 metaslabs will override this minimum size goal. - * - * On the upper end of vdev sizes, we aim for a maximum metaslab - * size of 16GB. However, we will cap the total count to 2^17 - * metaslabs to keep our memory footprint in check and let the - * metaslab size grow from there if that limit is hit. - * - * The net effect of applying above constrains is summarized below. - * - * vdev size metaslab count - * --------------|----------------- - * < 8GB ~16 - * 8GB - 100GB one per 512MB - * 100GB - 3TB ~200 - * 3TB - 2PB one per 16GB - * > 2PB ~131,072 - * -------------------------------- - * - * Finally, note that all of the above calculate the initial - * number of metaslabs. Expanding a top-level vdev will result - * in additional metaslabs being allocated making it possible - * to exceed the zfs_vdev_ms_count_limit. - */ - - if (ms_count < zfs_vdev_min_ms_count) - ms_shift = highbit64(asize / zfs_vdev_min_ms_count); - else if (ms_count > zfs_vdev_default_ms_count) - ms_shift = highbit64(asize / zfs_vdev_default_ms_count); - else - ms_shift = zfs_vdev_default_ms_shift; - - if (ms_shift < SPA_MAXBLOCKSHIFT) { - ms_shift = SPA_MAXBLOCKSHIFT; - } else if (ms_shift > zfs_vdev_max_ms_shift) { - ms_shift = zfs_vdev_max_ms_shift; - /* cap the total count to constrain memory footprint */ - if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) - ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); - } - - vd->vdev_ms_shift = ms_shift; - ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); -} - -/* - * Maximize performance by inflating the configured ashift for top level - * vdevs to be as close to the physical ashift as possible while maintaining - * administrator defined limits and ensuring it doesn't go below the - * logical ashift. - */ -void -vdev_ashift_optimize(vdev_t *vd) -{ - if (vd == vd->vdev_top) { - if (vd->vdev_ashift < vd->vdev_physical_ashift) { - vd->vdev_ashift = MIN( - MAX(zfs_max_auto_ashift, vd->vdev_ashift), - MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); - } else { - /* - * Unusual case where logical ashift > physical ashift - * so we can't cap the calculated ashift based on max - * ashift as that would cause failures. - * We still check if we need to increase it to match - * the min ashift. 
- */ - vd->vdev_ashift = MAX(zfs_min_auto_ashift, - vd->vdev_ashift); - } - } -} - -void -vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) -{ - ASSERT(vd == vd->vdev_top); - /* indirect vdevs don't have metaslabs or dtls */ - ASSERT(vdev_is_concrete(vd) || flags == 0); - ASSERT(ISP2(flags)); - ASSERT(spa_writeable(vd->vdev_spa)); - - if (flags & VDD_METASLAB) - (void) txg_list_add(&vd->vdev_ms_list, arg, txg); - - if (flags & VDD_DTL) - (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); - - (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); -} - -void -vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) -{ - for (int c = 0; c < vd->vdev_children; c++) - vdev_dirty_leaves(vd->vdev_child[c], flags, txg); - - if (vd->vdev_ops->vdev_op_leaf) - vdev_dirty(vd->vdev_top, flags, vd, txg); -} - -/* - * DTLs. - * - * A vdev's DTL (dirty time log) is the set of transaction groups for which - * the vdev has less than perfect replication. There are four kinds of DTL: - * - * DTL_MISSING: txgs for which the vdev has no valid copies of the data - * - * DTL_PARTIAL: txgs for which data is available, but not fully replicated - * - * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon - * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of - * txgs that was scrubbed. - * - * DTL_OUTAGE: txgs which cannot currently be read, whether due to - * persistent errors or just some device being offline. - * Unlike the other three, the DTL_OUTAGE map is not generally - * maintained; it's only computed when needed, typically to - * determine whether a device can be detached. - * - * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device - * either has the data or it doesn't. - * - * For interior vdevs such as mirror and RAID-Z the picture is more complex. - * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because - * if any child is less than fully replicated, then so is its parent. - * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, - * comprising only those txgs which appear in 'maxfaults' or more children; - * those are the txgs we don't have enough replication to read. For example, - * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); - * thus, its DTL_MISSING consists of the set of txgs that appear in more than - * two child DTL_MISSING maps. - * - * It should be clear from the above that to compute the DTLs and outage maps - * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. - * Therefore, that is all we keep on disk. When loading the pool, or after - * a configuration change, we generate all other DTLs from first principles. - */ -void -vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) -{ - range_tree_t *rt = vd->vdev_dtl[t]; - - ASSERT(t < DTL_TYPES); - ASSERT(vd != vd->vdev_spa->spa_root_vdev); - ASSERT(spa_writeable(vd->vdev_spa)); - - mutex_enter(&vd->vdev_dtl_lock); - if (!range_tree_contains(rt, txg, size)) - range_tree_add(rt, txg, size); - mutex_exit(&vd->vdev_dtl_lock); -} - -boolean_t -vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) -{ - range_tree_t *rt = vd->vdev_dtl[t]; - boolean_t dirty = B_FALSE; - - ASSERT(t < DTL_TYPES); - ASSERT(vd != vd->vdev_spa->spa_root_vdev); - - /* - * While we are loading the pool, the DTLs have not been loaded yet. - * Ignore the DTLs and try all devices. 
This avoids a recursive - * mutex enter on the vdev_dtl_lock, and also makes us try hard - * when loading the pool (relying on the checksum to ensure that - * we get the right data -- note that we while loading, we are - * only reading the MOS, which is always checksummed). - */ - if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) - return (B_FALSE); - - mutex_enter(&vd->vdev_dtl_lock); - if (!range_tree_is_empty(rt)) - dirty = range_tree_contains(rt, txg, size); - mutex_exit(&vd->vdev_dtl_lock); - - return (dirty); -} - -boolean_t -vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) -{ - range_tree_t *rt = vd->vdev_dtl[t]; - boolean_t empty; - - mutex_enter(&vd->vdev_dtl_lock); - empty = range_tree_is_empty(rt); - mutex_exit(&vd->vdev_dtl_lock); - - return (empty); -} - -/* - * Returns B_TRUE if vdev determines offset needs to be resilvered. - */ -boolean_t -vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) -{ - ASSERT(vd != vd->vdev_spa->spa_root_vdev); - - if (vd->vdev_ops->vdev_op_need_resilver == NULL || - vd->vdev_ops->vdev_op_leaf) - return (B_TRUE); - - return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); -} - -/* - * Returns the lowest txg in the DTL range. - */ -static uint64_t -vdev_dtl_min(vdev_t *vd) -{ - range_seg_t *rs; - - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); - ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); - ASSERT0(vd->vdev_children); - - rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_start - 1); -} - -/* - * Returns the highest txg in the DTL. - */ -static uint64_t -vdev_dtl_max(vdev_t *vd) -{ - range_seg_t *rs; - - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); - ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); - ASSERT0(vd->vdev_children); - - rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_end); -} - -/* - * Determine if a resilvering vdev should remove any DTL entries from - * its range. If the vdev was resilvering for the entire duration of the - * scan then it should excise that range from its DTLs. Otherwise, this - * vdev is considered partially resilvered and should leave its DTL - * entries intact. The comment in vdev_dtl_reassess() describes how we - * excise the DTLs. - */ -static boolean_t -vdev_dtl_should_excise(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - - ASSERT0(scn->scn_phys.scn_errors); - ASSERT0(vd->vdev_children); - - if (vd->vdev_state < VDEV_STATE_DEGRADED) - return (B_FALSE); - - if (vd->vdev_resilver_txg == 0 || - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) - return (B_TRUE); - - /* - * When a resilver is initiated the scan will assign the scn_max_txg - * value to the highest txg value that exists in all DTLs. If this - * device's max DTL is not part of this scan (i.e. it is not in - * the range (scn_min_txg, scn_max_txg] then it is not eligible - * for excision. - */ - if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { - ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); - ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); - ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Reassess DTLs after a config change or scrub completion. 
- */ -void -vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) -{ - spa_t *spa = vd->vdev_spa; - avl_tree_t reftree; - int minref; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - for (int c = 0; c < vd->vdev_children; c++) - vdev_dtl_reassess(vd->vdev_child[c], txg, - scrub_txg, scrub_done); - - if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) - return; - - if (vd->vdev_ops->vdev_op_leaf) { - dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - - mutex_enter(&vd->vdev_dtl_lock); - - /* - * If we've completed a scan cleanly then determine - * if this vdev should remove any DTLs. We only want to - * excise regions on vdevs that were available during - * the entire duration of this scan. - */ - if (scrub_txg != 0 && - (spa->spa_scrub_started || - (scn != NULL && scn->scn_phys.scn_errors == 0)) && - vdev_dtl_should_excise(vd)) { - /* - * We completed a scrub up to scrub_txg. If we - * did it without rebooting, then the scrub dtl - * will be valid, so excise the old region and - * fold in the scrub dtl. Otherwise, leave the - * dtl as-is if there was an error. - * - * There's little trick here: to excise the beginning - * of the DTL_MISSING map, we put it into a reference - * tree and then add a segment with refcnt -1 that - * covers the range [0, scrub_txg). This means - * that each txg in that range has refcnt -1 or 0. - * We then add DTL_SCRUB with a refcnt of 2, so that - * entries in the range [0, scrub_txg) will have a - * positive refcnt -- either 1 or 2. We then convert - * the reference tree into the new DTL_MISSING map. - */ - space_reftree_create(&reftree); - space_reftree_add_map(&reftree, - vd->vdev_dtl[DTL_MISSING], 1); - space_reftree_add_seg(&reftree, 0, scrub_txg, -1); - space_reftree_add_map(&reftree, - vd->vdev_dtl[DTL_SCRUB], 2); - space_reftree_generate_map(&reftree, - vd->vdev_dtl[DTL_MISSING], 1); - space_reftree_destroy(&reftree); - } - range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); - range_tree_walk(vd->vdev_dtl[DTL_MISSING], - range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); - if (scrub_done) - range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); - range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); - if (!vdev_readable(vd)) - range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); - else - range_tree_walk(vd->vdev_dtl[DTL_MISSING], - range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); - - /* - * If the vdev was resilvering and no longer has any - * DTLs then reset its resilvering flag and dirty - * the top level so that we persist the change. - */ - if (vd->vdev_resilver_txg != 0 && - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && - range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { - vd->vdev_resilver_txg = 0; - vdev_config_dirty(vd->vdev_top); - } - - mutex_exit(&vd->vdev_dtl_lock); - - if (txg != 0) - vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; - } - - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. 
non-zero */ - else if (vd->vdev_nparity != 0) - minref = vd->vdev_nparity + 1; /* RAID-Z */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); - } - mutex_exit(&vd->vdev_dtl_lock); -} - -int -vdev_dtl_load(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - int error = 0; - - if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { - ASSERT(vdev_is_concrete(vd)); - - error = space_map_open(&vd->vdev_dtl_sm, mos, - vd->vdev_dtl_object, 0, -1ULL, 0); - if (error) - return (error); - ASSERT(vd->vdev_dtl_sm != NULL); - - mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(vd->vdev_dtl_sm, - vd->vdev_dtl[DTL_MISSING], SM_ALLOC); - mutex_exit(&vd->vdev_dtl_lock); - - return (error); - } - - for (int c = 0; c < vd->vdev_children; c++) { - error = vdev_dtl_load(vd->vdev_child[c]); - if (error != 0) - break; - } - - return (error); -} - -static void -vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; - const char *string; - - ASSERT(alloc_bias != VDEV_BIAS_NONE); - - string = - (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : - (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : - (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; - - ASSERT(string != NULL); - VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, - 1, strlen(string) + 1, string, tx)); - - if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { - spa_activate_allocation_classes(spa, tx); - } -} - -void -vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - - VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); - VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, - zapobj, tx)); -} - -uint64_t -vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, - DMU_OT_NONE, 0, tx); - - ASSERT(zap != 0); - VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, - zap, tx)); - - return (zap); -} - -void -vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) -{ - if (vd->vdev_ops != &vdev_hole_ops && - vd->vdev_ops != &vdev_missing_ops && - vd->vdev_ops != &vdev_root_ops && - !vd->vdev_top->vdev_removing) { - if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { - vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); - } - if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { - vd->vdev_top_zap = vdev_create_link_zap(vd, tx); - if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) - vdev_zap_allocation_data(vd, tx); - } - } - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - vdev_construct_zaps(vd->vdev_child[i], tx); - } -} - -void -vdev_dtl_sync(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; - objset_t *mos = spa->spa_meta_objset; - range_tree_t *rtsync; - dmu_tx_t *tx; - uint64_t object = space_map_object(vd->vdev_dtl_sm); - - ASSERT(vdev_is_concrete(vd)); - ASSERT(vd->vdev_ops->vdev_op_leaf); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - 
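The minref aggregation above (DTL_PARTIAL needs one child, RAID-Z needs nparity + 1 children, a mirror needs every child) can be pictured with a minimal standalone sketch: model each child DTL as a bitset over a small txg window, count per-txg references, and keep the txgs whose count reaches minref. The names below (dtl_aggregate, NTXG) are placeholders chosen for illustration only, not ZFS interfaces.

#include <stdint.h>
#include <stdio.h>

#define NTXG 16  /* toy txg window */

/*
 * Count, for each txg, how many child DTLs contain it and mark the
 * parent's map where that count reaches minref -- the space_reftree
 * idea in miniature.
 */
static void
dtl_aggregate(const uint8_t child_dtl[][NTXG], int children, int minref,
    uint8_t parent_dtl[NTXG])
{
        for (int txg = 0; txg < NTXG; txg++) {
                int ref = 0;

                for (int c = 0; c < children; c++)
                        ref += child_dtl[c][txg];
                parent_dtl[txg] = (ref >= minref);
        }
}

int
main(void)
{
        /* Three-way mirror: a txg is missing only if all children miss it. */
        const uint8_t child[3][NTXG] = {
                { 0, 1, 1, 0 },
                { 0, 1, 0, 0 },
                { 0, 1, 1, 1 },
        };
        uint8_t missing[NTXG];

        dtl_aggregate(child, 3, 3, missing);    /* minref == vdev_children */
        for (int txg = 0; txg < 4; txg++)
                printf("txg %d missing: %d\n", txg, missing[txg]);
        return (0);
}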
if (vd->vdev_detached || vd->vdev_top->vdev_removing) { - mutex_enter(&vd->vdev_dtl_lock); - space_map_free(vd->vdev_dtl_sm, tx); - space_map_close(vd->vdev_dtl_sm); - vd->vdev_dtl_sm = NULL; - mutex_exit(&vd->vdev_dtl_lock); - - /* - * We only destroy the leaf ZAP for detached leaves or for - * removed log devices. Removed data devices handle leaf ZAP - * cleanup later, once cancellation is no longer possible. - */ - if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || - vd->vdev_top->vdev_islog)) { - vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); - vd->vdev_leaf_zap = 0; - } - - dmu_tx_commit(tx); - return; - } - - if (vd->vdev_dtl_sm == NULL) { - uint64_t new_object; - - new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); - VERIFY3U(new_object, !=, 0); - - VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, - 0, -1ULL, 0)); - ASSERT(vd->vdev_dtl_sm != NULL); - } - - rtsync = range_tree_create(NULL, NULL); - - mutex_enter(&vd->vdev_dtl_lock); - range_tree_walk(rt, range_tree_add, rtsync); - mutex_exit(&vd->vdev_dtl_lock); - - space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); - space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); - range_tree_vacate(rtsync, NULL, NULL); - - range_tree_destroy(rtsync); - - /* - * If the object for the space map has changed then dirty - * the top level so that we update the config. - */ - if (object != space_map_object(vd->vdev_dtl_sm)) { - vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " - "new object %llu", (u_longlong_t)txg, spa_name(spa), - (u_longlong_t)object, - (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); - vdev_config_dirty(vd->vdev_top); - } - - dmu_tx_commit(tx); -} - -/* - * Determine whether the specified vdev can be offlined/detached/removed - * without losing data. - */ -boolean_t -vdev_dtl_required(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *tvd = vd->vdev_top; - uint8_t cant_read = vd->vdev_cant_read; - boolean_t required; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - if (vd == spa->spa_root_vdev || vd == tvd) - return (B_TRUE); - - /* - * Temporarily mark the device as unreadable, and then determine - * whether this results in any DTL outages in the top-level vdev. - * If not, we can safely offline/detach/remove the device. - */ - vd->vdev_cant_read = B_TRUE; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); - required = !vdev_dtl_empty(tvd, DTL_OUTAGE); - vd->vdev_cant_read = cant_read; - vdev_dtl_reassess(tvd, 0, 0, B_FALSE); - - if (!required && zio_injection_enabled) - required = !!zio_handle_device_injection(vd, NULL, ECHILD); - - return (required); -} - -/* - * Determine if resilver is needed, and if so the txg range. 
- */ -boolean_t -vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) -{ - boolean_t needed = B_FALSE; - uint64_t thismin = UINT64_MAX; - uint64_t thismax = 0; - - if (vd->vdev_children == 0) { - mutex_enter(&vd->vdev_dtl_lock); - if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && - vdev_writeable(vd)) { - - thismin = vdev_dtl_min(vd); - thismax = vdev_dtl_max(vd); - needed = B_TRUE; - } - mutex_exit(&vd->vdev_dtl_lock); - } else { - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - uint64_t cmin, cmax; - - if (vdev_resilver_needed(cvd, &cmin, &cmax)) { - thismin = MIN(thismin, cmin); - thismax = MAX(thismax, cmax); - needed = B_TRUE; - } - } - } - - if (needed && minp) { - *minp = thismin; - *maxp = thismax; - } - return (needed); -} - -/* - * Gets the checkpoint space map object from the vdev's ZAP. - * Returns the spacemap object, or 0 if it wasn't in the ZAP - * or the ZAP doesn't exist yet. - */ -int -vdev_checkpoint_sm_object(vdev_t *vd) -{ - ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); - if (vd->vdev_top_zap == 0) { - return (0); - } - - uint64_t sm_obj = 0; - int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, - VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj); - - ASSERT(err == 0 || err == ENOENT); - - return (sm_obj); -} - -int -vdev_load(vdev_t *vd) -{ - int error = 0; - /* - * Recursively load all children. - */ - for (int c = 0; c < vd->vdev_children; c++) { - error = vdev_load(vd->vdev_child[c]); - if (error != 0) { - return (error); - } - } - - vdev_set_deflate_ratio(vd); - - /* - * On spa_load path, grab the allocation bias from our zap - */ - if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { - spa_t *spa = vd->vdev_spa; - char bias_str[64]; - - if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), - bias_str) == 0) { - ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); - vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); - } - } - - /* - * If this is a top-level vdev, initialize its metaslabs. - */ - if (vd == vd->vdev_top && vdev_is_concrete(vd)) { - vdev_metaslab_group_create(vd); - - if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " - "asize=%llu", (u_longlong_t)vd->vdev_ashift, - (u_longlong_t)vd->vdev_asize); - return (SET_ERROR(ENXIO)); - } - - error = vdev_metaslab_init(vd, 0); - if (error != 0) { - vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " - "[error=%d]", error); - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - return (error); - } - - uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd); - if (checkpoint_sm_obj != 0) { - objset_t *mos = spa_meta_objset(vd->vdev_spa); - ASSERT(vd->vdev_asize != 0); - ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); - - error = space_map_open(&vd->vdev_checkpoint_sm, - mos, checkpoint_sm_obj, 0, vd->vdev_asize, - vd->vdev_ashift); - if (error != 0) { - vdev_dbgmsg(vd, "vdev_load: space_map_open " - "failed for checkpoint spacemap (obj %llu) " - "[error=%d]", - (u_longlong_t)checkpoint_sm_obj, error); - return (error); - } - ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); - - /* - * Since the checkpoint_sm contains free entries - * exclusively we can use space_map_allocated() to - * indicate the cumulative checkpointed space that - * has been freed. 
- */ - vd->vdev_stat.vs_checkpoint_space = - -space_map_allocated(vd->vdev_checkpoint_sm); - vd->vdev_spa->spa_checkpoint_info.sci_dspace += - vd->vdev_stat.vs_checkpoint_space; - } - } - - /* - * If this is a leaf vdev, load its DTL. - */ - if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " - "[error=%d]", error); - return (error); - } - - uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); - if (obsolete_sm_object != 0) { - objset_t *mos = vd->vdev_spa->spa_meta_objset; - ASSERT(vd->vdev_asize != 0); - ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); - - if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, - obsolete_sm_object, 0, vd->vdev_asize, 0))) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " - "obsolete spacemap (obj %llu) [error=%d]", - (u_longlong_t)obsolete_sm_object, error); - return (error); - } - } - - return (0); -} - -/* - * The special vdev case is used for hot spares and l2cache devices. Its - * sole purpose it to set the vdev state for the associated vdev. To do this, - * we make sure that we can open the underlying device, then try to read the - * label, and make sure that the label is sane and that it hasn't been - * repurposed to another pool. - */ -int -vdev_validate_aux(vdev_t *vd) -{ - nvlist_t *label; - uint64_t guid, version; - uint64_t state; - - if (!vdev_readable(vd)) - return (0); - - if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - return (-1); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - !SPA_VERSION_IS_SUPPORTED(version) || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || - guid != vd->vdev_guid || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - return (-1); - } - - /* - * We don't actually check the pool state here. If it's in fact in - * use by another pool, we update this fact on the fly when requested. - */ - nvlist_free(label); - return (0); -} - -/* - * Free the objects used to store this vdev's spacemaps, and the array - * that points to them. 
- */ -void -vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) -{ - if (vd->vdev_ms_array == 0) - return; - - objset_t *mos = vd->vdev_spa->spa_meta_objset; - uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; - size_t array_bytes = array_count * sizeof (uint64_t); - uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); - VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, - array_bytes, smobj_array, 0)); - - for (uint64_t i = 0; i < array_count; i++) { - uint64_t smobj = smobj_array[i]; - if (smobj == 0) - continue; - - space_map_free_obj(mos, smobj, tx); - } - - kmem_free(smobj_array, array_bytes); - VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); - vd->vdev_ms_array = 0; -} - -static void -vdev_remove_empty_log(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(vd->vdev_islog); - ASSERT(vd == vd->vdev_top); - ASSERT3U(txg, ==, spa_syncing_txg(spa)); - - dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - - vdev_destroy_spacemaps(vd, tx); - if (vd->vdev_top_zap != 0) { - vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); - vd->vdev_top_zap = 0; - } - - dmu_tx_commit(tx); -} - -void -vdev_sync_done(vdev_t *vd, uint64_t txg) -{ - metaslab_t *msp; - boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); - - ASSERT(vdev_is_concrete(vd)); - - while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) - != NULL) - metaslab_sync_done(msp, txg); - - if (reassess) - metaslab_sync_reassess(vd->vdev_mg); -} - -void -vdev_sync(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *lvd; - metaslab_t *msp; - - ASSERT3U(txg, ==, spa->spa_syncing_txg); - dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - if (range_tree_space(vd->vdev_obsolete_segments) > 0) { - ASSERT(vd->vdev_removing || - vd->vdev_ops == &vdev_indirect_ops); - - vdev_indirect_sync_obsolete(vd, tx); - - /* - * If the vdev is indirect, it can't have dirty - * metaslabs or DTLs. - */ - if (vd->vdev_ops == &vdev_indirect_ops) { - ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); - ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); - dmu_tx_commit(tx); - return; - } - } - - ASSERT(vdev_is_concrete(vd)); - - if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && - !vd->vdev_removing) { - ASSERT(vd == vd->vdev_top); - ASSERT0(vd->vdev_indirect_config.vic_mapping_object); - vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); - ASSERT(vd->vdev_ms_array != 0); - vdev_config_dirty(vd); - } - - while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { - metaslab_sync(msp, txg); - (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); - } - - while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) - vdev_dtl_sync(lvd, txg); - - /* - * If this is an empty log device being removed, destroy the - * metadata associated with it. - */ - if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) - vdev_remove_empty_log(vd, txg); - - (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); - dmu_tx_commit(tx); -} - -uint64_t -vdev_psize_to_asize(vdev_t *vd, uint64_t psize) -{ - return (vd->vdev_ops->vdev_op_asize(vd, psize)); -} - -/* - * Mark the given vdev faulted. A faulted vdev behaves as if the device could - * not be opened, and no I/O is attempted. 
- */ -int -vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) -{ - vdev_t *vd, *tvd; - - spa_vdev_state_enter(spa, SCL_NONE); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - tvd = vd->vdev_top; - - /* - * We don't directly use the aux state here, but if we do a - * vdev_reopen(), we need this value to be present to remember why we - * were faulted. - */ - vd->vdev_label_aux = aux; - - /* - * Faulted state takes precedence over degraded. - */ - vd->vdev_delayed_close = B_FALSE; - vd->vdev_faulted = 1ULL; - vd->vdev_degraded = 0ULL; - vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); - - /* - * If this device has the only valid copy of the data, then - * back off and simply mark the vdev as degraded instead. - */ - if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { - vd->vdev_degraded = 1ULL; - vd->vdev_faulted = 0ULL; - - /* - * If we reopen the device and it's not dead, only then do we - * mark it degraded. - */ - vdev_reopen(tvd); - - if (vdev_readable(vd)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); - } - - return (spa_vdev_state_exit(spa, vd, 0)); -} - -/* - * Mark the given vdev degraded. A degraded vdev is purely an indication to the - * user that something is wrong. The vdev continues to operate as normal as far - * as I/O is concerned. - */ -int -vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) -{ - vdev_t *vd; - - spa_vdev_state_enter(spa, SCL_NONE); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - /* - * If the vdev is already faulted, then don't do anything. - */ - if (vd->vdev_faulted || vd->vdev_degraded) - return (spa_vdev_state_exit(spa, NULL, 0)); - - vd->vdev_degraded = 1ULL; - if (!vdev_is_dead(vd)) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - aux); - - return (spa_vdev_state_exit(spa, vd, 0)); -} - -/* - * Online the given vdev. - * - * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached - * spare device should be detached when the device finishes resilvering. - * Second, the online should be treated like a 'test' online case, so no FMA - * events are generated if the device fails to open. 
- */ -int -vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) -{ - vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; - boolean_t wasoffline; - vdev_state_t oldstate; - - spa_vdev_state_enter(spa, SCL_NONE); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); - oldstate = vd->vdev_state; - - tvd = vd->vdev_top; - vd->vdev_offline = B_FALSE; - vd->vdev_tmpoffline = B_FALSE; - vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); - vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); - - /* XXX - L2ARC 1.0 does not support expansion */ - if (!vd->vdev_aux) { - for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) - pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); - } - - vdev_reopen(tvd); - vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; - - if (!vd->vdev_aux) { - for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) - pvd->vdev_expanding = B_FALSE; - } - - if (newstate) - *newstate = vd->vdev_state; - if ((flags & ZFS_ONLINE_UNSPARE) && - !vdev_is_dead(vd) && vd->vdev_parent && - vd->vdev_parent->vdev_ops == &vdev_spare_ops && - vd->vdev_parent->vdev_child[0] == vd) - vd->vdev_unspare = B_TRUE; - - if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { - - /* XXX - L2ARC 1.0 does not support expansion */ - if (vd->vdev_aux) - return (spa_vdev_state_exit(spa, vd, ENOTSUP)); - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); - } - - /* Restart initializing if necessary */ - mutex_enter(&vd->vdev_initialize_lock); - if (vdev_writeable(vd) && - vd->vdev_initialize_thread == NULL && - vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { - (void) vdev_initialize(vd); - } - mutex_exit(&vd->vdev_initialize_lock); - - if (wasoffline || - (oldstate < VDEV_STATE_DEGRADED && - vd->vdev_state >= VDEV_STATE_DEGRADED)) - spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); - - return (spa_vdev_state_exit(spa, vd, 0)); -} - -static int -vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) -{ - vdev_t *vd, *tvd; - int error = 0; - uint64_t generation; - metaslab_group_t *mg; - -top: - spa_vdev_state_enter(spa, SCL_ALLOC); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) - return (spa_vdev_state_exit(spa, NULL, ENODEV)); - - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); - - tvd = vd->vdev_top; - mg = tvd->vdev_mg; - generation = spa->spa_config_generation + 1; - - /* - * If the device isn't already offline, try to offline it. - */ - if (!vd->vdev_offline) { - /* - * If this device has the only valid copy of some data, - * don't allow it to be offlined. Log devices are always - * expendable. - */ - if (!tvd->vdev_islog && vd->vdev_aux == NULL && - vdev_dtl_required(vd)) - return (spa_vdev_state_exit(spa, NULL, EBUSY)); - - /* - * If the top-level is a slog and it has had allocations - * then proceed. We check that the vdev's metaslab group - * is not NULL since it's possible that we may have just - * added this vdev but not yet initialized its metaslabs. - */ - if (tvd->vdev_islog && mg != NULL) { - /* - * Prevent any future allocations. - */ - metaslab_group_passivate(mg); - (void) spa_vdev_state_exit(spa, vd, 0); - - error = spa_reset_logs(spa); - - /* - * If the log device was successfully reset but has - * checkpointed data, do not offline it. 
- */ - if (error == 0 && - tvd->vdev_checkpoint_sm != NULL) { - error = ZFS_ERR_CHECKPOINT_EXISTS; - } - - spa_vdev_state_enter(spa, SCL_ALLOC); - - /* - * Check to see if the config has changed. - */ - if (error || generation != spa->spa_config_generation) { - metaslab_group_activate(mg); - if (error) - return (spa_vdev_state_exit(spa, - vd, error)); - (void) spa_vdev_state_exit(spa, vd, 0); - goto top; - } - ASSERT0(tvd->vdev_stat.vs_alloc); - } - - /* - * Offline this device and reopen its top-level vdev. - * If the top-level vdev is a log device then just offline - * it. Otherwise, if this action results in the top-level - * vdev becoming unusable, undo it and fail the request. - */ - vd->vdev_offline = B_TRUE; - vdev_reopen(tvd); - - if (!tvd->vdev_islog && vd->vdev_aux == NULL && - vdev_is_dead(tvd)) { - vd->vdev_offline = B_FALSE; - vdev_reopen(tvd); - return (spa_vdev_state_exit(spa, NULL, EBUSY)); - } - - /* - * Add the device back into the metaslab rotor so that - * once we online the device it's open for business. - */ - if (tvd->vdev_islog && mg != NULL) - metaslab_group_activate(mg); - } - - vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); - - return (spa_vdev_state_exit(spa, vd, 0)); -} - -int -vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) -{ - int error; - - mutex_enter(&spa->spa_vdev_top_lock); - error = vdev_offline_locked(spa, guid, flags); - mutex_exit(&spa->spa_vdev_top_lock); - - return (error); -} - -/* - * Clear the error counts associated with this vdev. Unlike vdev_online() and - * vdev_offline(), we assume the spa config is locked. We also clear all - * children. If 'vd' is NULL, then the user wants to clear all vdevs. - */ -void -vdev_clear(spa_t *spa, vdev_t *vd) -{ - vdev_t *rvd = spa->spa_root_vdev; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - if (vd == NULL) - vd = rvd; - - vd->vdev_stat.vs_read_errors = 0; - vd->vdev_stat.vs_write_errors = 0; - vd->vdev_stat.vs_checksum_errors = 0; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_clear(spa, vd->vdev_child[c]); - - if (vd == rvd) { - for (int c = 0; c < spa->spa_l2cache.sav_count; c++) - vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); - - for (int c = 0; c < spa->spa_spares.sav_count; c++) - vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); - } - - /* - * It makes no sense to "clear" an indirect vdev. - */ - if (!vdev_is_concrete(vd)) - return; - - /* - * If we're in the FAULTED state or have experienced failed I/O, then - * clear the persistent state and attempt to reopen the device. We - * also mark the vdev config dirty, so that the new faulted state is - * written out to disk. - */ - if (vd->vdev_faulted || vd->vdev_degraded || - !vdev_readable(vd) || !vdev_writeable(vd)) { - - /* - * When reopening in reponse to a clear event, it may be due to - * a fmadm repair request. In this case, if the device is - * still broken, we want to still post the ereport again. - */ - vd->vdev_forcefault = B_TRUE; - - vd->vdev_faulted = vd->vdev_degraded = 0ULL; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; - - vdev_reopen(vd == rvd ? 
rvd : vd->vdev_top); - - vd->vdev_forcefault = B_FALSE; - - if (vd != rvd && vdev_writeable(vd->vdev_top)) - vdev_state_dirty(vd->vdev_top); - - if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) - spa_async_request(spa, SPA_ASYNC_RESILVER); - - spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); - } - - /* - * When clearing a FMA-diagnosed fault, we always want to - * unspare the device, as we assume that the original spare was - * done in response to the FMA fault. - */ - if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && - vd->vdev_parent->vdev_ops == &vdev_spare_ops && - vd->vdev_parent->vdev_child[0] == vd) - vd->vdev_unspare = B_TRUE; -} - -boolean_t -vdev_is_dead(vdev_t *vd) -{ - /* - * Holes and missing devices are always considered "dead". - * This simplifies the code since we don't have to check for - * these types of devices in the various code paths. - * Instead we rely on the fact that we skip over dead devices - * before issuing I/O to them. - */ - return (vd->vdev_state < VDEV_STATE_DEGRADED || - vd->vdev_ops == &vdev_hole_ops || - vd->vdev_ops == &vdev_missing_ops); -} - -boolean_t -vdev_readable(vdev_t *vd) -{ - return (!vdev_is_dead(vd) && !vd->vdev_cant_read); -} - -boolean_t -vdev_writeable(vdev_t *vd) -{ - return (!vdev_is_dead(vd) && !vd->vdev_cant_write && - vdev_is_concrete(vd)); -} - -boolean_t -vdev_allocatable(vdev_t *vd) -{ - uint64_t state = vd->vdev_state; - - /* - * We currently allow allocations from vdevs which may be in the - * process of reopening (i.e. VDEV_STATE_CLOSED). If the device - * fails to reopen then we'll catch it later when we're holding - * the proper locks. Note that we have to get the vdev state - * in a local variable because although it changes atomically, - * we're asking two separate questions about it. - */ - return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && vdev_is_concrete(vd) && - vd->vdev_mg->mg_initialized); -} - -boolean_t -vdev_accessible(vdev_t *vd, zio_t *zio) -{ - ASSERT(zio->io_vd == vd); - - if (vdev_is_dead(vd) || vd->vdev_remove_wanted) - return (B_FALSE); - - if (zio->io_type == ZIO_TYPE_READ) - return (!vd->vdev_cant_read); - - if (zio->io_type == ZIO_TYPE_WRITE) - return (!vd->vdev_cant_write); - - return (B_TRUE); -} - -boolean_t -vdev_is_spacemap_addressable(vdev_t *vd) -{ - if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) - return (B_TRUE); - - /* - * If double-word space map entries are not enabled we assume - * 47 bits of the space map entry are dedicated to the entry's - * offset (see SM_OFFSET_BITS in space_map.h). We then use that - * to calculate the maximum address that can be described by a - * space map entry for the given device. - */ - uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; - - if (shift >= 63) /* detect potential overflow */ - return (B_TRUE); - - return (vd->vdev_asize < (1ULL << shift)); -} - -/* - * Get statistics for the given vdev. - */ -void -vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *tvd = vd->vdev_top; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - - mutex_enter(&vd->vdev_stat_lock); - bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_timestamp = gethrtime() - vs->vs_timestamp; - vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) { - vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - /* - * Report intializing progress. 
Since we don't have the - * initializing locks held, this is only an estimate (although a - * fairly accurate one). - */ - vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; - vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; - vs->vs_initialize_state = vd->vdev_initialize_state; - vs->vs_initialize_action_time = vd->vdev_initialize_action_time; - } - /* - * Report expandable space on top-level, non-auxillary devices only. - * The expandable space is reported in terms of metaslab sized units - * since that determines how much space the pool can expand. - */ - if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { - vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize - - spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); - } - vs->vs_configured_ashift = vd->vdev_top != NULL - ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; - vs->vs_logical_ashift = vd->vdev_logical_ashift; - vs->vs_physical_ashift = vd->vdev_physical_ashift; - if (vd->vdev_aux == NULL && vd == vd->vdev_top && - vdev_is_concrete(vd)) { - vs->vs_fragmentation = (vd->vdev_mg != NULL) ? - vd->vdev_mg->mg_fragmentation : 0; - } - - /* - * If we're getting stats on the root vdev, aggregate the I/O counts - * over all top-level vdevs (i.e. the direct children of the root). - */ - if (vd == rvd) { - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - vdev_stat_t *cvs = &cvd->vdev_stat; - - for (int t = 0; t < ZIO_TYPES; t++) { - vs->vs_ops[t] += cvs->vs_ops[t]; - vs->vs_bytes[t] += cvs->vs_bytes[t]; - } - cvs->vs_scan_removing = cvd->vdev_removing; - } - } - mutex_exit(&vd->vdev_stat_lock); -} - -void -vdev_clear_stats(vdev_t *vd) -{ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space = 0; - vd->vdev_stat.vs_dspace = 0; - vd->vdev_stat.vs_alloc = 0; - mutex_exit(&vd->vdev_stat_lock); -} - -void -vdev_scan_stat_init(vdev_t *vd) -{ - vdev_stat_t *vs = &vd->vdev_stat; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_scan_stat_init(vd->vdev_child[c]); - - mutex_enter(&vd->vdev_stat_lock); - vs->vs_scan_processed = 0; - mutex_exit(&vd->vdev_stat_lock); -} - -void -vdev_stat_update(zio_t *zio, uint64_t psize) -{ - spa_t *spa = zio->io_spa; - vdev_t *rvd = spa->spa_root_vdev; - vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; - vdev_t *pvd; - uint64_t txg = zio->io_txg; - vdev_stat_t *vs = &vd->vdev_stat; - zio_type_t type = zio->io_type; - int flags = zio->io_flags; - - /* - * If this i/o is a gang leader, it didn't do any actual work. - */ - if (zio->io_gang_tree) - return; - - if (zio->io_error == 0) { - /* - * If this is a root i/o, don't count it -- we've already - * counted the top-level vdevs, and vdev_get_stats() will - * aggregate them when asked. This reduces contention on - * the root vdev_stat_lock and implicitly handles blocks - * that compress away to holes, for which there is no i/o. - * (Holes never create vdev children, so all the counters - * remain zero, which is what we want.) - * - * Note: this only applies to successful i/o (io_error == 0) - * because unlike i/o counts, errors are not additive. - * When reading a ditto block, for example, failure of - * one top-level vdev does not imply a root-level error. 
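Editor's note on vdev_is_spacemap_addressable() a little further up: with single-word space map entries, SM_OFFSET_BITS (47, per the comment there) bits of offset are scaled by the vdev's ashift, so a single entry can describe offsets up to 1 << (ashift + 47) bytes. A quick standalone check of that limit (illustration only):

    #include <stdint.h>
    #include <stdio.h>

    #define SM_OFFSET_BITS 47    /* value quoted in the comment above */

    int
    main(void)
    {
        for (uint64_t ashift = 9; ashift <= 13; ashift++) {
            uint64_t shift = ashift + SM_OFFSET_BITS;

            /* shift >= 63 would overflow and is treated as addressable. */
            printf("ashift %2llu -> limit 2^%llu bytes (%llu PiB)\n",
                (unsigned long long)ashift,
                (unsigned long long)shift,
                (unsigned long long)((1ULL << shift) >> 50));
        }
        return (0);
    }
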
- */ - if (vd == rvd) - return; - - ASSERT(vd == zio->io_vd); - - if (flags & ZIO_FLAG_IO_BYPASS) - return; - - mutex_enter(&vd->vdev_stat_lock); - - if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCAN_THREAD) { - dsl_scan_phys_t *scn_phys = - &spa->spa_dsl_pool->dp_scan->scn_phys; - uint64_t *processed = &scn_phys->scn_processed; - - /* XXX cleanup? */ - if (vd->vdev_ops->vdev_op_leaf) - atomic_add_64(processed, psize); - vs->vs_scan_processed += psize; - } - - if (flags & ZIO_FLAG_SELF_HEAL) - vs->vs_self_healed += psize; - } - - vs->vs_ops[type]++; - vs->vs_bytes[type] += psize; - - mutex_exit(&vd->vdev_stat_lock); - return; - } - - if (flags & ZIO_FLAG_SPECULATIVE) - return; - - /* - * If this is an I/O error that is going to be retried, then ignore the - * error. Otherwise, the user may interpret B_FAILFAST I/O errors as - * hard errors, when in reality they can happen for any number of - * innocuous reasons (bus resets, MPxIO link failure, etc). - */ - if (zio->io_error == EIO && - !(zio->io_flags & ZIO_FLAG_IO_RETRY)) - return; - - /* - * Intent logs writes won't propagate their error to the root - * I/O so don't mark these types of failures as pool-level - * errors. - */ - if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) - return; - - mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { - if (zio->io_error == ECKSUM) - vs->vs_checksum_errors++; - else - vs->vs_read_errors++; - } - if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) - vs->vs_write_errors++; - mutex_exit(&vd->vdev_stat_lock); - - if (spa->spa_load_state == SPA_LOAD_NONE && - type == ZIO_TYPE_WRITE && txg != 0 && - (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCAN_THREAD) || - spa->spa_claiming)) { - /* - * This is either a normal write (not a repair), or it's - * a repair induced by the scrub thread, or it's a repair - * made by zil_claim() during spa_load() in the first txg. - * In the normal case, we commit the DTL change in the same - * txg as the block was born. In the scrub-induced repair - * case, we know that scrubs run in first-pass syncing context, - * so we commit the DTL change in spa_syncing_txg(spa). - * In the zil_claim() case, we commit in spa_first_txg(spa). - * - * We currently do not make DTL entries for failed spontaneous - * self-healing writes triggered by normal (non-scrubbing) - * reads, because we have no transactional context in which to - * do so -- and it's not clear that it'd be desirable anyway. - */ - if (vd->vdev_ops->vdev_op_leaf) { - uint64_t commit_txg = txg; - if (flags & ZIO_FLAG_SCAN_THREAD) { - ASSERT(flags & ZIO_FLAG_IO_REPAIR); - ASSERT(spa_sync_pass(spa) == 1); - vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); - commit_txg = spa_syncing_txg(spa); - } else if (spa->spa_claiming) { - ASSERT(flags & ZIO_FLAG_IO_REPAIR); - commit_txg = spa_first_txg(spa); - } - ASSERT(commit_txg >= spa_syncing_txg(spa)); - if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) - return; - for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) - vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); - vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); - } - if (vd != rvd) - vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); - } -} - -int64_t -vdev_deflated_space(vdev_t *vd, int64_t space) -{ - ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); - ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); - - return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); -} - -/* - * Update the in-core space usage stats for this vdev and the root vdev. 
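Editor's illustration of vdev_deflated_space() just above (not part of the patch): the raw asize delta is counted in 512-byte units and multiplied by vdev_deflate_ratio, which captures the top-level vdev's psize-to-asize expansion. Both ratio values below are assumptions chosen only to show the effect, since vdev_set_deflate_ratio() is not part of this excerpt; the point is that a layout that expands blocks reports proportionally less deflated space.

    #include <stdint.h>
    #include <stdio.h>

    #define SPA_MINBLOCKSHIFT 9    /* 512-byte units, as in the real code */

    static int64_t
    deflated_space(int64_t space, uint64_t deflate_ratio)
    {
        /* Same shape as vdev_deflated_space() in the removed code above. */
        return ((space >> SPA_MINBLOCKSHIFT) * (int64_t)deflate_ratio);
    }

    int
    main(void)
    {
        int64_t delta = 1 << 20;    /* 1 MiB of newly allocated asize */

        /* Hypothetical ratio for a layout with no expansion (mirror/disk). */
        printf("no expansion : %lld\n", (long long)deflated_space(delta, 512));

        /* Hypothetical ratio for a layout that expands blocks by 4/3. */
        printf("4/3 expansion: %lld\n", (long long)deflated_space(delta, 384));
        return (0);
    }
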
- */ -void -vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, - int64_t space_delta) -{ - int64_t dspace_delta; - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - - ASSERT(vd == vd->vdev_top); - - /* - * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion - * factor. We must calculate this here and not at the root vdev - * because the root vdev's psize-to-asize is simply the max of its - * childrens', thus not accurate enough for us. - */ - dspace_delta = vdev_deflated_space(vd, space_delta); - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_alloc += alloc_delta; - vd->vdev_stat.vs_space += space_delta; - vd->vdev_stat.vs_dspace += dspace_delta; - mutex_exit(&vd->vdev_stat_lock); - - /* every class but log contributes to root space stats */ - if (vd->vdev_mg != NULL && !vd->vdev_islog) { - mutex_enter(&rvd->vdev_stat_lock); - rvd->vdev_stat.vs_alloc += alloc_delta; - rvd->vdev_stat.vs_space += space_delta; - rvd->vdev_stat.vs_dspace += dspace_delta; - mutex_exit(&rvd->vdev_stat_lock); - } - /* Note: metaslab_class_space_update moved to metaslab_space_update */ -} - -/* - * Mark a top-level vdev's config as dirty, placing it on the dirty list - * so that it will be written out next time the vdev configuration is synced. - * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. - */ -void -vdev_config_dirty(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int c; - - ASSERT(spa_writeable(spa)); - - /* - * If this is an aux vdev (as with l2cache and spare devices), then we - * update the vdev config manually and set the sync flag. - */ - if (vd->vdev_aux != NULL) { - spa_aux_vdev_t *sav = vd->vdev_aux; - nvlist_t **aux; - uint_t naux; - - for (c = 0; c < sav->sav_count; c++) { - if (sav->sav_vdevs[c] == vd) - break; - } - - if (c == sav->sav_count) { - /* - * We're being removed. There's nothing more to do. - */ - ASSERT(sav->sav_sync == B_TRUE); - return; - } - - sav->sav_sync = B_TRUE; - - if (nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); - } - - ASSERT(c < naux); - - /* - * Setting the nvlist in the middle if the array is a little - * sketchy, but it will work. - */ - nvlist_free(aux[c]); - aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); - - return; - } - - /* - * The dirty list is protected by the SCL_CONFIG lock. The caller - * must either hold SCL_CONFIG as writer, or must be the sync thread - * (which holds SCL_CONFIG as reader). There's only one sync thread, - * so this is sufficient to ensure mutual exclusion. 
- */ - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || - (dsl_pool_sync_context(spa_get_dsl(spa)) && - spa_config_held(spa, SCL_CONFIG, RW_READER))); - - if (vd == rvd) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_config_dirty(rvd->vdev_child[c]); - } else { - ASSERT(vd == vd->vdev_top); - - if (!list_link_active(&vd->vdev_config_dirty_node) && - vdev_is_concrete(vd)) { - list_insert_head(&spa->spa_config_dirty_list, vd); - } - } -} - -void -vdev_config_clean(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || - (dsl_pool_sync_context(spa_get_dsl(spa)) && - spa_config_held(spa, SCL_CONFIG, RW_READER))); - - ASSERT(list_link_active(&vd->vdev_config_dirty_node)); - list_remove(&spa->spa_config_dirty_list, vd); -} - -/* - * Mark a top-level vdev's state as dirty, so that the next pass of - * spa_sync() can convert this into vdev_config_dirty(). We distinguish - * the state changes from larger config changes because they require - * much less locking, and are often needed for administrative actions. - */ -void -vdev_state_dirty(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_writeable(spa)); - ASSERT(vd == vd->vdev_top); - - /* - * The state list is protected by the SCL_STATE lock. The caller - * must either hold SCL_STATE as writer, or must be the sync thread - * (which holds SCL_STATE as reader). There's only one sync thread, - * so this is sufficient to ensure mutual exclusion. - */ - ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || - (dsl_pool_sync_context(spa_get_dsl(spa)) && - spa_config_held(spa, SCL_STATE, RW_READER))); - - if (!list_link_active(&vd->vdev_state_dirty_node) && - vdev_is_concrete(vd)) - list_insert_head(&spa->spa_state_dirty_list, vd); -} - -void -vdev_state_clean(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || - (dsl_pool_sync_context(spa_get_dsl(spa)) && - spa_config_held(spa, SCL_STATE, RW_READER))); - - ASSERT(list_link_active(&vd->vdev_state_dirty_node)); - list_remove(&spa->spa_state_dirty_list, vd); -} - -/* - * Propagate vdev state up from children to parent. - */ -void -vdev_propagate_state(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int degraded = 0, faulted = 0; - int corrupted = 0; - vdev_t *child; - - if (vd->vdev_children > 0) { - for (int c = 0; c < vd->vdev_children; c++) { - child = vd->vdev_child[c]; - - /* - * Don't factor holes or indirect vdevs into the - * decision. - */ - if (!vdev_is_concrete(child)) - continue; - - if (!vdev_readable(child) || - (!vdev_writeable(child) && spa_writeable(spa))) { - /* - * Root special: if there is a top-level log - * device, treat the root vdev as if it were - * degraded. - */ - if (child->vdev_islog && vd == rvd) - degraded++; - else - faulted++; - } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { - degraded++; - } - - if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) - corrupted++; - } - - vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); - - /* - * Root special: if there is a top-level vdev that cannot be - * opened due to corrupted metadata, then propagate the root - * vdev's aux state as 'corrupt' rather than 'insufficient - * replicas'. - */ - if (corrupted && vd == rvd && - rvd->vdev_state == VDEV_STATE_CANT_OPEN) - vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - } - - if (vd->vdev_parent) - vdev_propagate_state(vd->vdev_parent); -} - -/* - * Set a vdev's state. 
If this is during an open, we don't update the parent - * state, because we're in the process of opening children depth-first. - * Otherwise, we propagate the change to the parent. - * - * If this routine places a device in a faulted state, an appropriate ereport is - * generated. - */ -void -vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) -{ - uint64_t save_state; - spa_t *spa = vd->vdev_spa; - - if (state == vd->vdev_state) { - vd->vdev_stat.vs_aux = aux; - return; - } - - save_state = vd->vdev_state; - - vd->vdev_state = state; - vd->vdev_stat.vs_aux = aux; - - /* - * If we are setting the vdev state to anything but an open state, then - * always close the underlying device unless the device has requested - * a delayed close (i.e. we're about to remove or fault the device). - * Otherwise, we keep accessible but invalid devices open forever. - * We don't call vdev_close() itself, because that implies some extra - * checks (offline, etc) that we don't want here. This is limited to - * leaf devices, because otherwise closing the device will affect other - * children. - */ - if (!vd->vdev_delayed_close && vdev_is_dead(vd) && - vd->vdev_ops->vdev_op_leaf) - vd->vdev_ops->vdev_op_close(vd); - - if (vd->vdev_removed && - state == VDEV_STATE_CANT_OPEN && - (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { - /* - * If the previous state is set to VDEV_STATE_REMOVED, then this - * device was previously marked removed and someone attempted to - * reopen it. If this failed due to a nonexistent device, then - * keep the device in the REMOVED state. We also let this be if - * it is one of our special test online cases, which is only - * attempting to online the device and shouldn't generate an FMA - * fault. - */ - vd->vdev_state = VDEV_STATE_REMOVED; - vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - } else if (state == VDEV_STATE_REMOVED) { - vd->vdev_removed = B_TRUE; - } else if (state == VDEV_STATE_CANT_OPEN) { - /* - * If we fail to open a vdev during an import or recovery, we - * mark it as "not available", which signifies that it was - * never there to begin with. Failure to open such a device - * is not considered an error. - */ - if ((spa_load_state(spa) == SPA_LOAD_IMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER) && - vd->vdev_ops->vdev_op_leaf) - vd->vdev_not_present = 1; - - /* - * Post the appropriate ereport. If the 'prevstate' field is - * set to something other than VDEV_STATE_UNKNOWN, it indicates - * that this is part of a vdev_reopen(). In this case, we don't - * want to post the ereport if the device was already in the - * CANT_OPEN state beforehand. - * - * If the 'checkremove' flag is set, then this is an attempt to - * online the device in response to an insertion event. If we - * hit this case, then we have detected an insertion event for a - * faulted or offline device that wasn't in the removed state. - * In this scenario, we don't post an ereport because we are - * about to replace the device, or attempt an online with - * vdev_forcefault, which will generate the fault for us. 
- */ - if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && - !vd->vdev_not_present && !vd->vdev_checkremove && - vd != spa->spa_root_vdev) { - const char *class; - - switch (aux) { - case VDEV_AUX_OPEN_FAILED: - class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; - break; - case VDEV_AUX_CORRUPT_DATA: - class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; - break; - case VDEV_AUX_NO_REPLICAS: - class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; - break; - case VDEV_AUX_BAD_GUID_SUM: - class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; - break; - case VDEV_AUX_TOO_SMALL: - class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; - break; - case VDEV_AUX_BAD_LABEL: - class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; - break; - default: - class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; - } - - zfs_ereport_post(class, spa, vd, NULL, save_state, 0); - } - - /* Erase any notion of persistent removed state */ - vd->vdev_removed = B_FALSE; - } else { - vd->vdev_removed = B_FALSE; - } - - /* - * Notify the fmd of the state change. Be verbose and post - * notifications even for stuff that's not important; the fmd agent can - * sort it out. Don't emit state change events for non-leaf vdevs since - * they can't change state on their own. The FMD can check their state - * if it wants to when it sees that a leaf vdev had a state change. - */ - if (vd->vdev_ops->vdev_op_leaf) - zfs_post_state_change(spa, vd); - - if (!isopen && vd->vdev_parent) - vdev_propagate_state(vd->vdev_parent); -} - -boolean_t -vdev_children_are_offline(vdev_t *vd) -{ - ASSERT(!vd->vdev_ops->vdev_op_leaf); - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) - return (B_FALSE); - } - - return (B_TRUE); -} - -/* - * Check the vdev configuration to ensure that it's capable of supporting - * a root pool. We do not support partial configuration. - * In addition, only a single top-level vdev is allowed. - * - * FreeBSD does not have above limitations. - */ -boolean_t -vdev_is_bootable(vdev_t *vd) -{ -#ifdef illumos - if (!vd->vdev_ops->vdev_op_leaf) { - char *vdev_type = vd->vdev_ops->vdev_op_type; - - if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && - vd->vdev_children > 1) { - return (B_FALSE); - } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || - strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { - return (B_FALSE); - } - } - - for (int c = 0; c < vd->vdev_children; c++) { - if (!vdev_is_bootable(vd->vdev_child[c])) - return (B_FALSE); - } -#endif /* illumos */ - return (B_TRUE); -} - -boolean_t -vdev_is_concrete(vdev_t *vd) -{ - vdev_ops_t *ops = vd->vdev_ops; - if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || - ops == &vdev_missing_ops || ops == &vdev_root_ops) { - return (B_FALSE); - } else { - return (B_TRUE); - } -} - -/* - * Determine if a log device has valid content. If the vdev was - * removed or faulted in the MOS config then we know that - * the content on the log device has already been written to the pool. - */ -boolean_t -vdev_log_state_valid(vdev_t *vd) -{ - if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && - !vd->vdev_removed) - return (B_TRUE); - - for (int c = 0; c < vd->vdev_children; c++) - if (vdev_log_state_valid(vd->vdev_child[c])) - return (B_TRUE); - - return (B_FALSE); -} - -/* - * Expand a vdev if possible. 
- */ -void -vdev_expand(vdev_t *vd, uint64_t txg) -{ - ASSERT(vd->vdev_top == vd); - ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - ASSERT(vdev_is_concrete(vd)); - - vdev_set_deflate_ratio(vd); - - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && - vdev_is_concrete(vd)) { - vdev_metaslab_group_create(vd); - VERIFY(vdev_metaslab_init(vd, txg) == 0); - vdev_config_dirty(vd); - } -} - -/* - * Split a vdev. - */ -void -vdev_split(vdev_t *vd) -{ - vdev_t *cvd, *pvd = vd->vdev_parent; - - vdev_remove_child(pvd, vd); - vdev_compact_children(pvd); - - cvd = pvd->vdev_child[0]; - if (pvd->vdev_children == 1) { - vdev_remove_parent(cvd); - cvd->vdev_splitting = B_TRUE; - } - vdev_propagate_state(cvd); -} - -void -vdev_deadman(vdev_t *vd) -{ - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - vdev_deadman(cvd); - } - - if (vd->vdev_ops->vdev_op_leaf) { - vdev_queue_t *vq = &vd->vdev_queue; - - mutex_enter(&vq->vq_lock); - if (avl_numnodes(&vq->vq_active_tree) > 0) { - spa_t *spa = vd->vdev_spa; - zio_t *fio; - uint64_t delta; - - /* - * Look at the head of all the pending queues, - * if any I/O has been outstanding for longer than - * the spa_deadman_synctime we panic the system. - */ - fio = avl_first(&vq->vq_active_tree); - delta = gethrtime() - fio->io_timestamp; - if (delta > spa_deadman_synctime(spa)) { - vdev_dbgmsg(vd, "SLOW IO: zio timestamp " - "%lluns, delta %lluns, last io %lluns", - fio->io_timestamp, (u_longlong_t)delta, - vq->vq_io_complete_ts); - fm_panic("I/O to pool '%s' appears to be " - "hung on vdev guid %llu at '%s'.", - spa_name(spa), - (long long unsigned int) vd->vdev_guid, - vd->vdev_path); - } - } - mutex_exit(&vq->vq_lock); - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c +++ /dev/null @@ -1,434 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Virtual device read-ahead caching. - * - * This file implements a simple LRU read-ahead cache. When the DMU reads - * a given block, it will often want other, nearby blocks soon thereafter. - * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 128 back-to-back 512-byte - * reads into a single 64k read followed by 127 cache hits; this reduces - * latency dramatically. 
In the worst case, it can turn an isolated 512-byte - * read into a 64k read, which doesn't affect latency all that much but is - * terribly wasteful of bandwidth. A more intelligent version of the cache - * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. Currently, only - * metadata I/O is inflated. A futher enhancement could take advantage of - * more semantic information about the I/O. And it could use something - * faster than an AVL tree; that was chosen solely for convenience. - * - * There are five cache operations: allocate, fill, read, write, evict. - * - * (1) Allocate. This reserves a cache entry for the specified region. - * We separate the allocate and fill operations so that multiple threads - * don't generate I/O for the same cache miss. - * - * (2) Fill. When the I/O for a cache miss completes, the fill routine - * places the data in the previously allocated cache entry. - * - * (3) Read. Read data from the cache. - * - * (4) Write. Update cache contents after write completion. - * - * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry - * if the total cache size exceeds zfs_vdev_cache_size. - */ - -/* - * These tunables are for performance analysis. - */ -/* - * All i/os smaller than zfs_vdev_cache_max will be turned into - * 1<ve_offset, ve2->ve_offset)); -} - -static int -vdev_cache_lastused_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; - const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - - int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused); - if (likely(cmp)) - return (cmp); - - /* - * Among equally old entries, sort by offset to ensure uniqueness. - */ - return (vdev_cache_offset_compare(a1, a2)); -} - -/* - * Evict the specified entry from the cache. - */ -static void -vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) -{ - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - ASSERT3P(ve->ve_abd, !=, NULL); - - avl_remove(&vc->vc_lastused_tree, ve); - avl_remove(&vc->vc_offset_tree, ve); - abd_free(ve->ve_abd); - kmem_free(ve, sizeof (vdev_cache_entry_t)); -} - -/* - * Allocate an entry in the cache. At the point we don't have the data, - * we're just creating a placeholder so that multiple threads don't all - * go off and read the same blocks. - */ -static vdev_cache_entry_t * -vdev_cache_allocate(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - uint64_t offset = P2ALIGN(zio->io_offset, VCBS); - vdev_cache_entry_t *ve; - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - - if (zfs_vdev_cache_size == 0) - return (NULL); - - /* - * If adding a new entry would exceed the cache size, - * evict the oldest entry (LRU). 
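Editor's sketch of the LRU behaviour described in the header comment above (illustration only; the removed file keys two AVL trees, fills entries via delegated zios and copies data through ABDs). Entries are keyed by the read offset aligned down to the cache block size, and a full cache recycles its least recently used slot. With the numbers below it reproduces the 'one miss plus 127 hits' case for 128 back-to-back 512-byte reads quoted earlier:

    #include <stdint.h>
    #include <stdio.h>

    #define CACHE_BLOCK   (64 * 1024)  /* stands in for VCBS */
    #define CACHE_ENTRIES 4            /* stands in for the size tunable */

    typedef struct entry {
        uint64_t offset;    /* block-aligned offset (the lookup key) */
        uint64_t lastused;  /* logical timestamp for LRU ordering */
        int      valid;
    } entry_t;

    static entry_t  cache[CACHE_ENTRIES];
    static uint64_t now;

    /* Returns 1 on a hit; on a miss, installs a placeholder and returns 0. */
    static int
    cache_read(uint64_t io_offset)
    {
        uint64_t key = io_offset & ~(uint64_t)(CACHE_BLOCK - 1);
        entry_t *victim = NULL;

        now++;
        for (int i = 0; i < CACHE_ENTRIES; i++) {
            if (cache[i].valid && cache[i].offset == key) {
                cache[i].lastused = now;    /* refresh LRU position */
                return (1);
            }
            /* Prefer an empty slot, otherwise the least recently used one. */
            if (victim == NULL || !cache[i].valid ||
                (victim->valid && cache[i].lastused < victim->lastused))
                victim = &cache[i];
        }

        victim->offset = key;       /* "allocate": reserve the block */
        victim->lastused = now;
        victim->valid = 1;
        return (0);
    }

    int
    main(void)
    {
        int hits = 0, misses = 0;

        /* 128 sequential 512-byte reads inside one 64K cache block. */
        for (uint64_t off = 0; off < CACHE_BLOCK; off += 512) {
            if (cache_read(off))
                hits++;
            else
                misses++;
        }

        printf("hits %d, misses %d\n", hits, misses);  /* 127 and 1 */
        return (0);
    }
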
- */ - if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > - zfs_vdev_cache_size) { - ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) - return (NULL); - ASSERT3U(ve->ve_hits, !=, 0); - vdev_cache_evict(vc, ve); - } - - ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve->ve_offset = offset; - ve->ve_lastused = ddi_get_lbolt(); - ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); - - avl_add(&vc->vc_offset_tree, ve); - avl_add(&vc->vc_lastused_tree, ve); - - return (ve); -} - -static void -vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) -{ - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - - if (ve->ve_lastused != ddi_get_lbolt()) { - avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = ddi_get_lbolt(); - avl_add(&vc->vc_lastused_tree, ve); - } - - ve->ve_hits++; - abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); -} - -/* - * Fill a previously allocated cache entry with data. - */ -static void -vdev_cache_fill(zio_t *fio) -{ - vdev_t *vd = fio->io_vd; - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = fio->io_private; - zio_t *pio; - - ASSERT3U(fio->io_size, ==, VCBS); - - /* - * Add data to the cache. - */ - mutex_enter(&vc->vc_lock); - - ASSERT3P(ve->ve_fill_io, ==, fio); - ASSERT3U(ve->ve_offset, ==, fio->io_offset); - ASSERT3P(ve->ve_abd, ==, fio->io_abd); - - ve->ve_fill_io = NULL; - - /* - * Even if this cache line was invalidated by a missed write update, - * any reads that were queued up before the missed update are still - * valid, so we can satisfy them from this line before we evict it. - */ - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(fio, &zl)) != NULL) - vdev_cache_hit(vc, ve, pio); - - if (fio->io_error || ve->ve_missed_update) - vdev_cache_evict(vc, ve); - - mutex_exit(&vc->vc_lock); -} - -/* - * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. - */ -boolean_t -vdev_cache_read(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - zio_t *fio; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (B_FALSE); - - if (zio->io_size > zfs_vdev_cache_max) - return (B_FALSE); - - /* - * If the I/O straddles two or more cache blocks, don't cache it. 
- */ - if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) - return (B_FALSE); - - ASSERT3U(cache_phase + zio->io_size, <=, VCBS); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); - - if (ve != NULL) { - if (ve->ve_missed_update) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - if ((fio = ve->ve_fill_io) != NULL) { - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_delegations); - return (B_TRUE); - } - - vdev_cache_hit(vc, ve, zio); - zio_vdev_io_bypass(zio); - - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_hits); - return (B_TRUE); - } - - ve = vdev_cache_allocate(zio); - - if (ve == NULL) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); - - ve->ve_fill_io = fio; - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - - mutex_exit(&vc->vc_lock); - zio_nowait(fio); - VDCSTAT_BUMP(vdc_stat_misses); - - return (B_TRUE); -} - -/* - * Update cache contents upon write completion. - */ -void -vdev_cache_write(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t io_start = zio->io_offset; - uint64_t io_end = io_start + zio->io_size; - uint64_t min_offset = P2ALIGN(io_start, VCBS); - uint64_t max_offset = P2ROUNDUP(io_end, VCBS); - avl_index_t where; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = min_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); - - if (ve == NULL) - ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); - - while (ve != NULL && ve->ve_offset < max_offset) { - uint64_t start = MAX(ve->ve_offset, io_start); - uint64_t end = MIN(ve->ve_offset + VCBS, io_end); - - if (ve->ve_fill_io != NULL) { - ve->ve_missed_update = 1; - } else { - abd_copy_off(ve->ve_abd, zio->io_abd, - start - ve->ve_offset, start - io_start, - end - start); - } - ve = AVL_NEXT(&vc->vc_offset_tree, ve); - } - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_purge(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_init(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_offset_node)); - - avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_lastused_node)); -} - -void -vdev_cache_fini(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - vdev_cache_purge(vd); - - avl_destroy(&vc->vc_offset_tree); - avl_destroy(&vc->vc_lastused_tree); - - mutex_destroy(&vc->vc_lock); -} - -void -vdev_cache_stat_init(void) -{ - vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (vdc_ksp != NULL) { - vdc_ksp->ks_data = &vdc_stats; - kstat_install(vdc_ksp); - } -} - -void -vdev_cache_stat_fini(void) -{ - if (vdc_ksp != NULL) { - kstat_delete(vdc_ksp); - vdc_ksp = NULL; - } -} Index: 
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ /dev/null @@ -1,971 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 Joyent, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for disks. - */ - -extern ldi_ident_t zfs_li; - -static void vdev_disk_close(vdev_t *); - -typedef struct vdev_disk_ldi_cb { - list_node_t lcb_next; - ldi_callback_id_t lcb_id; -} vdev_disk_ldi_cb_t; - -/* - * Bypass the devid when opening a disk vdev. - * There have been issues where the devids of several devices were shuffled, - * causing pool open failures. Note, that this flag is intended to be used - * for pool recovery only. - * - * Note that if a pool is imported with the devids bypassed, all its vdevs will - * cease storing devid information permanently. In practice, the devid is rarely - * useful as vdev paths do not tend to change unless the hardware is - * reconfigured. That said, if the paths do change and a pool fails to open - * automatically at boot, a simple zpool import should re-scan the paths and fix - * the issue. - */ -boolean_t vdev_disk_bypass_devid = B_FALSE; - -static void -vdev_disk_alloc(vdev_t *vd) -{ - vdev_disk_t *dvd; - - dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); - /* - * Create the LDI event callback list. - */ - list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), - offsetof(vdev_disk_ldi_cb_t, lcb_next)); -} - -static void -vdev_disk_free(vdev_t *vd) -{ - vdev_disk_t *dvd = vd->vdev_tsd; - vdev_disk_ldi_cb_t *lcb; - - if (dvd == NULL) - return; - - /* - * We have already closed the LDI handle. Clean up the LDI event - * callbacks and free vd->vdev_tsd. - */ - while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { - list_remove(&dvd->vd_ldi_cbs, lcb); - (void) ldi_ev_remove_callbacks(lcb->lcb_id); - kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); - } - list_destroy(&dvd->vd_ldi_cbs); - kmem_free(dvd, sizeof (vdev_disk_t)); - vd->vdev_tsd = NULL; -} - -/* ARGSUSED */ -static int -vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg, - void *ev_data) -{ - vdev_t *vd = (vdev_t *)arg; - vdev_disk_t *dvd = vd->vdev_tsd; - - /* - * Ignore events other than offline. 
- */ - if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) - return (LDI_EV_SUCCESS); - - /* - * All LDI handles must be closed for the state change to succeed, so - * call on vdev_disk_close() to do this. - * - * We inform vdev_disk_close that it is being called from offline - * notify context so it will defer cleanup of LDI event callbacks and - * freeing of vd->vdev_tsd to the offline finalize or a reopen. - */ - dvd->vd_ldi_offline = B_TRUE; - vdev_disk_close(vd); - - /* - * Now that the device is closed, request that the spa_async_thread - * mark the device as REMOVED and notify FMA of the removal. - */ - zfs_post_remove(vd->vdev_spa, vd); - vd->vdev_remove_wanted = B_TRUE; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); - - return (LDI_EV_SUCCESS); -} - -/* ARGSUSED */ -static void -vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, - int ldi_result, void *arg, void *ev_data) -{ - vdev_t *vd = (vdev_t *)arg; - - /* - * Ignore events other than offline. - */ - if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) - return; - - /* - * We have already closed the LDI handle in notify. - * Clean up the LDI event callbacks and free vd->vdev_tsd. - */ - vdev_disk_free(vd); - - /* - * Request that the vdev be reopened if the offline state change was - * unsuccessful. - */ - if (ldi_result != LDI_EV_SUCCESS) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); - } -} - -static ldi_ev_callback_t vdev_disk_off_callb = { - .cb_vers = LDI_EV_CB_VERS, - .cb_notify = vdev_disk_off_notify, - .cb_finalize = vdev_disk_off_finalize -}; - -/* ARGSUSED */ -static void -vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, - int ldi_result, void *arg, void *ev_data) -{ - vdev_t *vd = (vdev_t *)arg; - - /* - * Ignore events other than degrade. - */ - if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0) - return; - - /* - * Degrade events always succeed. Mark the vdev as degraded. - * This status is purely informative for the user. - */ - (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0); -} - -static ldi_ev_callback_t vdev_disk_dgrd_callb = { - .cb_vers = LDI_EV_CB_VERS, - .cb_notify = NULL, - .cb_finalize = vdev_disk_dgrd_finalize -}; - -static void -vdev_disk_hold(vdev_t *vd) -{ - ddi_devid_t devid; - char *minor; - - ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') - return; - - /* - * Only prefetch path and devid info if the device has - * never been opened. 
- */ - if (vd->vdev_tsd != NULL) - return; - - if (vd->vdev_wholedisk == -1ULL) { - size_t len = strlen(vd->vdev_path) + 3; - char *buf = kmem_alloc(len, KM_SLEEP); - - (void) snprintf(buf, len, "%ss0", vd->vdev_path); - - (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); - kmem_free(buf, len); - } - - if (vd->vdev_name_vp == NULL) - (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); - - if (vd->vdev_devid != NULL && - ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { - (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); - ddi_devid_str_free(minor); - ddi_devid_free(devid); - } -} - -static void -vdev_disk_rele(vdev_t *vd) -{ - ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); - - if (vd->vdev_name_vp) { - VN_RELE_ASYNC(vd->vdev_name_vp, - dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); - vd->vdev_name_vp = NULL; - } - if (vd->vdev_devid_vp) { - VN_RELE_ASYNC(vd->vdev_devid_vp, - dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); - vd->vdev_devid_vp = NULL; - } -} - -/* - * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when - * even a fallback to DKIOCGMEDIAINFO fails. - */ -#ifdef DEBUG -#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) -#else -#define VDEV_DEBUG(...) /* Nothing... */ -#endif - -static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) -{ - spa_t *spa = vd->vdev_spa; - vdev_disk_t *dvd = vd->vdev_tsd; - ldi_ev_cookie_t ecookie; - vdev_disk_ldi_cb_t *lcb; - union { - struct dk_minfo_ext ude; - struct dk_minfo ud; - } dks; - struct dk_minfo_ext *dkmext = &dks.ude; - struct dk_minfo *dkm = &dks.ud; - int error; - dev_t dev; - int otyp; - boolean_t validate_devid = B_FALSE; - ddi_devid_t devid; - uint64_t capacity = 0, blksz = 0, pbsize; - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. - */ - if (dvd != NULL) { - if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { - /* - * If we are opening a device in its offline notify - * context, the LDI handle was just closed. Clean - * up the LDI event callbacks and free vd->vdev_tsd. - */ - vdev_disk_free(vd); - } else { - ASSERT(vd->vdev_reopening); - goto skip_open; - } - } - - /* - * Create vd->vdev_tsd. - */ - vdev_disk_alloc(vd); - dvd = vd->vdev_tsd; - - /* - * Allow bypassing the devid. - */ - if (vd->vdev_devid != NULL && vdev_disk_bypass_devid) { - vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed", - vd->vdev_devid); - spa_strfree(vd->vdev_devid); - vd->vdev_devid = NULL; - } - - /* - * When opening a disk device, we want to preserve the user's original - * intent. We always want to open the device by the path the user gave - * us, even if it is one of multiple paths to the save device. But we - * also want to be able to survive disks being removed/recabled. - * Therefore the sequence of opening devices is: - * - * 1. Try opening the device by path. For legacy pools without the - * 'whole_disk' property, attempt to fix the path by appending 's0'. - * - * 2. If the devid of the device matches the stored value, return - * success. - * - * 3. Otherwise, the device may have moved. Try opening the device - * by the devid instead. 
- */ - if (vd->vdev_devid != NULL) { - if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, - &dvd->vd_minor) != 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - vdev_dbgmsg(vd, "vdev_disk_open: invalid " - "vdev_devid '%s'", vd->vdev_devid); - return (SET_ERROR(EINVAL)); - } - } - - error = EINVAL; /* presume failure */ - - if (vd->vdev_path != NULL) { - - if (vd->vdev_wholedisk == -1ULL) { - size_t len = strlen(vd->vdev_path) + 3; - char *buf = kmem_alloc(len, KM_SLEEP); - - (void) snprintf(buf, len, "%ss0", vd->vdev_path); - - error = ldi_open_by_name(buf, spa_mode(spa), kcred, - &dvd->vd_lh, zfs_li); - if (error == 0) { - spa_strfree(vd->vdev_path); - vd->vdev_path = buf; - vd->vdev_wholedisk = 1ULL; - } else { - kmem_free(buf, len); - } - } - - /* - * If we have not yet opened the device, try to open it by the - * specified path. - */ - if (error != 0) { - error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), - kcred, &dvd->vd_lh, zfs_li); - } - - /* - * Compare the devid to the stored value. - */ - if (error == 0 && vd->vdev_devid != NULL && - ldi_get_devid(dvd->vd_lh, &devid) == 0) { - if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { - /* - * A mismatch here is unexpected, log it. - */ - char *devid_str = ddi_devid_str_encode(devid, - dvd->vd_minor); - vdev_dbgmsg(vd, "vdev_disk_open: devid " - "mismatch: %s != %s", vd->vdev_devid, - devid_str); - cmn_err(CE_NOTE, "vdev_disk_open %s: devid " - "mismatch: %s != %s", vd->vdev_path, - vd->vdev_devid, devid_str); - ddi_devid_str_free(devid_str); - - error = SET_ERROR(EINVAL); - (void) ldi_close(dvd->vd_lh, spa_mode(spa), - kcred); - dvd->vd_lh = NULL; - } - ddi_devid_free(devid); - } - - /* - * If we succeeded in opening the device, but 'vdev_wholedisk' - * is not yet set, then this must be a slice. - */ - if (error == 0 && vd->vdev_wholedisk == -1ULL) - vd->vdev_wholedisk = 0; - } - - /* - * If we were unable to open by path, or the devid check fails, open by - * devid instead. - */ - if (error != 0 && vd->vdev_devid != NULL) { - error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, - spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); - if (error != 0) { - vdev_dbgmsg(vd, "Failed to open by devid (%s)", - vd->vdev_devid); - } - } - - /* - * If all else fails, then try opening by physical path (if available) - * or the logical path (if we failed due to the devid check). While not - * as reliable as the devid, this will give us something, and the higher - * level vdev validation will prevent us from opening the wrong device. - */ - if (error) { - if (vd->vdev_devid != NULL) - validate_devid = B_TRUE; - - if (vd->vdev_physpath != NULL && - (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) - error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), - kcred, &dvd->vd_lh, zfs_li); - - /* - * Note that we don't support the legacy auto-wholedisk support - * as above. This hasn't been used in a very long time and we - * don't need to propagate its oddities to this edge condition. - */ - if (error && vd->vdev_path != NULL) - error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), - kcred, &dvd->vd_lh, zfs_li); - } - - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]", - error); - return (error); - } - - /* - * Now that the device has been successfully opened, update the devid - * if necessary. 
- */ - if (validate_devid && spa_writeable(spa) && - ldi_get_devid(dvd->vd_lh, &devid) == 0) { - if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { - char *vd_devid; - - vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); - vdev_dbgmsg(vd, "vdev_disk_open: update devid from " - "'%s' to '%s'", vd->vdev_devid, vd_devid); - cmn_err(CE_NOTE, "vdev_disk_open %s: update devid " - "from '%s' to '%s'", vd->vdev_path != NULL ? - vd->vdev_path : "?", vd->vdev_devid, vd_devid); - spa_strfree(vd->vdev_devid); - vd->vdev_devid = spa_strdup(vd_devid); - ddi_devid_str_free(vd_devid); - } - ddi_devid_free(devid); - } - - /* - * Once a device is opened, verify that the physical device path (if - * available) is up to date. - */ - if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && - ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { - char *physpath, *minorname; - - physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); - minorname = NULL; - if (ddi_dev_pathname(dev, otyp, physpath) == 0 && - ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && - (vd->vdev_physpath == NULL || - strcmp(vd->vdev_physpath, physpath) != 0)) { - if (vd->vdev_physpath) - spa_strfree(vd->vdev_physpath); - (void) strlcat(physpath, ":", MAXPATHLEN); - (void) strlcat(physpath, minorname, MAXPATHLEN); - vd->vdev_physpath = spa_strdup(physpath); - } - if (minorname) - kmem_free(minorname, strlen(minorname) + 1); - kmem_free(physpath, MAXPATHLEN); - } - - /* - * Register callbacks for the LDI offline event. - */ - if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == - LDI_EV_SUCCESS) { - lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); - list_insert_tail(&dvd->vd_ldi_cbs, lcb); - (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, - &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); - } - - /* - * Register callbacks for the LDI degrade event. - */ - if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == - LDI_EV_SUCCESS) { - lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); - list_insert_tail(&dvd->vd_ldi_cbs, lcb); - (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, - &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); - } -skip_open: - /* - * Determine the actual size of the device. - */ - if (ldi_get_size(dvd->vd_lh, psize) != 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - vdev_dbgmsg(vd, "vdev_disk_open: failed to get size"); - return (SET_ERROR(EINVAL)); - } - - *max_psize = *psize; - - /* - * Determine the device's minimum transfer size. - * If the ioctl isn't supported, assume DEV_BSIZE. - */ - if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, - (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { - capacity = dkmext->dki_capacity - 1; - blksz = dkmext->dki_lbsize; - pbsize = dkmext->dki_pbsize; - } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, - (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { - VDEV_DEBUG( - "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", - vd->vdev_path); - capacity = dkm->dki_capacity - 1; - blksz = dkm->dki_lbsize; - pbsize = blksz; - } else { - VDEV_DEBUG("vdev_disk_open(\"%s\"): " - "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", - vd->vdev_path, error); - pbsize = DEV_BSIZE; - } - - *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; - - if (vd->vdev_wholedisk == 1) { - int wce = 1; - - if (error == 0) { - /* - * If we have the capability to expand, we'd have - * found out via success from DKIOCGMEDIAINFO{,EXT}. - * Adjust max_psize upward accordingly since we know - * we own the whole disk now. 
- */ - *max_psize = capacity * blksz; - } - - /* - * Since we own the whole disk, try to enable disk write - * caching. We ignore errors because it's OK if we can't do it. - */ - (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, - FKIOCTL, kcred, NULL); - } - - /* - * Clear the nowritecache bit, so that on a vdev_reopen() we will - * try again. - */ - vd->vdev_nowritecache = B_FALSE; - - return (0); -} - -static void -vdev_disk_close(vdev_t *vd) -{ - vdev_disk_t *dvd = vd->vdev_tsd; - - if (vd->vdev_reopening || dvd == NULL) - return; - - if (dvd->vd_minor != NULL) { - ddi_devid_str_free(dvd->vd_minor); - dvd->vd_minor = NULL; - } - - if (dvd->vd_devid != NULL) { - ddi_devid_free(dvd->vd_devid); - dvd->vd_devid = NULL; - } - - if (dvd->vd_lh != NULL) { - (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); - dvd->vd_lh = NULL; - } - - vd->vdev_delayed_close = B_FALSE; - /* - * If we closed the LDI handle due to an offline notify from LDI, - * don't free vd->vdev_tsd or unregister the callbacks here; - * the offline finalize callback or a reopen will take care of it. - */ - if (dvd->vd_ldi_offline) - return; - - vdev_disk_free(vd); -} - -int -vdev_disk_physio(vdev_t *vd, caddr_t data, - size_t size, uint64_t offset, int flags, boolean_t isdump) -{ - vdev_disk_t *dvd = vd->vdev_tsd; - - /* - * If the vdev is closed, it's likely in the REMOVED or FAULTED state. - * Nothing to be done here but return failure. - */ - if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) - return (EIO); - - ASSERT(vd->vdev_ops == &vdev_disk_ops); - - /* - * If in the context of an active crash dump, use the ldi_dump(9F) - * call instead of ldi_strategy(9F) as usual. - */ - if (isdump) { - ASSERT3P(dvd, !=, NULL); - return (ldi_dump(dvd->vd_lh, data, lbtodb(offset), - lbtodb(size))); - } - - return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); -} - -int -vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, - size_t size, uint64_t offset, int flags) -{ - buf_t *bp; - int error = 0; - - if (vd_lh == NULL) - return (SET_ERROR(EINVAL)); - - ASSERT(flags & B_READ || flags & B_WRITE); - - bp = getrbuf(KM_SLEEP); - bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; - bp->b_bcount = size; - bp->b_un.b_addr = (void *)data; - bp->b_lblkno = lbtodb(offset); - bp->b_bufsize = size; - - error = ldi_strategy(vd_lh, bp); - ASSERT(error == 0); - if ((error = biowait(bp)) == 0 && bp->b_resid != 0) - error = SET_ERROR(EIO); - freerbuf(bp); - - return (error); -} - -static void -vdev_disk_io_intr(buf_t *bp) -{ - vdev_buf_t *vb = (vdev_buf_t *)bp; - zio_t *zio = vb->vb_io; - - /* - * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. - * Rather than teach the rest of the stack about other error - * possibilities (EFAULT, etc), we normalize the error value here. - */ - zio->io_error = (geterror(bp) != 0 ? 
SET_ERROR(EIO) : 0); - - if (zio->io_error == 0 && bp->b_resid != 0) - zio->io_error = SET_ERROR(EIO); - - if (zio->io_type == ZIO_TYPE_READ) { - abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size); - } else { - abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size); - } - - kmem_free(vb, sizeof (vdev_buf_t)); - - zio_delay_interrupt(zio); -} - -static void -vdev_disk_ioctl_free(zio_t *zio) -{ - kmem_free(zio->io_vsd, sizeof (struct dk_callback)); -} - -static const zio_vsd_ops_t vdev_disk_vsd_ops = { - vdev_disk_ioctl_free, - zio_vsd_default_cksum_report -}; - -static void -vdev_disk_ioctl_done(void *zio_arg, int error) -{ - zio_t *zio = zio_arg; - - zio->io_error = error; - - zio_interrupt(zio); -} - -static void -vdev_disk_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_disk_t *dvd = vd->vdev_tsd; - vdev_buf_t *vb; - struct dk_callback *dkc; - buf_t *bp; - int error; - - /* - * If the vdev is closed, it's likely in the REMOVED or FAULTED state. - * Nothing to be done here but return failure. - */ - if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - if (zio->io_type == ZIO_TYPE_IOCTL) { - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - switch (zio->io_cmd) { - - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - if (vd->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - - zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); - zio->io_vsd_ops = &vdev_disk_vsd_ops; - - dkc->dkc_callback = vdev_disk_ioctl_done; - dkc->dkc_flag = FLUSH_VOLATILE; - dkc->dkc_cookie = zio; - - error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, - (uintptr_t)dkc, FKIOCTL, kcred, NULL); - - if (error == 0) { - /* - * The ioctl will be done asychronously, - * and will call vdev_disk_ioctl_done() - * upon completion. - */ - return; - } - - zio->io_error = error; - - break; - - default: - zio->io_error = SET_ERROR(ENOTSUP); - } - - zio_execute(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - zio->io_target_timestamp = zio_handle_io_delay(zio); - - vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - - vb->vb_io = zio; - bp = &vb->vb_buf; - - bioinit(bp); - bp->b_flags = B_BUSY | B_NOCACHE | - (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); - if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) - bp->b_flags |= B_FAILFAST; - bp->b_bcount = zio->io_size; - - if (zio->io_type == ZIO_TYPE_READ) { - bp->b_un.b_addr = - abd_borrow_buf(zio->io_abd, zio->io_size); - } else { - bp->b_un.b_addr = - abd_borrow_buf_copy(zio->io_abd, zio->io_size); - } - - bp->b_lblkno = lbtodb(zio->io_offset); - bp->b_bufsize = zio->io_size; - bp->b_iodone = (int (*)())vdev_disk_io_intr; - - /* ldi_strategy() will return non-zero only on programming errors */ - VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); -} - -static void -vdev_disk_io_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - - /* - * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if - * the device has been removed. If this is the case, then we trigger an - * asynchronous removal of the device. Otherwise, probe the device and - * make sure it's still accessible. 
- */ - if (zio->io_error == EIO && !vd->vdev_remove_wanted) { - vdev_disk_t *dvd = vd->vdev_tsd; - int state = DKIO_NONE; - - if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, - FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { - /* - * We post the resource as soon as possible, instead of - * when the async removal actually happens, because the - * DE is using this information to discard previous I/O - * errors. - */ - zfs_post_remove(zio->io_spa, vd); - vd->vdev_remove_wanted = B_TRUE; - spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); - } else if (!vd->vdev_delayed_close) { - vd->vdev_delayed_close = B_TRUE; - } - } -} - -vdev_ops_t vdev_disk_ops = { - vdev_disk_open, - vdev_disk_close, - vdev_default_asize, - vdev_disk_io_start, - vdev_disk_io_done, - NULL, - NULL, - vdev_disk_hold, - vdev_disk_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -/* - * Given the root disk device devid or pathname, read the label from - * the device, and construct a configuration nvlist. - */ -int -vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) -{ - ldi_handle_t vd_lh; - vdev_label_t *label; - uint64_t s, size; - int l; - ddi_devid_t tmpdevid; - int error = -1; - char *minor_name; - - /* - * Read the device label and build the nvlist. - */ - if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, - &minor_name) == 0) { - error = ldi_open_by_devid(tmpdevid, minor_name, - FREAD, kcred, &vd_lh, zfs_li); - ddi_devid_free(tmpdevid); - ddi_devid_str_free(minor_name); - } - - if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, - zfs_li))) - return (error); - - if (ldi_get_size(vd_lh, &s)) { - (void) ldi_close(vd_lh, FREAD, kcred); - return (SET_ERROR(EIO)); - } - - size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); - label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); - - *config = NULL; - for (l = 0; l < VDEV_LABELS; l++) { - uint64_t offset, state, txg = 0; - - /* read vdev label */ - offset = vdev_label_offset(size, l, 0); - if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, - VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) - continue; - - if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, - sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { - *config = NULL; - continue; - } - - if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || state >= POOL_STATE_DESTROYED) { - nvlist_free(*config); - *config = NULL; - continue; - } - - if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0 || txg == 0) { - nvlist_free(*config); - *config = NULL; - continue; - } - - break; - } - - kmem_free(label, sizeof (vdev_label_t)); - (void) ldi_close(vd_lh, FREAD, kcred); - if (*config == NULL) - error = SET_ERROR(EIDRM); - - return (error); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ /dev/null @@ -1,307 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for files. - */ - -static taskq_t *vdev_file_taskq; - -void -vdev_file_init(void) -{ - vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), - minclsyspri, max_ncpus, INT_MAX, 0); -} - -void -vdev_file_fini(void) -{ - taskq_destroy(vdev_file_taskq); -} - -static void -vdev_file_hold(vdev_t *vd) -{ - ASSERT(vd->vdev_path != NULL); -} - -static void -vdev_file_rele(vdev_t *vd) -{ - ASSERT(vd->vdev_path != NULL); -} - -static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - vdev_file_t *vf; - vnode_t *vp; - vattr_t vattr; - int error; - - /* Rotational optimizations only make sense on block devices */ - vd->vdev_nonrot = B_TRUE; - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. - */ - if (vd->vdev_tsd != NULL) { - ASSERT(vd->vdev_reopening); - vf = vd->vdev_tsd; - vp = vf->vf_vnode; - goto skip_open; - } - - vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); - - /* - * We always open the files from the root of the global zone, even if - * we're in a local zone. If the user has gotten to this point, the - * administrator has already decided that the pool should be available - * to local zone users, so the underlying devices should be as well. - */ - ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); - error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, - spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); - - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; - return (error); - } - - vf->vf_vnode = vp; - -#ifdef _KERNEL - /* - * Make sure it's a regular file. - */ - if (vp->v_type != VREG) { -#ifdef __FreeBSD__ - (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); -#endif - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; -#ifdef __FreeBSD__ - kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; -#endif - return (SET_ERROR(ENODEV)); - } -#endif /* _KERNEL */ - -skip_open: - /* - * Determine the physical size of the file. 
- */ - vattr.va_mask = AT_SIZE; - vn_lock(vp, LK_SHARED | LK_RETRY); - error = VOP_GETATTR(vp, &vattr, kcred); - VOP_UNLOCK(vp); - if (error) { - (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; - return (error); - } - - vd->vdev_notrim = B_TRUE; - - *max_psize = *psize = vattr.va_size; - *logical_ashift = SPA_MINBLOCKSHIFT; - *physical_ashift = SPA_MINBLOCKSHIFT; - - return (0); -} - -static void -vdev_file_close(vdev_t *vd) -{ - vdev_file_t *vf = vd->vdev_tsd; - - if (vd->vdev_reopening || vf == NULL) - return; - - if (vf->vf_vnode != NULL) { - (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, - kcred, NULL); - } - - vd->vdev_delayed_close = B_FALSE; - kmem_free(vf, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; -} - -/* - * Implements the interrupt side for file vdev types. This routine will be - * called when the I/O completes allowing us to transfer the I/O to the - * interrupt taskqs. For consistency, the code structure mimics disk vdev - * types. - */ -static void -vdev_file_io_intr(zio_t *zio) -{ - zio_delay_interrupt(zio); -} - -static void -vdev_file_io_strategy(void *arg) -{ - zio_t *zio = arg; - vdev_t *vd = zio->io_vd; - vdev_file_t *vf; - vnode_t *vp; - void *addr; - ssize_t resid; - - vf = vd->vdev_tsd; - vp = vf->vf_vnode; - - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - if (zio->io_type == ZIO_TYPE_READ) { - addr = abd_borrow_buf(zio->io_abd, zio->io_size); - } else { - addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - } - - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vp, addr, zio->io_size, - zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - - if (zio->io_type == ZIO_TYPE_READ) { - abd_return_buf_copy(zio->io_abd, addr, zio->io_size); - } else { - abd_return_buf(zio->io_abd, addr, zio->io_size); - } - - if (resid != 0 && zio->io_error == 0) - zio->io_error = ENOSPC; - - vdev_file_io_intr(zio); -} - -static void -vdev_file_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_file_t *vf = vd->vdev_tsd; - - if (zio->io_type == ZIO_TYPE_IOCTL) { - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, - kcred, NULL); - break; - default: - zio->io_error = SET_ERROR(ENOTSUP); - } - - zio_execute(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - zio->io_target_timestamp = zio_handle_io_delay(zio); - - VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, - TQ_SLEEP), !=, 0); -} - -/* ARGSUSED */ -static void -vdev_file_io_done(zio_t *zio) -{ -} - -vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -/* - * From userland we access disks just like files. 
- */ -#ifndef _KERNEL - -vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ /dev/null @@ -1,1193 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2006 Pawel Jakub Dawidek - * All rights reserved. - * - * Portions Copyright (c) 2012 Martin Matuska - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for GEOM. - */ - -static g_attrchanged_t vdev_geom_attrchanged; -struct g_class zfs_vdev_class = { - .name = "ZFS::VDEV", - .version = G_VERSION, - .attrchanged = vdev_geom_attrchanged, -}; - -struct consumer_vdev_elem { - SLIST_ENTRY(consumer_vdev_elem) elems; - vdev_t *vd; -}; - -SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); -_Static_assert(sizeof(((struct g_consumer*)NULL)->private) - == sizeof(struct consumer_priv_t*), - "consumer_priv_t* can't be stored in g_consumer.private"); - -DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); - -SYSCTL_DECL(_vfs_zfs_vdev); -/* Don't send BIO_FLUSH. */ -static int vdev_geom_bio_flush_disable; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, - &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); -/* Don't send BIO_DELETE. */ -static int vdev_geom_bio_delete_disable; -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, - &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); - -/* Declare local functions */ -static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); - -/* - * Thread local storage used to indicate when a thread is probing geoms - * for their guids. If NULL, this thread is not tasting geoms. If non NULL, - * it is looking for a replacement for the vdev_t* that is its value. 
- */ -uint_t zfs_geom_probe_vdev_key; - -static void -vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp) -{ - int error; - uint16_t rate; - - error = g_getattr("GEOM::rotation_rate", cp, &rate); - if (error == 0 && rate == 1) - vd->vdev_nonrot = B_TRUE; - else - vd->vdev_nonrot = B_FALSE; -} - -static void -vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, - boolean_t do_null_update) -{ - boolean_t needs_update = B_FALSE; - char *physpath; - int error, physpath_len; - - physpath_len = MAXPATHLEN; - physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); - error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); - if (error == 0) { - char *old_physpath; - - /* g_topology lock ensures that vdev has not been closed */ - g_topology_assert(); - old_physpath = vd->vdev_physpath; - vd->vdev_physpath = spa_strdup(physpath); - - if (old_physpath != NULL) { - needs_update = (strcmp(old_physpath, - vd->vdev_physpath) != 0); - spa_strfree(old_physpath); - } else - needs_update = do_null_update; - } - g_free(physpath); - - /* - * If the physical path changed, update the config. - * Only request an update for previously unset physpaths if - * requested by the caller. - */ - if (needs_update) - spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); - -} - -static void -vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) -{ - char *old_physpath; - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem; - int error; - - priv = (struct consumer_priv_t*)&cp->private; - if (SLIST_EMPTY(priv)) - return; - - SLIST_FOREACH(elem, priv, elems) { - vdev_t *vd = elem->vd; - if (strcmp(attr, "GEOM::rotation_rate") == 0) { - vdev_geom_set_rotation_rate(vd, cp); - return; - } - if (strcmp(attr, "GEOM::physpath") == 0) { - vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE); - return; - } - } -} - -static void -vdev_geom_resize(struct g_consumer *cp) -{ - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem; - spa_t *spa; - vdev_t *vd; - - priv = (struct consumer_priv_t *)&cp->private; - if (SLIST_EMPTY(priv)) - return; - - SLIST_FOREACH(elem, priv, elems) { - vd = elem->vd; - if (vd->vdev_state != VDEV_STATE_HEALTHY) - continue; - spa = vd->vdev_spa; - if (!spa->spa_autoexpand) - continue; - vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); - } -} - -static void -vdev_geom_orphan(struct g_consumer *cp) -{ - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem; - - g_topology_assert(); - - priv = (struct consumer_priv_t*)&cp->private; - if (SLIST_EMPTY(priv)) - /* Vdev close in progress. Ignore the event. */ - return; - - /* - * Orphan callbacks occur from the GEOM event thread. - * Concurrent with this call, new I/O requests may be - * working their way through GEOM about to find out - * (only once executed by the g_down thread) that we've - * been orphaned from our disk provider. These I/Os - * must be retired before we can detach our consumer. - * This is most easily achieved by acquiring the - * SPA ZIO configuration lock as a writer, but doing - * so with the GEOM topology lock held would cause - * a lock order reversal. Instead, rely on the SPA's - * async removal support to invoke a close on this - * vdev once it is safe to do so. 
- */ - SLIST_FOREACH(elem, priv, elems) { - vdev_t *vd = elem->vd; - - vd->vdev_remove_wanted = B_TRUE; - spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); - } -} - -static struct g_consumer * -vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) -{ - struct g_geom *gp; - struct g_consumer *cp; - int error; - - g_topology_assert(); - - ZFS_LOG(1, "Attaching to %s.", pp->name); - - if (sanity) { - if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { - ZFS_LOG(1, "Failing attach of %s. " - "Incompatible sectorsize %d\n", - pp->name, pp->sectorsize); - return (NULL); - } else if (pp->mediasize < SPA_MINDEVSIZE) { - ZFS_LOG(1, "Failing attach of %s. " - "Incompatible mediasize %ju\n", - pp->name, pp->mediasize); - return (NULL); - } - } - - /* Do we have geom already? No? Create one. */ - LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { - if (gp->flags & G_GEOM_WITHER) - continue; - if (strcmp(gp->name, "zfs::vdev") != 0) - continue; - break; - } - if (gp == NULL) { - gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); - gp->orphan = vdev_geom_orphan; - gp->attrchanged = vdev_geom_attrchanged; - gp->resize = vdev_geom_resize; - cp = g_new_consumer(gp); - error = g_attach(cp, pp); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, - __LINE__, error); - vdev_geom_detach(cp, B_FALSE); - return (NULL); - } - error = g_access(cp, 1, 0, 1); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, - __LINE__, error); - vdev_geom_detach(cp, B_FALSE); - return (NULL); - } - ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); - } else { - /* Check if we are already connected to this provider. */ - LIST_FOREACH(cp, &gp->consumer, consumer) { - if (cp->provider == pp) { - ZFS_LOG(1, "Found consumer for %s.", pp->name); - break; - } - } - if (cp == NULL) { - cp = g_new_consumer(gp); - error = g_attach(cp, pp); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", - __func__, __LINE__, error); - vdev_geom_detach(cp, B_FALSE); - return (NULL); - } - error = g_access(cp, 1, 0, 1); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_access failed: %d\n", - __func__, __LINE__, error); - vdev_geom_detach(cp, B_FALSE); - return (NULL); - } - ZFS_LOG(1, "Created consumer for %s.", pp->name); - } else { - error = g_access(cp, 1, 0, 1); - if (error != 0) { - ZFS_LOG(1, "%s(%d): g_access failed: %d\n", - __func__, __LINE__, error); - return (NULL); - } - ZFS_LOG(1, "Used existing consumer for %s.", pp->name); - } - } - - if (vd != NULL) - vd->vdev_tsd = cp; - - cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; - return (cp); -} - -static void -vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) -{ - struct g_geom *gp; - - g_topology_assert(); - - ZFS_LOG(1, "Detaching from %s.", - cp->provider && cp->provider->name ? cp->provider->name : "NULL"); - - gp = cp->geom; - if (open_for_read) - g_access(cp, -1, 0, -1); - /* Destroy consumer on last close. */ - if (cp->acr == 0 && cp->ace == 0) { - if (cp->acw > 0) - g_access(cp, 0, -cp->acw, 0); - if (cp->provider != NULL) { - ZFS_LOG(1, "Destroying consumer for %s.", - cp->provider->name ? cp->provider->name : "NULL"); - g_detach(cp); - } - g_destroy_consumer(cp); - } - /* Destroy geom if there are no consumers left. 
*/ - if (LIST_EMPTY(&gp->consumer)) { - ZFS_LOG(1, "Destroyed geom %s.", gp->name); - g_wither_geom(gp, ENXIO); - } -} - -static void -vdev_geom_close_locked(vdev_t *vd) -{ - struct g_consumer *cp; - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem, *elem_temp; - - g_topology_assert(); - - cp = vd->vdev_tsd; - vd->vdev_delayed_close = B_FALSE; - if (cp == NULL) - return; - - ZFS_LOG(1, "Closing access to %s.", cp->provider->name); - KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); - priv = (struct consumer_priv_t*)&cp->private; - vd->vdev_tsd = NULL; - SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { - if (elem->vd == vd) { - SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); - g_free(elem); - } - } - - vdev_geom_detach(cp, B_TRUE); -} - -/* - * Issue one or more bios to the vdev in parallel - * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO - * operation is described by parallel entries from each array. There may be - * more bios actually issued than entries in the array - */ -static void -vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, - off_t *sizes, int *errors, int ncmds) -{ - struct bio **bios; - u_char *p; - off_t off, maxio, s, end; - int i, n_bios, j; - size_t bios_size; - - maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); - n_bios = 0; - - /* How many bios are required for all commands ? */ - for (i = 0; i < ncmds; i++) - n_bios += (sizes[i] + maxio - 1) / maxio; - - /* Allocate memory for the bios */ - bios_size = n_bios * sizeof(struct bio*); - bios = kmem_zalloc(bios_size, KM_SLEEP); - - /* Prepare and issue all of the bios */ - for (i = j = 0; i < ncmds; i++) { - off = offsets[i]; - p = datas[i]; - s = sizes[i]; - end = off + s; - ASSERT((off % cp->provider->sectorsize) == 0); - ASSERT((s % cp->provider->sectorsize) == 0); - - for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { - bios[j] = g_alloc_bio(); - bios[j]->bio_cmd = cmds[i]; - bios[j]->bio_done = NULL; - bios[j]->bio_offset = off; - bios[j]->bio_length = MIN(s, maxio); - bios[j]->bio_data = p; - g_io_request(bios[j], cp); - } - } - ASSERT(j == n_bios); - - /* Wait for all of the bios to complete, and clean them up */ - for (i = j = 0; i < ncmds; i++) { - off = offsets[i]; - s = sizes[i]; - end = off + s; - - for (; off < end; off += maxio, s -= maxio, j++) { - errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i]; - g_destroy_bio(bios[j]); - } - } - kmem_free(bios, bios_size); -} - -/* - * Read the vdev config from a device. Return the number of valid labels that - * were found. The vdev config will be returned in config if and only if at - * least one valid label was found. 
- */ -static int -vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) -{ - struct g_provider *pp; - nvlist_t *config; - vdev_phys_t *vdev_lists[VDEV_LABELS]; - char *buf; - size_t buflen; - uint64_t psize, state, txg; - off_t offsets[VDEV_LABELS]; - off_t size; - off_t sizes[VDEV_LABELS]; - int cmds[VDEV_LABELS]; - int errors[VDEV_LABELS]; - int l, nlabels; - - g_topology_assert_not(); - - pp = cp->provider; - ZFS_LOG(1, "Reading config from %s...", pp->name); - - psize = pp->mediasize; - psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); - - size = sizeof(*vdev_lists[0]) + pp->sectorsize - - ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1; - - buflen = sizeof(vdev_lists[0]->vp_nvlist); - - /* Create all of the IO requests */ - for (l = 0; l < VDEV_LABELS; l++) { - cmds[l] = BIO_READ; - vdev_lists[l] = kmem_alloc(size, KM_SLEEP); - offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; - sizes[l] = size; - errors[l] = 0; - ASSERT(offsets[l] % pp->sectorsize == 0); - } - - /* Issue the IO requests */ - vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, - VDEV_LABELS); - - /* Parse the labels */ - config = *configp = NULL; - nlabels = 0; - for (l = 0; l < VDEV_LABELS; l++) { - if (errors[l] != 0) - continue; - - buf = vdev_lists[l]->vp_nvlist; - - if (nvlist_unpack(buf, buflen, &config, 0) != 0) - continue; - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || state > POOL_STATE_L2CACHE) { - nvlist_free(config); - continue; - } - - if (state != POOL_STATE_SPARE && - state != POOL_STATE_L2CACHE && - (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0 || txg == 0)) { - nvlist_free(config); - continue; - } - - if (*configp != NULL) - nvlist_free(*configp); - *configp = config; - - nlabels++; - } - - /* Free the label storage */ - for (l = 0; l < VDEV_LABELS; l++) - kmem_free(vdev_lists[l], size); - - return (nlabels); -} - -static void -resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) -{ - nvlist_t **new_configs; - uint64_t i; - - if (id < *count) - return; - new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *), - KM_SLEEP); - for (i = 0; i < *count; i++) - new_configs[i] = (*configs)[i]; - if (*configs != NULL) - kmem_free(*configs, *count * sizeof(void *)); - *configs = new_configs; - *count = id + 1; -} - -static void -process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, - const char *name, uint64_t* known_pool_guid) -{ - nvlist_t *vdev_tree; - uint64_t pool_guid; - uint64_t vdev_guid, known_guid; - uint64_t id, txg, known_txg; - char *pname; - int i; - - if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || - strcmp(pname, name) != 0) - goto ignore; - - if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) - goto ignore; - - if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) - goto ignore; - - if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) - goto ignore; - - if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) - goto ignore; - - VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); - - if (*known_pool_guid != 0) { - if (pool_guid != *known_pool_guid) - goto ignore; - } else - *known_pool_guid = pool_guid; - - resize_configs(configs, count, id); - - if ((*configs)[id] != NULL) { - VERIFY(nvlist_lookup_uint64((*configs)[id], - ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); - if (txg <= known_txg) - goto ignore; - nvlist_free((*configs)[id]); - } - - (*configs)[id] = cfg; 
- return; - -ignore: - nvlist_free(cfg); -} - -int -vdev_geom_read_pool_label(const char *name, - nvlist_t ***configs, uint64_t *count) -{ - struct g_class *mp; - struct g_geom *gp; - struct g_provider *pp; - struct g_consumer *zcp; - nvlist_t *vdev_cfg; - uint64_t pool_guid; - int error, nlabels; - - DROP_GIANT(); - g_topology_lock(); - - *configs = NULL; - *count = 0; - pool_guid = 0; - LIST_FOREACH(mp, &g_classes, class) { - if (mp == &zfs_vdev_class) - continue; - LIST_FOREACH(gp, &mp->geom, geom) { - if (gp->flags & G_GEOM_WITHER) - continue; - LIST_FOREACH(pp, &gp->provider, provider) { - if (pp->flags & G_PF_WITHER) - continue; - zcp = vdev_geom_attach(pp, NULL, B_TRUE); - if (zcp == NULL) - continue; - g_topology_unlock(); - nlabels = vdev_geom_read_config(zcp, &vdev_cfg); - g_topology_lock(); - vdev_geom_detach(zcp, B_TRUE); - if (nlabels == 0) - continue; - ZFS_LOG(1, "successfully read vdev config"); - - process_vdev_config(configs, count, - vdev_cfg, name, &pool_guid); - } - } - } - g_topology_unlock(); - PICKUP_GIANT(); - - return (*count > 0 ? 0 : ENOENT); -} - -enum match { - NO_MATCH = 0, /* No matching labels found */ - TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid*/ - ZERO_MATCH = 1, /* Should never be returned */ - ONE_MATCH = 2, /* 1 label matching the vdev_guid */ - TWO_MATCH = 3, /* 2 label matching the vdev_guid */ - THREE_MATCH = 4, /* 3 label matching the vdev_guid */ - FULL_MATCH = 5 /* all labels match the vdev_guid */ -}; - -static enum match -vdev_attach_ok(vdev_t *vd, struct g_provider *pp) -{ - nvlist_t *config; - uint64_t pool_guid, top_guid, vdev_guid; - struct g_consumer *cp; - int nlabels; - - cp = vdev_geom_attach(pp, NULL, B_TRUE); - if (cp == NULL) { - ZFS_LOG(1, "Unable to attach tasting instance to %s.", - pp->name); - return (NO_MATCH); - } - g_topology_unlock(); - nlabels = vdev_geom_read_config(cp, &config); - g_topology_lock(); - vdev_geom_detach(cp, B_TRUE); - if (nlabels == 0) { - ZFS_LOG(1, "Unable to read config from %s.", pp->name); - return (NO_MATCH); - } - - pool_guid = 0; - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); - top_guid = 0; - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); - vdev_guid = 0; - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); - nvlist_free(config); - - /* - * Check that the label's pool guid matches the desired guid. - * Inactive spares and L2ARCs do not have any pool guid in the label. - */ - if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { - ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", - pp->name, - (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); - return (NO_MATCH); - } - - /* - * Check that the label's vdev guid matches the desired guid. - * The second condition handles possible race on vdev detach, when - * remaining vdev receives GUID of destroyed top level mirror vdev. 
- */ - if (vdev_guid == vd->vdev_guid) { - ZFS_LOG(1, "guids match for provider %s.", pp->name); - return (ZERO_MATCH + nlabels); - } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { - ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); - return (TOPGUID_MATCH); - } - ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", - pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); - return (NO_MATCH); -} - -static struct g_consumer * -vdev_geom_attach_by_guids(vdev_t *vd) -{ - struct g_class *mp; - struct g_geom *gp; - struct g_provider *pp, *best_pp; - struct g_consumer *cp; - const char *vdpath; - enum match match, best_match; - - g_topology_assert(); - - vdpath = vd->vdev_path + sizeof("/dev/") - 1; - cp = NULL; - best_pp = NULL; - best_match = NO_MATCH; - LIST_FOREACH(mp, &g_classes, class) { - if (mp == &zfs_vdev_class) - continue; - LIST_FOREACH(gp, &mp->geom, geom) { - if (gp->flags & G_GEOM_WITHER) - continue; - LIST_FOREACH(pp, &gp->provider, provider) { - match = vdev_attach_ok(vd, pp); - if (match > best_match) { - best_match = match; - best_pp = pp; - } else if (match == best_match) { - if (strcmp(pp->name, vdpath) == 0) { - best_pp = pp; - } - } - if (match == FULL_MATCH) - goto out; - } - } - } - -out: - if (best_pp) { - cp = vdev_geom_attach(best_pp, vd, B_TRUE); - if (cp == NULL) { - printf("ZFS WARNING: Unable to attach to %s.\n", - best_pp->name); - } - } - return (cp); -} - -static struct g_consumer * -vdev_geom_open_by_guids(vdev_t *vd) -{ - struct g_consumer *cp; - char *buf; - size_t len; - - g_topology_assert(); - - ZFS_LOG(1, "Searching by guids [%ju:%ju].", - (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); - cp = vdev_geom_attach_by_guids(vd); - if (cp != NULL) { - len = strlen(cp->provider->name) + strlen("/dev/") + 1; - buf = kmem_alloc(len, KM_SLEEP); - - snprintf(buf, len, "/dev/%s", cp->provider->name); - spa_strfree(vd->vdev_path); - vd->vdev_path = buf; - - ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", - (uintmax_t)spa_guid(vd->vdev_spa), - (uintmax_t)vd->vdev_guid, cp->provider->name); - } else { - ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", - (uintmax_t)spa_guid(vd->vdev_spa), - (uintmax_t)vd->vdev_guid); - } - - return (cp); -} - -static struct g_consumer * -vdev_geom_open_by_path(vdev_t *vd, int check_guid) -{ - struct g_provider *pp; - struct g_consumer *cp; - - g_topology_assert(); - - cp = NULL; - pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); - if (pp != NULL) { - ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); - if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) - cp = vdev_geom_attach(pp, vd, B_FALSE); - } - - return (cp); -} - -static int -vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - struct g_provider *pp; - struct g_consumer *cp; - size_t bufsize; - int error; - - /* Set the TLS to indicate downstack that we should not access zvols*/ - VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); - } - - /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. 
- */ - if ((cp = vd->vdev_tsd) != NULL) { - ASSERT(vd->vdev_reopening); - goto skip_open; - } - - DROP_GIANT(); - g_topology_lock(); - error = 0; - - if (vd->vdev_spa->spa_splitting_newspa || - (vd->vdev_prevstate == VDEV_STATE_UNKNOWN && - vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || - vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) { - /* - * We are dealing with a vdev that hasn't been previously - * opened (since boot), and we are not loading an - * existing pool configuration. This looks like a - * vdev add operation to a new or existing pool. - * Assume the user knows what he/she is doing and find - * GEOM provider by its name, ignoring GUID mismatches. - * - * XXPOLICY: It would be safer to only allow a device - * that is unlabeled or labeled but missing - * GUID information to be opened in this fashion, - * unless we are doing a split, in which case we - * should allow any guid. - */ - cp = vdev_geom_open_by_path(vd, 0); - } else { - /* - * Try using the recorded path for this device, but only - * accept it if its label data contains the expected GUIDs. - */ - cp = vdev_geom_open_by_path(vd, 1); - if (cp == NULL) { - /* - * The device at vd->vdev_path doesn't have the - * expected GUIDs. The disks might have merely - * moved around so try all other GEOM providers - * to find one with the right GUIDs. - */ - cp = vdev_geom_open_by_guids(vd); - } - } - - /* Clear the TLS now that tasting is done */ - VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); - - if (cp == NULL) { - ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); - error = ENOENT; - } else { - struct consumer_priv_t *priv; - struct consumer_vdev_elem *elem; - int spamode; - - priv = (struct consumer_priv_t*)&cp->private; - if (cp->private == NULL) - SLIST_INIT(priv); - elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO); - elem->vd = vd; - SLIST_INSERT_HEAD(priv, elem, elems); - - spamode = spa_mode(vd->vdev_spa); - if (cp->provider->sectorsize > VDEV_PAD_SIZE || - !ISP2(cp->provider->sectorsize)) { - ZFS_LOG(1, "Provider %s has unsupported sectorsize.", - cp->provider->name); - - vdev_geom_close_locked(vd); - error = EINVAL; - cp = NULL; - } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { - int i; - - for (i = 0; i < 5; i++) { - error = g_access(cp, 0, 1, 0); - if (error == 0) - break; - g_topology_unlock(); - tsleep(vd, 0, "vdev", hz / 2); - g_topology_lock(); - } - if (error != 0) { - printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n", - cp->provider->name, error); - vdev_geom_close_locked(vd); - cp = NULL; - } - } - } - - /* Fetch initial physical path information for this device. */ - if (cp != NULL) { - vdev_geom_attrchanged(cp, "GEOM::physpath"); - - /* Set other GEOM characteristics */ - vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE); - vdev_geom_set_rotation_rate(vd, cp); - } - - g_topology_unlock(); - PICKUP_GIANT(); - if (cp == NULL) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", - error); - return (error); - } -skip_open: - pp = cp->provider; - - /* - * Determine the actual size of the device. - */ - *max_psize = *psize = pp->mediasize; - - /* - * Determine the device's minimum transfer size and preferred - * transfer size. 
- */ - *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; - *physical_ashift = 0; - if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && - pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0) - *physical_ashift = highbit(pp->stripesize) - 1; - - /* - * Clear the nowritecache settings, so that on a vdev_reopen() - * we will try again. - */ - vd->vdev_nowritecache = B_FALSE; - - return (0); -} - -static void -vdev_geom_close(vdev_t *vd) -{ - struct g_consumer *cp; - int locked; - - cp = vd->vdev_tsd; - - DROP_GIANT(); - locked = g_topology_locked(); - if (!locked) - g_topology_lock(); - - if (!vd->vdev_reopening || - (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || - (cp->provider != NULL && cp->provider->error != 0)))) - vdev_geom_close_locked(vd); - - if (!locked) - g_topology_unlock(); - PICKUP_GIANT(); -} - -static void -vdev_geom_io_intr(struct bio *bp) -{ - vdev_t *vd; - zio_t *zio; - - zio = bp->bio_caller1; - vd = zio->io_vd; - zio->io_error = bp->bio_error; - if (zio->io_error == 0 && bp->bio_resid != 0) - zio->io_error = SET_ERROR(EIO); - - switch(zio->io_error) { - case ENOTSUP: - /* - * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know - * that future attempts will never succeed. In this case - * we set a persistent flag so that we don't bother with - * requests in the future. - */ - switch(bp->bio_cmd) { - case BIO_FLUSH: - vd->vdev_nowritecache = B_TRUE; - break; - case BIO_DELETE: - vd->vdev_notrim = B_TRUE; - break; - } - break; - case ENXIO: - if (!vd->vdev_remove_wanted) { - /* - * If provider's error is set we assume it is being - * removed. - */ - if (bp->bio_to->error != 0) { - vd->vdev_remove_wanted = B_TRUE; - spa_async_request(zio->io_spa, - SPA_ASYNC_REMOVE); - } else if (!vd->vdev_delayed_close) { - vd->vdev_delayed_close = B_TRUE; - } - } - break; - } - - /* - * We have to split bio freeing into two parts, because the ABD code - * cannot be called in this context and vdev_op_io_done is not called - * for ZIO_TYPE_IOCTL zio-s. 
- */ - if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { - g_destroy_bio(bp); - zio->io_bio = NULL; - } - zio_delay_interrupt(zio); -} - -static void -vdev_geom_io_start(zio_t *zio) -{ - vdev_t *vd; - struct g_consumer *cp; - struct bio *bp; - int error; - - vd = zio->io_vd; - - switch (zio->io_type) { - case ZIO_TYPE_IOCTL: - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } else { - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - if (zfs_nocacheflush || vdev_geom_bio_flush_disable) - break; - if (vd->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - goto sendreq; - default: - zio->io_error = SET_ERROR(ENOTSUP); - } - } - - zio_execute(zio); - return; - case ZIO_TYPE_FREE: - if (vd->vdev_notrim) { - zio->io_error = SET_ERROR(ENOTSUP); - } else if (!vdev_geom_bio_delete_disable) { - goto sendreq; - } - zio_execute(zio); - return; - } -sendreq: - ASSERT(zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE || - zio->io_type == ZIO_TYPE_FREE || - zio->io_type == ZIO_TYPE_IOCTL); - - cp = vd->vdev_tsd; - if (cp == NULL) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - bp = g_alloc_bio(); - bp->bio_caller1 = zio; - switch (zio->io_type) { - case ZIO_TYPE_READ: - case ZIO_TYPE_WRITE: - zio->io_target_timestamp = zio_handle_io_delay(zio); - bp->bio_offset = zio->io_offset; - bp->bio_length = zio->io_size; - if (zio->io_type == ZIO_TYPE_READ) { - bp->bio_cmd = BIO_READ; - bp->bio_data = - abd_borrow_buf(zio->io_abd, zio->io_size); - } else { - bp->bio_cmd = BIO_WRITE; - bp->bio_data = - abd_borrow_buf_copy(zio->io_abd, zio->io_size); - } - break; - case ZIO_TYPE_FREE: - bp->bio_cmd = BIO_DELETE; - bp->bio_data = NULL; - bp->bio_offset = zio->io_offset; - bp->bio_length = zio->io_size; - break; - case ZIO_TYPE_IOCTL: - bp->bio_cmd = BIO_FLUSH; - bp->bio_data = NULL; - bp->bio_offset = cp->provider->mediasize; - bp->bio_length = 0; - break; - } - bp->bio_done = vdev_geom_io_intr; - zio->io_bio = bp; - - g_io_request(bp, cp); -} - -static void -vdev_geom_io_done(zio_t *zio) -{ - struct bio *bp = zio->io_bio; - - if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { - ASSERT(bp == NULL); - return; - } - - if (bp == NULL) { - ASSERT3S(zio->io_error, ==, ENXIO); - return; - } - - if (zio->io_type == ZIO_TYPE_READ) - abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size); - else - abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size); - - g_destroy_bio(bp); - zio->io_bio = NULL; -} - -static void -vdev_geom_hold(vdev_t *vd) -{ -} - -static void -vdev_geom_rele(vdev_t *vd) -{ -} - -vdev_ops_t vdev_geom_ops = { - vdev_geom_open, - vdev_geom_close, - vdev_default_asize, - vdev_geom_io_start, - vdev_geom_io_done, - NULL, - NULL, - vdev_geom_hold, - vdev_geom_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c +++ /dev/null @@ -1,1849 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. 
- * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2014, 2017 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * An indirect vdev corresponds to a vdev that has been removed. Since - * we cannot rewrite block pointers of snapshots, etc., we keep a - * mapping from old location on the removed device to the new location - * on another device in the pool and use this mapping whenever we need - * to access the DVA. Unfortunately, this mapping did not respect - * logical block boundaries when it was first created, and so a DVA on - * this indirect vdev may be "split" into multiple sections that each - * map to a different location. As a consequence, not all DVAs can be - * translated to an equivalent new DVA. Instead we must provide a - * "vdev_remap" operation that executes a callback on each contiguous - * segment of the new location. This function is used in multiple ways: - * - * - i/os to this vdev use the callback to determine where the - * data is now located, and issue child i/os for each segment's new - * location. - * - * - frees and claims to this vdev use the callback to free or claim - * each mapped segment. (Note that we don't actually need to claim - * log blocks on indirect vdevs, because we don't allocate to - * removing vdevs. However, zdb uses zio_claim() for its leak - * detection.) - */ - -/* - * "Big theory statement" for how we mark blocks obsolete. - * - * When a block on an indirect vdev is freed or remapped, a section of - * that vdev's mapping may no longer be referenced (aka "obsolete"). We - * keep track of how much of each mapping entry is obsolete. When - * an entry becomes completely obsolete, we can remove it, thus reducing - * the memory used by the mapping. The complete picture of obsolescence - * is given by the following data structures, described below: - * - the entry-specific obsolete count - * - the vdev-specific obsolete spacemap - * - the pool-specific obsolete bpobj - * - * == On disk data structures used == - * - * We track the obsolete space for the pool using several objects. Each - * of these objects is created on demand and freed when no longer - * needed, and is assumed to be empty if it does not exist. - * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. - * - * - Each vic_mapping_object (associated with an indirect vdev) can - * have a vimp_counts_object. This is an array of uint32_t's - * with the same number of entries as the vic_mapping_object. When - * the mapping is condensed, entries from the vic_obsolete_sm_object - * (see below) are folded into the counts. Therefore, each - * obsolete_counts entry tells us the number of bytes in the - * corresponding mapping entry that were not referenced when the - * mapping was last condensed. - * - * - Each indirect or removing vdev can have a vic_obsolete_sm_object. - * This is a space map containing an alloc entry for every DVA that - * has been obsoleted since the last time this indirect vdev was - * condensed. We use this object in order to improve performance - * when marking a DVA as obsolete. Instead of modifying an arbitrary - * offset of the vimp_counts_object, we only need to append an entry - * to the end of this object. 
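The append-only optimization described above can be illustrated outside the DMU. The sketch below uses hypothetical types (not the on-disk format): freed ranges are appended to a log, and folding that log into per-entry obsolete counts is what condensing later does with the obsolete space map. It assumes each logged range falls inside a single mapping entry.

#include <stdint.h>
#include <stdio.h>

/* One mapping entry: [src_offset, src_offset + size) on the removed vdev. */
typedef struct {
    uint64_t src_offset;
    uint64_t size;
} map_entry_t;

/* One appended "this range became obsolete" record. */
typedef struct {
    uint64_t offset;
    uint64_t size;
} obsolete_rec_t;

/*
 * Fold the append-only log into the per-entry obsolete counts.  This toy
 * version assumes every logged range lies inside a single mapping entry;
 * the real code walks ranges that may cross entries.
 */
static void
fold_obsolete_log(const map_entry_t *map, uint32_t *counts, int nmap,
    const obsolete_rec_t *log, int nlog)
{
    for (int i = 0; i < nlog; i++) {
        for (int j = 0; j < nmap; j++) {
            uint64_t end = map[j].src_offset + map[j].size;

            if (log[i].offset >= map[j].src_offset && log[i].offset < end) {
                counts[j] += log[i].size;
                break;
            }
        }
    }
}

int
main(void)
{
    map_entry_t map[] = { { 0, 4096 }, { 4096, 8192 } };
    uint32_t counts[2] = { 0, 0 };
    obsolete_rec_t log[] = { { 512, 1024 }, { 6144, 2048 } };

    fold_obsolete_log(map, counts, 2, log, 2);
    printf("entry0=%u entry1=%u\n", counts[0], counts[1]); /* 1024 2048 */
    return (0);
}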
When a DVA becomes obsolete, it is - * added to the obsolete space map. This happens when the DVA is - * freed, remapped and not referenced by a snapshot, or the last - * snapshot referencing it is destroyed. - * - * - Each dataset can have a ds_remap_deadlist object. This is a - * deadlist object containing all blocks that were remapped in this - * dataset but referenced in a previous snapshot. Blocks can *only* - * appear on this list if they were remapped (dsl_dataset_block_remapped); - * blocks that were killed in a head dataset are put on the normal - * ds_deadlist and marked obsolete when they are freed. - * - * - The pool can have a dp_obsolete_bpobj. This is a list of blocks - * in the pool that need to be marked obsolete. When a snapshot is - * destroyed, we move some of the ds_remap_deadlist to the obsolete - * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then - * asynchronously process the obsolete bpobj, moving its entries to - * the specific vdevs' obsolete space maps. - * - * == Summary of how we mark blocks as obsolete == - * - * - When freeing a block: if any DVA is on an indirect vdev, append to - * vic_obsolete_sm_object. - * - When remapping a block, add dva to ds_remap_deadlist (if prev snap - * references; otherwise append to vic_obsolete_sm_object). - * - When freeing a snapshot: move parts of ds_remap_deadlist to - * dp_obsolete_bpobj (same algorithm as ds_deadlist). - * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to - * individual vdev's vic_obsolete_sm_object. - */ - -/* - * "Big theory statement" for how we condense indirect vdevs. - * - * Condensing an indirect vdev's mapping is the process of determining - * the precise counts of obsolete space for each mapping entry (by - * integrating the obsolete spacemap into the obsolete counts) and - * writing out a new mapping that contains only referenced entries. - * - * We condense a vdev when we expect the mapping to shrink (see - * vdev_indirect_should_condense()), but only perform one condense at a - * time to limit the memory usage. In addition, we use a separate - * open-context thread (spa_condense_indirect_thread) to incrementally - * create the new mapping object in a way that minimizes the impact on - * the rest of the system. - * - * == Generating a new mapping == - * - * To generate a new mapping, we follow these steps: - * - * 1. Save the old obsolete space map and create a new mapping object - * (see spa_condense_indirect_start_sync()). This initializes the - * spa_condensing_indirect_phys with the "previous obsolete space map", - * which is now read only. Newly obsolete DVAs will be added to a - * new (initially empty) obsolete space map, and will not be - * considered as part of this condense operation. - * - * 2. Construct in memory the precise counts of obsolete space for each - * mapping entry, by incorporating the obsolete space map into the - * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) - * - * 3. Iterate through each mapping entry, writing to the new mapping any - * entries that are not completely obsolete (i.e. which don't have - * obsolete count == mapping length). (See - * spa_condense_indirect_generate_new_mapping().) - * - * 4. Destroy the old mapping object and switch over to the new one - * (spa_condense_indirect_complete_sync). 
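Step 3 above amounts to a filter-and-copy over the old mapping: an entry whose obsolete count equals its length is dropped, everything else is appended to the new mapping. A toy version follows, using the same simplified entry type as the previous sketch; it is a sketch of the idea, not the syncing-context code.

#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t src_offset;
    uint64_t size;
} map_entry_t;

/*
 * Copy every entry that still has live data into new_map and return how
 * many entries were written; fully obsolete entries are simply skipped.
 */
static int
condense_mapping(const map_entry_t *old_map, const uint32_t *obsolete_counts,
    int n, map_entry_t *new_map)
{
    int out = 0;

    for (int i = 0; i < n; i++) {
        if (obsolete_counts[i] < old_map[i].size)
            new_map[out++] = old_map[i];
    }
    return (out);
}

int
main(void)
{
    map_entry_t old_map[] = { { 0, 4096 }, { 4096, 8192 }, { 12288, 4096 } };
    uint32_t counts[] = { 4096, 1024, 0 };  /* first entry fully obsolete */
    map_entry_t new_map[3];

    int n = condense_mapping(old_map, counts, 3, new_map);
    printf("kept %d of 3 entries\n", n);    /* kept 2 of 3 entries */
    return (0);
}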
- * - * == Restarting from failure == - * - * To restart the condense when we import/open the pool, we must start - * at the 2nd step above: reconstruct the precise counts in memory, - * based on the space map + counts. Then in the 3rd step, we start - * iterating where we left off: at vimp_max_offset of the new mapping - * object. - */ - -boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE; - -/* - * Condense if at least this percent of the bytes in the mapping is - * obsolete. With the default of 25%, the amount of space mapped - * will be reduced to 1% of its original size after at most 16 - * condenses. Higher values will condense less often (causing less - * i/o); lower values will reduce the mapping size more quickly. - */ -int zfs_indirect_condense_obsolete_pct = 25; - -/* - * Condense if the obsolete space map takes up more than this amount of - * space on disk (logically). This limits the amount of disk space - * consumed by the obsolete space map; the default of 1GB is small enough - * that we typically don't mind "wasting" it. - */ -uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; - -/* - * Don't bother condensing if the mapping uses less than this amount of - * memory. The default of 128KB is considered a "trivial" amount of - * memory and not worth reducing. - */ -uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; - -/* - * This is used by the test suite so that it can ensure that certain - * actions happen while in the middle of a condense (which might otherwise - * complete too quickly). If used to reduce the performance impact of - * condensing in production, a maximum value of 1 should be sufficient. - */ -int zfs_condense_indirect_commit_entry_delay_ticks = 0; - -/* - * If an indirect split block contains more than this many possible unique - * combinations when being reconstructed, consider it too computationally - * expensive to check them all. Instead, try at most 100 randomly-selected - * combinations each time the block is accessed. This allows all segment - * copies to participate fairly in the reconstruction when all combinations - * cannot be checked and prevents repeated use of one bad copy. - */ -int zfs_reconstruct_indirect_combinations_max = 256; - - -/* - * Enable to simulate damaged segments and validate reconstruction. - * Used by ztest - */ -unsigned long zfs_reconstruct_indirect_damage_fraction = 0; - -/* - * The indirect_child_t represents the vdev that we will read from, when we - * need to read all copies of the data (e.g. for scrub or reconstruction). - * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), - * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, - * ic_vdev is a child of the mirror. - */ -typedef struct indirect_child { - abd_t *ic_data; - vdev_t *ic_vdev; - - /* - * ic_duplicate is NULL when the ic_data contents are unique, when it - * is determined to be a duplicate it references the primary child. - */ - struct indirect_child *ic_duplicate; - list_node_t ic_node; /* node on is_unique_child */ -} indirect_child_t; - -/* - * The indirect_split_t represents one mapped segment of an i/o to the - * indirect vdev. For non-split (contiguously-mapped) blocks, there will be - * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. - * For split blocks, there will be several of these. - */ -typedef struct indirect_split { - list_node_t is_node; /* link on iv_splits */ - - /* - * is_split_offset is the offset into the i/o. 
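indirect_split_t (defined in this hunk) ends in a one-element is_child[] array and is sized with offsetof(indirect_split_t, is_child[n]) so the children live inline after the header. Below is a userland sketch of the same idiom using calloc; the struct names are illustrative, not the ZFS ones.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct child {
    int id;
} child_t;

typedef struct split {
    size_t  nchildren;
    child_t children[1];    /* variable length, allocated inline */
} split_t;

static split_t *
split_alloc(size_t n)
{
    /* Header plus n trailing children; mirrors offsetof(..., is_child[n]). */
    split_t *s = calloc(1, offsetof(split_t, children) + n * sizeof (child_t));

    if (s != NULL)
        s->nchildren = n;
    return (s);
}

int
main(void)
{
    split_t *s = split_alloc(3);

    if (s == NULL)
        return (1);
    for (size_t i = 0; i < s->nchildren; i++)
        s->children[i].id = (int)i;
    printf("last child id = %d\n", s->children[s->nchildren - 1].id); /* 2 */
    free(s);
    return (0);
}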
- * This is the sum of the previous splits' is_size's. - */ - uint64_t is_split_offset; - - vdev_t *is_vdev; /* top-level vdev */ - uint64_t is_target_offset; /* offset on is_vdev */ - uint64_t is_size; - int is_children; /* number of entries in is_child[] */ - int is_unique_children; /* number of entries in is_unique_child */ - list_t is_unique_child; - - /* - * is_good_child is the child that we are currently using to - * attempt reconstruction. - */ - indirect_child_t *is_good_child; - - indirect_child_t is_child[1]; /* variable-length */ -} indirect_split_t; - -/* - * The indirect_vsd_t is associated with each i/o to the indirect vdev. - * It is the "Vdev-Specific Data" in the zio_t's io_vsd. - */ -typedef struct indirect_vsd { - boolean_t iv_split_block; - boolean_t iv_reconstruct; - uint64_t iv_unique_combinations; - uint64_t iv_attempts; - uint64_t iv_attempts_max; - - list_t iv_splits; /* list of indirect_split_t's */ -} indirect_vsd_t; - -static void -vdev_indirect_map_free(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - indirect_split_t *is; - while ((is = list_head(&iv->iv_splits)) != NULL) { - for (int c = 0; c < is->is_children; c++) { - indirect_child_t *ic = &is->is_child[c]; - if (ic->ic_data != NULL) - abd_free(ic->ic_data); - } - list_remove(&iv->iv_splits, is); - - indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); - - list_destroy(&is->is_unique_child); - - kmem_free(is, - offsetof(indirect_split_t, is_child[is->is_children])); - } - kmem_free(iv, sizeof (*iv)); -} - -static const zio_vsd_ops_t vdev_indirect_vsd_ops = { - vdev_indirect_map_free, - zio_vsd_default_cksum_report -}; -/* - * Mark the given offset and size as being obsolete. - */ -void -vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) -{ - spa_t *spa = vd->vdev_spa; - - ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); - ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); - ASSERT(size > 0); - VERIFY(vdev_indirect_mapping_entry_for_offset( - vd->vdev_indirect_mapping, offset) != NULL); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - mutex_enter(&vd->vdev_obsolete_lock); - range_tree_add(vd->vdev_obsolete_segments, offset, size); - mutex_exit(&vd->vdev_obsolete_lock); - vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); - } -} - -/* - * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This - * wrapper is provided because the DMU does not know about vdev_t's and - * cannot directly call vdev_indirect_mark_obsolete. - */ -void -spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, - uint64_t size, dmu_tx_t *tx) -{ - vdev_t *vd = vdev_lookup_top(spa, vdev_id); - ASSERT(dmu_tx_is_syncing(tx)); - - /* The DMU can only remap indirect vdevs. 
*/ - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - vdev_indirect_mark_obsolete(vd, offset, size); -} - -static spa_condensing_indirect_t * -spa_condensing_indirect_create(spa_t *spa) -{ - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); - objset_t *mos = spa->spa_meta_objset; - - for (int i = 0; i < TXG_SIZE; i++) { - list_create(&sci->sci_new_mapping_entries[i], - sizeof (vdev_indirect_mapping_entry_t), - offsetof(vdev_indirect_mapping_entry_t, vime_node)); - } - - sci->sci_new_mapping = - vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); - - return (sci); -} - -static void -spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) -{ - for (int i = 0; i < TXG_SIZE; i++) - list_destroy(&sci->sci_new_mapping_entries[i]); - - if (sci->sci_new_mapping != NULL) - vdev_indirect_mapping_close(sci->sci_new_mapping); - - kmem_free(sci, sizeof (*sci)); -} - -boolean_t -vdev_indirect_should_condense(vdev_t *vd) -{ - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - spa_t *spa = vd->vdev_spa; - - ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); - - if (!zfs_condense_indirect_vdevs_enable) - return (B_FALSE); - - /* - * We can only condense one indirect vdev at a time. - */ - if (spa->spa_condensing_indirect != NULL) - return (B_FALSE); - - if (spa_shutting_down(spa)) - return (B_FALSE); - - /* - * The mapping object size must not change while we are - * condensing, so we can only condense indirect vdevs - * (not vdevs that are still in the middle of being removed). - */ - if (vd->vdev_ops != &vdev_indirect_ops) - return (B_FALSE); - - /* - * If nothing new has been marked obsolete, there is no - * point in condensing. - */ - if (vd->vdev_obsolete_sm == NULL) { - ASSERT0(vdev_obsolete_sm_object(vd)); - return (B_FALSE); - } - - ASSERT(vd->vdev_obsolete_sm != NULL); - - ASSERT3U(vdev_obsolete_sm_object(vd), ==, - space_map_object(vd->vdev_obsolete_sm)); - - uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); - uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); - uint64_t mapping_size = vdev_indirect_mapping_size(vim); - uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); - - ASSERT3U(bytes_obsolete, <=, bytes_mapped); - - /* - * If a high percentage of the bytes that are mapped have become - * obsolete, condense (unless the mapping is already small enough). - * This has a good chance of reducing the amount of memory used - * by the mapping. - */ - if (bytes_obsolete * 100 / bytes_mapped >= - zfs_indirect_condense_obsolete_pct && - mapping_size > zfs_condense_min_mapping_bytes) { - zfs_dbgmsg("should condense vdev %llu because obsolete " - "spacemap covers %d%% of %lluMB mapping", - (u_longlong_t)vd->vdev_id, - (int)(bytes_obsolete * 100 / bytes_mapped), - (u_longlong_t)bytes_mapped / 1024 / 1024); - return (B_TRUE); - } - - /* - * If the obsolete space map takes up too much space on disk, - * condense in order to free up this disk space. - */ - if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { - zfs_dbgmsg("should condense vdev %llu because obsolete sm " - "length %lluMB >= max size %lluMB", - (u_longlong_t)vd->vdev_id, - (u_longlong_t)obsolete_sm_size / 1024 / 1024, - (u_longlong_t)zfs_condense_max_obsolete_bytes / - 1024 / 1024); - return (B_TRUE); - } - - return (B_FALSE); -} - -/* - * This sync task completes (finishes) a condense, deleting the old - * mapping and replacing it with the new one. 
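vdev_indirect_should_condense() above boils down to three thresholds: the obsolete percentage, a minimum mapping size worth shrinking, and a cap on the obsolete space map's on-disk size. Here is a standalone predicate using the default tunable values quoted above; the function name is illustrative and the caller is assumed to guarantee a non-empty mapping.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static const uint64_t obsolete_pct = 25;                /* zfs_indirect_condense_obsolete_pct */
static const uint64_t min_mapping_bytes = 128 * 1024;   /* zfs_condense_min_mapping_bytes */
static const uint64_t max_obsolete_sm_bytes = 1024ULL * 1024 * 1024;
                                                        /* zfs_condense_max_obsolete_bytes */

/* bytes_mapped must be nonzero (a non-empty mapping), as in the real code. */
static bool
should_condense(uint64_t bytes_mapped, uint64_t bytes_obsolete,
    uint64_t mapping_size, uint64_t obsolete_sm_size)
{
    /* Enough of the mapping is dead that rewriting it saves memory. */
    if (bytes_obsolete * 100 / bytes_mapped >= obsolete_pct &&
        mapping_size > min_mapping_bytes)
        return (true);

    /* The obsolete space map itself is wasting too much disk. */
    if (obsolete_sm_size >= max_obsolete_sm_bytes)
        return (true);

    return (false);
}

int
main(void)
{
    printf("%d\n", should_condense(1ULL << 30, 300ULL << 20, 1 << 20, 0)); /* 1 */
    printf("%d\n", should_condense(1ULL << 30, 10ULL << 20, 1 << 20, 0));  /* 0 */
    return (0);
}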
- */ -static void -spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) -{ - spa_condensing_indirect_t *sci = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - objset_t *mos = spa->spa_meta_objset; - vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; - uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); - uint64_t new_count = - vdev_indirect_mapping_num_entries(sci->sci_new_mapping); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - ASSERT3P(sci, ==, spa->spa_condensing_indirect); - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); - } - ASSERT(vic->vic_mapping_object != 0); - ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); - ASSERT(scip->scip_next_mapping_object != 0); - ASSERT(scip->scip_prev_obsolete_sm_object != 0); - - /* - * Reset vdev_indirect_mapping to refer to the new object. - */ - rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); - vdev_indirect_mapping_close(vd->vdev_indirect_mapping); - vd->vdev_indirect_mapping = sci->sci_new_mapping; - rw_exit(&vd->vdev_indirect_rwlock); - - sci->sci_new_mapping = NULL; - vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); - vic->vic_mapping_object = scip->scip_next_mapping_object; - scip->scip_next_mapping_object = 0; - - space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); - spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - scip->scip_prev_obsolete_sm_object = 0; - - scip->scip_vdev = 0; - - VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CONDENSING_INDIRECT, tx)); - spa_condensing_indirect_destroy(spa->spa_condensing_indirect); - spa->spa_condensing_indirect = NULL; - - zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " - "new mapping object %llu has %llu entries " - "(was %llu entries)", - vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, - new_count, old_count); - - vdev_config_dirty(spa->spa_root_vdev); -} - -/* - * This sync task appends entries to the new mapping object. - */ -static void -spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) -{ - spa_condensing_indirect_t *sci = arg; - uint64_t txg = dmu_tx_get_txg(tx); - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT3P(sci, ==, spa->spa_condensing_indirect); - - vdev_indirect_mapping_add_entries(sci->sci_new_mapping, - &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); - ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); -} - -/* - * Open-context function to add one entry to the new mapping. The new - * entry will be remembered and written from syncing context. - */ -static void -spa_condense_indirect_commit_entry(spa_t *spa, - vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) -{ - spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; - - ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); - - dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; - - /* - * If we are the first entry committed this txg, kick off the sync - * task to write to the MOS on our behalf. 
- */ - if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { - dsl_sync_task_nowait(dmu_tx_pool(tx), - spa_condense_indirect_commit_sync, sci, - 0, ZFS_SPACE_CHECK_NONE, tx); - } - - vdev_indirect_mapping_entry_t *vime = - kmem_alloc(sizeof (*vime), KM_SLEEP); - vime->vime_mapping = *vimep; - vime->vime_obsolete_count = count; - list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); - - dmu_tx_commit(tx); -} - -static void -spa_condense_indirect_generate_new_mapping(vdev_t *vd, - uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) -{ - spa_t *spa = vd->vdev_spa; - uint64_t mapi = start_index; - vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; - uint64_t old_num_entries = - vdev_indirect_mapping_num_entries(old_mapping); - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); - - zfs_dbgmsg("starting condense of vdev %llu from index %llu", - (u_longlong_t)vd->vdev_id, - (u_longlong_t)mapi); - - while (mapi < old_num_entries) { - - if (zthr_iscancelled(zthr)) { - zfs_dbgmsg("pausing condense of vdev %llu " - "at index %llu", (u_longlong_t)vd->vdev_id, - (u_longlong_t)mapi); - break; - } - - vdev_indirect_mapping_entry_phys_t *entry = - &old_mapping->vim_entries[mapi]; - uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); - ASSERT3U(obsolete_counts[mapi], <=, entry_size); - if (obsolete_counts[mapi] < entry_size) { - spa_condense_indirect_commit_entry(spa, entry, - obsolete_counts[mapi]); - - /* - * This delay may be requested for testing, debugging, - * or performance reasons. - */ - delay(zfs_condense_indirect_commit_entry_delay_ticks); - } - - mapi++; - } -} - -/* ARGSUSED */ -static boolean_t -spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - - return (spa->spa_condensing_indirect != NULL); -} - -/* ARGSUSED */ -static void -spa_condense_indirect_thread(void *arg, zthr_t *zthr) -{ - spa_t *spa = arg; - vdev_t *vd; - - ASSERT3P(spa->spa_condensing_indirect, !=, NULL); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); - ASSERT3P(vd, !=, NULL); - spa_config_exit(spa, SCL_VDEV, FTAG); - - spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - uint32_t *counts; - uint64_t start_index; - vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; - space_map_t *prev_obsolete_sm = NULL; - - ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); - ASSERT(scip->scip_next_mapping_object != 0); - ASSERT(scip->scip_prev_obsolete_sm_object != 0); - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - - for (int i = 0; i < TXG_SIZE; i++) { - /* - * The list must start out empty in order for the - * _commit_sync() sync task to be properly registered - * on the first call to _commit_entry(); so it's wise - * to double check and ensure we actually are starting - * with empty lists. - */ - ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); - } - - VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, - scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); - if (prev_obsolete_sm != NULL) { - vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, - counts, prev_obsolete_sm); - } - space_map_close(prev_obsolete_sm); - - /* - * Generate new mapping. 
Determine what index to continue from - * based on the max offset that we've already written in the - * new mapping. - */ - uint64_t max_offset = - vdev_indirect_mapping_max_offset(sci->sci_new_mapping); - if (max_offset == 0) { - /* We haven't written anything to the new mapping yet. */ - start_index = 0; - } else { - /* - * Pick up from where we left off. _entry_for_offset() - * returns a pointer into the vim_entries array. If - * max_offset is greater than any of the mappings - * contained in the table NULL will be returned and - * that indicates we've exhausted our iteration of the - * old_mapping. - */ - - vdev_indirect_mapping_entry_phys_t *entry = - vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, - max_offset); - - if (entry == NULL) { - /* - * We've already written the whole new mapping. - * This special value will cause us to skip the - * generate_new_mapping step and just do the sync - * task to complete the condense. - */ - start_index = UINT64_MAX; - } else { - start_index = entry - old_mapping->vim_entries; - ASSERT3U(start_index, <, - vdev_indirect_mapping_num_entries(old_mapping)); - } - } - - spa_condense_indirect_generate_new_mapping(vd, counts, - start_index, zthr); - - vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); - - /* - * If the zthr has received a cancellation signal while running - * in generate_new_mapping() or at any point after that, then bail - * early. We don't want to complete the condense if the spa is - * shutting down. - */ - if (zthr_iscancelled(zthr)) - return; - - VERIFY0(dsl_sync_task(spa_name(spa), NULL, - spa_condense_indirect_complete_sync, sci, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED)); -} - -/* - * Sync task to begin the condensing process. - */ -void -spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - spa_condensing_indirect_phys_t *scip = - &spa->spa_condensing_indirect_phys; - - ASSERT0(scip->scip_next_mapping_object); - ASSERT0(scip->scip_prev_obsolete_sm_object); - ASSERT0(scip->scip_vdev); - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); - ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); - - uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); - ASSERT(obsolete_sm_obj != 0); - - scip->scip_vdev = vd->vdev_id; - scip->scip_next_mapping_object = - vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); - - scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; - - /* - * We don't need to allocate a new space map object, since - * vdev_indirect_sync_obsolete will allocate one when needed. - */ - space_map_close(vd->vdev_obsolete_sm); - vd->vdev_obsolete_sm = NULL; - VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); - - VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), - sizeof (*scip) / sizeof (uint64_t), scip, tx)); - - ASSERT3P(spa->spa_condensing_indirect, ==, NULL); - spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); - - zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " - "posm=%llu nm=%llu", - vd->vdev_id, dmu_tx_get_txg(tx), - (u_longlong_t)scip->scip_prev_obsolete_sm_object, - (u_longlong_t)scip->scip_next_mapping_object); - - zthr_wakeup(spa->spa_condense_zthr); -} - -/* - * Sync to the given vdev's obsolete space map any segments that are no longer - * referenced as of the given txg. 
- * - * If the obsolete space map doesn't exist yet, create and open it. - */ -void -vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) -{ - spa_t *spa = vd->vdev_spa; - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - ASSERT3U(vic->vic_mapping_object, !=, 0); - ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); - ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); - ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); - - if (vdev_obsolete_sm_object(vd) == 0) { - uint64_t obsolete_sm_object = - space_map_alloc(spa->spa_meta_objset, - vdev_standard_sm_blksz, tx); - - ASSERT(vd->vdev_top_zap != 0); - VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, - sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); - ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0); - - spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - VERIFY0(space_map_open(&vd->vdev_obsolete_sm, - spa->spa_meta_objset, obsolete_sm_object, - 0, vd->vdev_asize, 0)); - } - - ASSERT(vd->vdev_obsolete_sm != NULL); - ASSERT3U(vdev_obsolete_sm_object(vd), ==, - space_map_object(vd->vdev_obsolete_sm)); - - space_map_write(vd->vdev_obsolete_sm, - vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); - range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); -} - -int -spa_condense_init(spa_t *spa) -{ - int error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), - sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), - &spa->spa_condensing_indirect_phys); - if (error == 0) { - if (spa_writeable(spa)) { - spa->spa_condensing_indirect = - spa_condensing_indirect_create(spa); - } - return (0); - } else if (error == ENOENT) { - return (0); - } else { - return (error); - } -} - -void -spa_condense_fini(spa_t *spa) -{ - if (spa->spa_condensing_indirect != NULL) { - spa_condensing_indirect_destroy(spa->spa_condensing_indirect); - spa->spa_condensing_indirect = NULL; - } -} - -void -spa_start_indirect_condensing_thread(spa_t *spa) -{ - ASSERT3P(spa->spa_condense_zthr, ==, NULL); - spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check, - spa_condense_indirect_thread, spa); -} - -/* - * Gets the obsolete spacemap object from the vdev's ZAP. - * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't - * exist yet. 
- */ -int -vdev_obsolete_sm_object(vdev_t *vd) -{ - ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); - if (vd->vdev_top_zap == 0) { - return (0); - } - - uint64_t sm_obj = 0; - int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj); - - ASSERT(err == 0 || err == ENOENT); - - return (sm_obj); -} - -boolean_t -vdev_obsolete_counts_are_precise(vdev_t *vd) -{ - ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); - if (vd->vdev_top_zap == 0) { - return (B_FALSE); - } - - uint64_t val = 0; - int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); - - ASSERT(err == 0 || err == ENOENT); - - return (val != 0); -} - -/* ARGSUSED */ -static void -vdev_indirect_close(vdev_t *vd) -{ -} - -/* ARGSUSED */ -static int -vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - *psize = *max_psize = vd->vdev_asize + - VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - *logical_ashift = vd->vdev_ashift; - *physical_ashift = vd->vdev_physical_ashift; - return (0); -} - -typedef struct remap_segment { - vdev_t *rs_vd; - uint64_t rs_offset; - uint64_t rs_asize; - uint64_t rs_split_offset; - list_node_t rs_node; -} remap_segment_t; - -remap_segment_t * -rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) -{ - remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); - rs->rs_vd = vd; - rs->rs_offset = offset; - rs->rs_asize = asize; - rs->rs_split_offset = split_offset; - return (rs); -} - -/* - * Given an indirect vdev and an extent on that vdev, it duplicates the - * physical entries of the indirect mapping that correspond to the extent - * to a new array and returns a pointer to it. In addition, copied_entries - * is populated with the number of mapping entries that were duplicated. - * - * Note that the function assumes that the caller holds vdev_indirect_rwlock. - * This ensures that the mapping won't change due to condensing as we - * copy over its contents. - * - * Finally, since we are doing an allocation, it is up to the caller to - * free the array allocated in this function. 
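The walk in vdev_indirect_mapping_duplicate_adjacent_entries() simply advances through consecutive entries until the requested extent is consumed. A self-contained version of the same inner_offset/inner_size arithmetic is sketched below with simplified types; it assumes the entries are contiguous by source offset and that the caller passes the index of the entry containing the starting offset.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    uint64_t src_offset;    /* where this entry starts on the removed vdev */
    uint64_t size;          /* length of the entry */
} map_entry_t;

/*
 * Copy the entries covering [offset, offset + asize) out of 'map' and
 * return the copy; *copied gets the number of entries duplicated.
 * 'first' must index the entry containing 'offset'; the caller frees.
 */
static map_entry_t *
duplicate_adjacent(const map_entry_t *map, int first, uint64_t offset,
    uint64_t asize, int *copied)
{
    int n = 0;

    for (uint64_t off = offset, left = asize; left > 0; n++) {
        const map_entry_t *m = &map[first + n];
        uint64_t inner_offset = off - m->src_offset;
        uint64_t inner_size = m->size - inner_offset;

        if (inner_size > left)
            inner_size = left;
        off += inner_size;
        left -= inner_size;
    }

    map_entry_t *dup = malloc(n * sizeof (*dup));
    if (dup != NULL)
        memcpy(dup, &map[first], n * sizeof (*dup));
    *copied = n;
    return (dup);
}

int
main(void)
{
    map_entry_t map[] = { { 0, 4096 }, { 4096, 4096 }, { 8192, 16384 } };
    int n;
    map_entry_t *dup = duplicate_adjacent(map, 0, 1024, 8192, &n);

    printf("copied %d entries\n", n);   /* copied 3 entries */
    free(dup);
    return (0);
}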
- */ -vdev_indirect_mapping_entry_phys_t * -vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, - uint64_t asize, uint64_t *copied_entries) -{ - vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - uint64_t entries = 0; - - ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); - - vdev_indirect_mapping_entry_phys_t *first_mapping = - vdev_indirect_mapping_entry_for_offset(vim, offset); - ASSERT3P(first_mapping, !=, NULL); - - vdev_indirect_mapping_entry_phys_t *m = first_mapping; - while (asize > 0) { - uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); - - ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); - ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); - - uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); - uint64_t inner_size = MIN(asize, size - inner_offset); - - offset += inner_size; - asize -= inner_size; - entries++; - m++; - } - - size_t copy_length = entries * sizeof (*first_mapping); - duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); - bcopy(first_mapping, duplicate_mappings, copy_length); - *copied_entries = entries; - - return (duplicate_mappings); -} - -/* - * Goes through the relevant indirect mappings until it hits a concrete vdev - * and issues the callback. On the way to the concrete vdev, if any other - * indirect vdevs are encountered, then the callback will also be called on - * each of those indirect vdevs. For example, if the segment is mapped to - * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is - * mapped to segment B on concrete vdev 2, then the callback will be called on - * both vdev 1 and vdev 2. - * - * While the callback passed to vdev_indirect_remap() is called on every vdev - * the function encounters, certain callbacks only care about concrete vdevs. - * These types of callbacks should return immediately and explicitly when they - * are called on an indirect vdev. - * - * Because there is a possibility that a DVA section in the indirect device - * has been split into multiple sections in our mapping, we keep track - * of the relevant contiguous segments of the new location (remap_segment_t) - * in a stack. This way we can call the callback for each of the new sections - * created by a single section of the indirect device. Note though, that in - * this scenario the callbacks in each split block won't occur in-order in - * terms of offset, so callers should not make any assumptions about that. - * - * For callbacks that don't handle split blocks and immediately return when - * they encounter them (as is the case for remap_blkptr_cb), the caller can - * assume that its callback will be applied from the first indirect vdev - * encountered to the last one and then the concrete vdev, in that order. - */ -static void -vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, - void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) -{ - list_t stack; - spa_t *spa = vd->vdev_spa; - - list_create(&stack, sizeof (remap_segment_t), - offsetof(remap_segment_t, rs_node)); - - for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); - rs != NULL; rs = list_remove_head(&stack)) { - vdev_t *v = rs->rs_vd; - uint64_t num_entries = 0; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - ASSERT(rs->rs_asize > 0); - - /* - * Note: As this function can be called from open context - * (e.g. zio_read()), we need the following rwlock to - * prevent the mapping from being changed by condensing. 
- * - * So we grab the lock and we make a copy of the entries - * that are relevant to the extent that we are working on. - * Once that is done, we drop the lock and iterate over - * our copy of the mapping. Once we are done with the with - * the remap segment and we free it, we also free our copy - * of the indirect mapping entries that are relevant to it. - * - * This way we don't need to wait until the function is - * finished with a segment, to condense it. In addition, we - * don't need a recursive rwlock for the case that a call to - * vdev_indirect_remap() needs to call itself (through the - * codepath of its callback) for the same vdev in the middle - * of its execution. - */ - rw_enter(&v->vdev_indirect_rwlock, RW_READER); - vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping; - ASSERT3P(vim, !=, NULL); - - vdev_indirect_mapping_entry_phys_t *mapping = - vdev_indirect_mapping_duplicate_adjacent_entries(v, - rs->rs_offset, rs->rs_asize, &num_entries); - ASSERT3P(mapping, !=, NULL); - ASSERT3U(num_entries, >, 0); - rw_exit(&v->vdev_indirect_rwlock); - - for (uint64_t i = 0; i < num_entries; i++) { - /* - * Note: the vdev_indirect_mapping can not change - * while we are running. It only changes while the - * removal is in progress, and then only from syncing - * context. While a removal is in progress, this - * function is only called for frees, which also only - * happen from syncing context. - */ - vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; - - ASSERT3P(m, !=, NULL); - ASSERT3U(rs->rs_asize, >, 0); - - uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); - uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); - uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); - - ASSERT3U(rs->rs_offset, >=, - DVA_MAPPING_GET_SRC_OFFSET(m)); - ASSERT3U(rs->rs_offset, <, - DVA_MAPPING_GET_SRC_OFFSET(m) + size); - ASSERT3U(dst_vdev, !=, v->vdev_id); - - uint64_t inner_offset = rs->rs_offset - - DVA_MAPPING_GET_SRC_OFFSET(m); - uint64_t inner_size = - MIN(rs->rs_asize, size - inner_offset); - - vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); - ASSERT3P(dst_v, !=, NULL); - - if (dst_v->vdev_ops == &vdev_indirect_ops) { - list_insert_head(&stack, - rs_alloc(dst_v, dst_offset + inner_offset, - inner_size, rs->rs_split_offset)); - - } - - if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && - IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { - /* - * Note: This clause exists only solely for - * testing purposes. We use it to ensure that - * split blocks work and that the callbacks - * using them yield the same result if issued - * in reverse order. 
- */ - uint64_t inner_half = inner_size / 2; - - func(rs->rs_split_offset + inner_half, dst_v, - dst_offset + inner_offset + inner_half, - inner_half, arg); - - func(rs->rs_split_offset, dst_v, - dst_offset + inner_offset, - inner_half, arg); - } else { - func(rs->rs_split_offset, dst_v, - dst_offset + inner_offset, - inner_size, arg); - } - - rs->rs_offset += inner_size; - rs->rs_asize -= inner_size; - rs->rs_split_offset += inner_size; - } - VERIFY0(rs->rs_asize); - - kmem_free(mapping, num_entries * sizeof (*mapping)); - kmem_free(rs, sizeof (remap_segment_t)); - } - list_destroy(&stack); -} - -static void -vdev_indirect_child_io_done(zio_t *zio) -{ - zio_t *pio = zio->io_private; - - mutex_enter(&pio->io_lock); - pio->io_error = zio_worst_error(pio->io_error, zio->io_error); - mutex_exit(&pio->io_lock); - -#ifdef __FreeBSD__ - if (zio->io_abd != NULL) -#endif - abd_put(zio->io_abd); -} - -/* - * This is a callback for vdev_indirect_remap() which allocates an - * indirect_split_t for each split segment and adds it to iv_splits. - */ -static void -vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, - uint64_t size, void *arg) -{ - zio_t *zio = arg; - indirect_vsd_t *iv = zio->io_vsd; - - ASSERT3P(vd, !=, NULL); - - if (vd->vdev_ops == &vdev_indirect_ops) - return; - - int n = 1; - if (vd->vdev_ops == &vdev_mirror_ops) - n = vd->vdev_children; - - indirect_split_t *is = - kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); - - is->is_children = n; - is->is_size = size; - is->is_split_offset = split_offset; - is->is_target_offset = offset; - is->is_vdev = vd; - list_create(&is->is_unique_child, sizeof (indirect_child_t), - offsetof(indirect_child_t, ic_node)); - - /* - * Note that we only consider multiple copies of the data for - * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even - * though they use the same ops as mirror, because there's only one - * "good" copy under the replacing/spare. - */ - if (vd->vdev_ops == &vdev_mirror_ops) { - for (int i = 0; i < n; i++) { - is->is_child[i].ic_vdev = vd->vdev_child[i]; - list_link_init(&is->is_child[i].ic_node); - } - } else { - is->is_child[0].ic_vdev = vd; - } - - list_insert_tail(&iv->iv_splits, is); -} - -static void -vdev_indirect_read_split_done(zio_t *zio) -{ - indirect_child_t *ic = zio->io_private; - - if (zio->io_error != 0) { - /* - * Clear ic_data to indicate that we do not have data for this - * child. - */ - abd_free(ic->ic_data); - ic->ic_data = NULL; - } -} - -/* - * Issue reads for all copies (mirror children) of all splits. - */ -static void -vdev_indirect_read_all(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - for (int i = 0; i < is->is_children; i++) { - indirect_child_t *ic = &is->is_child[i]; - - if (!vdev_readable(ic->ic_vdev)) - continue; - - /* - * Note, we may read from a child whose DTL - * indicates that the data may not be present here. - * While this might result in a few i/os that will - * likely return incorrect data, it simplifies the - * code since we can treat scrub and resilver - * identically. (The incorrect data will be - * detected and ignored when we verify the - * checksum.) 
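vdev_indirect_remap() above replaces recursion with an explicit worklist of remap segments, popping one segment at a time until everything lands on a concrete vdev, and invoking the callback on every vdev along the way. The sketch below shows only that worklist pattern: each toy vdev maps an extent onto one target with a fixed offset shift, and there is no split handling. All names are illustrative.

#include <stdint.h>
#include <stdio.h>

typedef struct vdev {
    const char  *name;
    int         indirect;   /* 0 = concrete */
    struct vdev *target;    /* where indirect extents now live */
    uint64_t    shift;      /* toy mapping: new_off = off + shift */
} vdev_t;

#define MAXSEGS 16

typedef struct {
    vdev_t      *vd;
    uint64_t    offset;
    uint64_t    size;
} seg_t;

/* Follow the mapping with an explicit stack instead of recursion. */
static void
remap(vdev_t *vd, uint64_t offset, uint64_t size,
    void (*func)(vdev_t *, uint64_t, uint64_t))
{
    seg_t stack[MAXSEGS];
    int top = 0;

    stack[top++] = (seg_t){ vd, offset, size };
    while (top > 0) {
        seg_t s = stack[--top];

        /* The callback sees every vdev on the way to a concrete one. */
        func(s.vd, s.offset, s.size);
        if (s.vd->indirect) {
            if (top == MAXSEGS) {
                fprintf(stderr, "worklist overflow\n");
                return;
            }
            /* Queue the mapped-to location instead of recursing. */
            stack[top++] = (seg_t){ s.vd->target,
                s.offset + s.vd->shift, s.size };
        }
    }
}

static void
print_seg(vdev_t *vd, uint64_t off, uint64_t size)
{
    printf("%s: off=%ju size=%ju\n", vd->name, (uintmax_t)off,
        (uintmax_t)size);
}

int
main(void)
{
    vdev_t disk = { "concrete", 0, NULL, 0 };
    vdev_t ind1 = { "indirect-1", 1, &disk, 1 << 20 };
    vdev_t ind0 = { "indirect-0", 1, &ind1, 4 << 20 };

    remap(&ind0, 8192, 4096, print_seg);
    return (0);
}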
- */ - - ic->ic_data = abd_alloc_sametype(zio->io_abd, - is->is_size); - ic->ic_duplicate = NULL; - - zio_nowait(zio_vdev_child_io(zio, NULL, - ic->ic_vdev, is->is_target_offset, ic->ic_data, - is->is_size, zio->io_type, zio->io_priority, 0, - vdev_indirect_read_split_done, ic)); - } - } - iv->iv_reconstruct = B_TRUE; -} - -static void -vdev_indirect_io_start(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); - list_create(&iv->iv_splits, - sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); - - zio->io_vsd = iv; - zio->io_vsd_ops = &vdev_indirect_vsd_ops; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); -#ifdef __FreeBSD__ - if (zio->io_type == ZIO_TYPE_WRITE) { -#else - if (zio->io_type != ZIO_TYPE_READ) { - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); -#endif - /* - * Note: this code can handle other kinds of writes, - * but we don't expect them. - */ - ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | - ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); - } - - vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, - vdev_indirect_gather_splits, zio); - - indirect_split_t *first = list_head(&iv->iv_splits); - if (first->is_size == zio->io_size) { - /* - * This is not a split block; we are pointing to the entire - * data, which will checksum the same as the original data. - * Pass the BP down so that the child i/o can verify the - * checksum, and try a different location if available - * (e.g. on a mirror). - * - * While this special case could be handled the same as the - * general (split block) case, doing it this way ensures - * that the vast majority of blocks on indirect vdevs - * (which are not split) are handled identically to blocks - * on non-indirect vdevs. This allows us to be less strict - * about performance in the general (but rare) case. - */ - ASSERT0(first->is_split_offset); - ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - first->is_vdev, first->is_target_offset, -#ifdef __FreeBSD__ - zio->io_abd == NULL ? NULL : -#endif - abd_get_offset(zio->io_abd, 0), - zio->io_size, zio->io_type, zio->io_priority, 0, - vdev_indirect_child_io_done, zio)); - } else { - iv->iv_split_block = B_TRUE; - if (zio->io_type == ZIO_TYPE_READ && - zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { - /* - * Read all copies. Note that for simplicity, - * we don't bother consulting the DTL in the - * resilver case. - */ - vdev_indirect_read_all(zio); - } else { - /* - * If this is a read zio, we read one copy of each - * split segment, from the top-level vdev. Since - * we don't know the checksum of each split - * individually, the child zio can't ensure that - * we get the right data. E.g. if it's a mirror, - * it will just read from a random (healthy) leaf - * vdev. We have to verify the checksum in - * vdev_indirect_io_done(). - * - * For write zios, the vdev code will ensure we write - * to all children. - */ - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - zio_nowait(zio_vdev_child_io(zio, NULL, - is->is_vdev, is->is_target_offset, -#ifdef __FreeBSD__ - zio->io_abd == NULL ? NULL : -#endif - abd_get_offset(zio->io_abd, - is->is_split_offset), - is->is_size, zio->io_type, - zio->io_priority, 0, - vdev_indirect_child_io_done, zio)); - } - } - } - - zio_execute(zio); -} - -/* - * Report a checksum error for a child. 
- */ -static void -vdev_indirect_checksum_error(zio_t *zio, - indirect_split_t *is, indirect_child_t *ic) -{ - vdev_t *vd = ic->ic_vdev; - - if (zio->io_flags & ZIO_FLAG_SPECULATIVE) - return; - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - zio_bad_cksum_t zbc = { 0 }; - void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size); - abd_t *good_abd = is->is_good_child->ic_data; - void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size); - zfs_ereport_post_checksum(zio->io_spa, vd, zio, - is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc); - abd_return_buf(ic->ic_data, bad_buf, is->is_size); - abd_return_buf(good_abd, good_buf, is->is_size); -} - -/* - * Issue repair i/os for any incorrect copies. We do this by comparing - * each split segment's correct data (is_good_child's ic_data) with each - * other copy of the data. If they differ, then we overwrite the bad data - * with the good copy. Note that we do this without regard for the DTL's, - * which simplifies this code and also issues the optimal number of writes - * (based on which copies actually read bad data, as opposed to which we - * think might be wrong). For the same reason, we always use - * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). - */ -static void -vdev_indirect_repair(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - enum zio_flag flags = ZIO_FLAG_IO_REPAIR; - - if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) - flags |= ZIO_FLAG_SELF_HEAL; - - if (!spa_writeable(zio->io_spa)) - return; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - for (int c = 0; c < is->is_children; c++) { - indirect_child_t *ic = &is->is_child[c]; - if (ic == is->is_good_child) - continue; - if (ic->ic_data == NULL) - continue; - if (ic->ic_duplicate == is->is_good_child) - continue; - - zio_nowait(zio_vdev_child_io(zio, NULL, - ic->ic_vdev, is->is_target_offset, - is->is_good_child->ic_data, is->is_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, - NULL, NULL)); - - vdev_indirect_checksum_error(zio, is, ic); - } - } -} - -/* - * Report checksum errors on all children that we read from. - */ -static void -vdev_indirect_all_checksum_errors(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - if (zio->io_flags & ZIO_FLAG_SPECULATIVE) - return; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - for (int c = 0; c < is->is_children; c++) { - indirect_child_t *ic = &is->is_child[c]; - - if (ic->ic_data == NULL) - continue; - - vdev_t *vd = ic->ic_vdev; - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - zfs_ereport_post_checksum(zio->io_spa, vd, zio, - is->is_target_offset, is->is_size, - NULL, NULL, NULL); - } - } -} - -/* - * Copy data from all the splits to a main zio then validate the checksum. - * If then checksum is successfully validated return success. 
- */ -static int -vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) -{ - zio_bad_cksum_t zbc; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - - ASSERT3P(is->is_good_child->ic_data, !=, NULL); - ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); - - abd_copy_off(zio->io_abd, is->is_good_child->ic_data, - is->is_split_offset, 0, is->is_size); - } - - return (zio_checksum_error(zio, &zbc)); -} - -/* - * There are relatively few possible combinations making it feasible to - * deterministically check them all. We do this by setting the good_child - * to the next unique split version. If we reach the end of the list then - * "carry over" to the next unique split version (like counting in base - * is_unique_children, but each digit can have a different base). - */ -static int -vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) -{ - boolean_t more = B_TRUE; - - iv->iv_attempts = 0; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) - is->is_good_child = list_head(&is->is_unique_child); - - while (more == B_TRUE) { - iv->iv_attempts++; - more = B_FALSE; - - if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) - return (0); - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - is->is_good_child = list_next(&is->is_unique_child, - is->is_good_child); - if (is->is_good_child != NULL) { - more = B_TRUE; - break; - } - - is->is_good_child = list_head(&is->is_unique_child); - } - } - - ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); - - return (SET_ERROR(ECKSUM)); -} - -/* - * There are too many combinations to try all of them in a reasonable amount - * of time. So try a fixed number of random combinations from the unique - * split versions, after which we'll consider the block unrecoverable. - */ -static int -vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) -{ - iv->iv_attempts = 0; - - while (iv->iv_attempts < iv->iv_attempts_max) { - iv->iv_attempts++; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - indirect_child_t *ic = list_head(&is->is_unique_child); - int children = is->is_unique_children; - - for (int i = spa_get_random(children); i > 0; i--) - ic = list_next(&is->is_unique_child, ic); - - ASSERT3P(ic, !=, NULL); - is->is_good_child = ic; - } - - if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) - return (0); - } - - return (SET_ERROR(ECKSUM)); -} - -/* - * This is a validation function for reconstruction. It randomly selects - * a good combination, if one can be found, and then it intentionally - * damages all other segment copes by zeroing them. This forces the - * reconstruction algorithm to locate the one remaining known good copy. - */ -static int -vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) -{ - /* Presume all the copies are unique for initial selection. */ - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - is->is_unique_children = 0; - - for (int i = 0; i < is->is_children; i++) { - indirect_child_t *ic = &is->is_child[i]; - if (ic->ic_data != NULL) { - is->is_unique_children++; - list_insert_tail(&is->is_unique_child, ic); - } - } - } - - /* - * Set each is_good_child to a randomly-selected child which - * is known to contain validated data. 
- */ - int error = vdev_indirect_splits_enumerate_randomly(iv, zio); - if (error) - goto out; - - /* - * Damage all but the known good copy by zeroing it. This will - * result in two or less unique copies per indirect_child_t. - * Both may need to be checked in order to reconstruct the block. - * Set iv->iv_attempts_max such that all unique combinations will - * enumerated, but limit the damage to at most 16 indirect splits. - */ - iv->iv_attempts_max = 1; - - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - for (int c = 0; c < is->is_children; c++) { - indirect_child_t *ic = &is->is_child[c]; - - if (ic == is->is_good_child) - continue; - if (ic->ic_data == NULL) - continue; - - abd_zero(ic->ic_data, ic->ic_data->abd_size); - } - - iv->iv_attempts_max *= 2; - if (iv->iv_attempts_max > (1ULL << 16)) { - iv->iv_attempts_max = UINT64_MAX; - break; - } - } - -out: - /* Empty the unique children lists so they can be reconstructed. */ - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); - - is->is_unique_children = 0; - } - - return (error); -} - -/* - * This function is called when we have read all copies of the data and need - * to try to find a combination of copies that gives us the right checksum. - * - * If we pointed to any mirror vdevs, this effectively does the job of the - * mirror. The mirror vdev code can't do its own job because we don't know - * the checksum of each split segment individually. - * - * We have to try every unique combination of copies of split segments, until - * we find one that checksums correctly. Duplicate segment copies are first - * identified and latter skipped during reconstruction. This optimization - * reduces the search space and ensures that of the remaining combinations - * at most one is correct. - * - * When the total number of combinations is small they can all be checked. - * For example, if we have 3 segments in the split, and each points to a - * 2-way mirror with unique copies, we will have the following pieces of data: - * - * | mirror child - * split | [0] [1] - * ======|===================== - * A | data_A_0 data_A_1 - * B | data_B_0 data_B_1 - * C | data_C_0 data_C_1 - * - * We will try the following (mirror children)^(number of splits) (2^3=8) - * combinations, which is similar to bitwise-little-endian counting in - * binary. In general each "digit" corresponds to a split segment, and the - * base of each digit is is_children, which can be different for each - * digit. - * - * "low bit" "high bit" - * v v - * data_A_0 data_B_0 data_C_0 - * data_A_1 data_B_0 data_C_0 - * data_A_0 data_B_1 data_C_0 - * data_A_1 data_B_1 data_C_0 - * data_A_0 data_B_0 data_C_1 - * data_A_1 data_B_0 data_C_1 - * data_A_0 data_B_1 data_C_1 - * data_A_1 data_B_1 data_C_1 - * - * Note that the split segments may be on the same or different top-level - * vdevs. In either case, we may need to try lots of combinations (see - * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror - * has small silent errors on all of its children, we can still reconstruct - * the correct data, as long as those errors are at sufficiently-separated - * offsets (specifically, separated by the largest block size - default of - * 128KB, but up to 16MB). 
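The "counting in base is_unique_children" idea in the comment above is an ordinary mixed-radix odometer: one digit per split, each digit with its own base, the first split being the fastest-moving digit. A standalone enumerator is sketched below with the checksum test replaced by a stub predicate; the names and the accepted combination are made up for illustration.

#include <stdbool.h>
#include <stdio.h>

#define NSPLITS 3

/*
 * Enumerate every combination of copy indices, one digit per split, where
 * digit i counts 0 .. bases[i]-1.  Returns true when 'accept' likes a
 * combination, leaving it in 'digits'; false when all are exhausted.
 */
static bool
enumerate_all(const int *bases, int *digits, int n,
    bool (*accept)(const int *, int))
{
    for (int i = 0; i < n; i++)
        digits[i] = 0;

    for (;;) {
        if (accept(digits, n))
            return (true);

        int i;
        for (i = 0; i < n; i++) {
            if (++digits[i] < bases[i])
                break;              /* no carry needed */
            digits[i] = 0;          /* carry into the next digit */
        }
        if (i == n)
            return (false);         /* wrapped around: exhausted */
    }
}

/* Stand-in for the checksum test: accept one specific combination. */
static bool
accept_combo(const int *digits, int n)
{
    (void)n;
    return (digits[0] == 1 && digits[1] == 0 && digits[2] == 1);
}

int
main(void)
{
    int bases[NSPLITS] = { 2, 2, 2 };   /* a 2-way mirror behind each split */
    int digits[NSPLITS];

    if (enumerate_all(bases, digits, NSPLITS, accept_combo))
        printf("good combination: %d %d %d\n",
            digits[0], digits[1], digits[2]);
    return (0);
}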
- */ -static void -vdev_indirect_reconstruct_io_done(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - boolean_t known_good = B_FALSE; - int error; - - iv->iv_unique_combinations = 1; - iv->iv_attempts_max = UINT64_MAX; - - if (zfs_reconstruct_indirect_combinations_max > 0) - iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; - - /* - * If nonzero, every 1/x blocks will be damaged, in order to validate - * reconstruction when there are split segments with damaged copies. - * Known_good will TRUE when reconstruction is known to be possible. - */ - if (zfs_reconstruct_indirect_damage_fraction != 0 && - spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0) - known_good = (vdev_indirect_splits_damage(iv, zio) == 0); - - /* - * Determine the unique children for a split segment and add them - * to the is_unique_child list. By restricting reconstruction - * to these children, only unique combinations will be considered. - * This can vastly reduce the search space when there are a large - * number of indirect splits. - */ - for (indirect_split_t *is = list_head(&iv->iv_splits); - is != NULL; is = list_next(&iv->iv_splits, is)) { - is->is_unique_children = 0; - - for (int i = 0; i < is->is_children; i++) { - indirect_child_t *ic_i = &is->is_child[i]; - - if (ic_i->ic_data == NULL || - ic_i->ic_duplicate != NULL) - continue; - - for (int j = i + 1; j < is->is_children; j++) { - indirect_child_t *ic_j = &is->is_child[j]; - - if (ic_j->ic_data == NULL || - ic_j->ic_duplicate != NULL) - continue; - - if (abd_cmp(ic_i->ic_data, ic_j->ic_data, - is->is_size) == 0) { - ic_j->ic_duplicate = ic_i; - } - } - - is->is_unique_children++; - list_insert_tail(&is->is_unique_child, ic_i); - } - - /* Reconstruction is impossible, no valid children */ - EQUIV(list_is_empty(&is->is_unique_child), - is->is_unique_children == 0); - if (list_is_empty(&is->is_unique_child)) { - zio->io_error = EIO; - vdev_indirect_all_checksum_errors(zio); - zio_checksum_verified(zio); - return; - } - - iv->iv_unique_combinations *= is->is_unique_children; - } - - if (iv->iv_unique_combinations <= iv->iv_attempts_max) - error = vdev_indirect_splits_enumerate_all(iv, zio); - else - error = vdev_indirect_splits_enumerate_randomly(iv, zio); - - if (error != 0) { - /* All attempted combinations failed. */ - ASSERT3B(known_good, ==, B_FALSE); - zio->io_error = error; - vdev_indirect_all_checksum_errors(zio); - } else { - /* - * The checksum has been successfully validated. Issue - * repair I/Os to any copies of splits which don't match - * the validated version. - */ - ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); - vdev_indirect_repair(zio); - zio_checksum_verified(zio); - } -} - -static void -vdev_indirect_io_done(zio_t *zio) -{ - indirect_vsd_t *iv = zio->io_vsd; - - if (iv->iv_reconstruct) { - /* - * We have read all copies of the data (e.g. from mirrors), - * either because this was a scrub/resilver, or because the - * one-copy read didn't checksum correctly. - */ - vdev_indirect_reconstruct_io_done(zio); - return; - } - - if (!iv->iv_split_block) { - /* - * This was not a split block, so we passed the BP down, - * and the checksum was handled by the (one) child zio. - */ - return; - } - - zio_bad_cksum_t zbc; - int ret = zio_checksum_error(zio, &zbc); - if (ret == 0) { - zio_checksum_verified(zio); - return; - } - - /* - * The checksum didn't match. Read all copies of all splits, and - * then we will try to reconstruct. The next time - * vdev_indirect_io_done() is called, iv_reconstruct will be set. 
- */ - vdev_indirect_read_all(zio); - - zio_vdev_io_redone(zio); -} - -vdev_ops_t vdev_indirect_ops = { - vdev_indirect_open, - vdev_indirect_close, - vdev_default_asize, - vdev_indirect_io_start, - vdev_indirect_io_done, - NULL, - NULL, - NULL, - NULL, - vdev_indirect_remap, - NULL, - VDEV_TYPE_INDIRECT, /* name of this vdev type */ - B_FALSE /* leaf vdev */ -}; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include - -static boolean_t -vdev_indirect_births_verify(vdev_indirect_births_t *vib) -{ - ASSERT(vib != NULL); - - ASSERT(vib->vib_object != 0); - ASSERT(vib->vib_objset != NULL); - ASSERT(vib->vib_phys != NULL); - ASSERT(vib->vib_dbuf != NULL); - - EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL); - - return (B_TRUE); -} - -uint64_t -vdev_indirect_births_count(vdev_indirect_births_t *vib) -{ - ASSERT(vdev_indirect_births_verify(vib)); - - return (vib->vib_phys->vib_count); -} - -uint64_t -vdev_indirect_births_object(vdev_indirect_births_t *vib) -{ - ASSERT(vdev_indirect_births_verify(vib)); - - return (vib->vib_object); -} - -static uint64_t -vdev_indirect_births_size_impl(vdev_indirect_births_t *vib) -{ - return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries)); -} - -void -vdev_indirect_births_close(vdev_indirect_births_t *vib) -{ - ASSERT(vdev_indirect_births_verify(vib)); - - if (vib->vib_phys->vib_count > 0) { - uint64_t births_size = vdev_indirect_births_size_impl(vib); - - kmem_free(vib->vib_entries, births_size); - vib->vib_entries = NULL; - } - - dmu_buf_rele(vib->vib_dbuf, vib); - - vib->vib_objset = NULL; - vib->vib_object = 0; - vib->vib_dbuf = NULL; - vib->vib_phys = NULL; - - kmem_free(vib, sizeof (*vib)); -} - -uint64_t -vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - - return (dmu_object_alloc(os, - DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t), - tx)); -} - -vdev_indirect_births_t * -vdev_indirect_births_open(objset_t *os, uint64_t births_object) -{ - vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP); - - vib->vib_objset = os; - vib->vib_object = births_object; - - VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf)); - vib->vib_phys = vib->vib_dbuf->db_data; - - if (vib->vib_phys->vib_count > 0) { - uint64_t births_size = vdev_indirect_births_size_impl(vib); - vib->vib_entries = kmem_alloc(births_size, KM_SLEEP); - VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0, - births_size, vib->vib_entries, DMU_READ_PREFETCH)); - } - - ASSERT(vdev_indirect_births_verify(vib)); - - return (vib); -} - -void -vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - VERIFY0(dmu_object_free(os, object, tx)); -} - -void 
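
The reconstruction path removed above bounds its work by multiplying, per split segment, the number of unique child copies, and only tries every combination when that product stays at or below the attempts limit; otherwise it samples combinations at random. A minimal standalone sketch of that decision, with illustrative types and names rather than the ZFS structures:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One split segment: how many distinct (non-duplicate) copies were read. */
    struct split { uint64_t unique_children; };

    /*
     * Multiply the per-split counts, saturating so the comparison against the
     * attempt limit stays safe even for absurdly large search spaces.
     */
    static uint64_t
    combination_count(const struct split *s, size_t nsplits)
    {
            uint64_t combos = 1;

            for (size_t i = 0; i < nsplits; i++) {
                    if (s[i].unique_children == 0)
                            return (0);             /* reconstruction impossible */
                    if (combos > UINT64_MAX / s[i].unique_children)
                            return (UINT64_MAX);    /* saturate */
                    combos *= s[i].unique_children;
            }
            return (combos);
    }

    int
    main(void)
    {
            struct split splits[] = { {2}, {3}, {1}, {2} };
            uint64_t attempts_max = 10;     /* stand-in for the tunable */
            uint64_t combos =
                combination_count(splits, sizeof (splits) / sizeof (splits[0]));

            printf("%llu combinations -> %s enumeration\n",
                (unsigned long long)combos,
                combos <= attempts_max ? "exhaustive" : "randomized");
            return (0);
    }

With the sample counts the product is 12, which exceeds the limit of 10, so the randomized strategy would be chosen.
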
-vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, - uint64_t max_offset, uint64_t txg, dmu_tx_t *tx) -{ - vdev_indirect_birth_entry_phys_t vibe; - uint64_t old_size; - uint64_t new_size; - vdev_indirect_birth_entry_phys_t *new_entries; - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); - ASSERT(vdev_indirect_births_verify(vib)); - - dmu_buf_will_dirty(vib->vib_dbuf, tx); - - vibe.vibe_offset = max_offset; - vibe.vibe_phys_birth_txg = txg; - - old_size = vdev_indirect_births_size_impl(vib); - dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe), - &vibe, tx); - vib->vib_phys->vib_count++; - new_size = vdev_indirect_births_size_impl(vib); - - new_entries = kmem_alloc(new_size, KM_SLEEP); - if (old_size > 0) { - bcopy(vib->vib_entries, new_entries, old_size); - kmem_free(vib->vib_entries, old_size); - } - new_entries[vib->vib_phys->vib_count - 1] = vibe; - vib->vib_entries = new_entries; -} - -uint64_t -vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib) -{ - ASSERT(vdev_indirect_births_verify(vib)); - ASSERT(vib->vib_phys->vib_count > 0); - - vdev_indirect_birth_entry_phys_t *last = - &vib->vib_entries[vib->vib_phys->vib_count - 1]; - return (last->vibe_phys_birth_txg); -} - -/* - * Return the txg in which the given range was copied (i.e. its physical - * birth txg). The specified offset+asize must be contiguously mapped - * (i.e. not a split block). - * - * The entries are sorted by increasing phys_birth, and also by increasing - * offset. We find the specified offset by binary search. Note that we - * can not use bsearch() because looking at each entry independently is - * insufficient to find the correct entry. Each entry implicitly relies - * on the previous entry: an entry indicates that the offsets from the - * end of the previous entry to the end of this entry were written in the - * specified txg. - */ -uint64_t -vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset, - uint64_t asize) -{ - vdev_indirect_birth_entry_phys_t *base; - vdev_indirect_birth_entry_phys_t *last; - - ASSERT(vdev_indirect_births_verify(vib)); - ASSERT(vib->vib_phys->vib_count > 0); - - base = vib->vib_entries; - last = base + vib->vib_phys->vib_count - 1; - - ASSERT3U(offset, <, last->vibe_offset); - - while (last >= base) { - vdev_indirect_birth_entry_phys_t *p = - base + ((last - base) / 2); - if (offset >= p->vibe_offset) { - base = p + 1; - } else if (p == vib->vib_entries || - offset >= (p - 1)->vibe_offset) { - ASSERT3U(offset + asize, <=, p->vibe_offset); - return (p->vibe_phys_birth_txg); - } else { - last = p - 1; - } - } - ASSERT(!"offset not found"); - return (-1); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c +++ /dev/null @@ -1,593 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 
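
The comment above explains why bsearch(3) cannot be used for vdev_indirect_births_physbirth(): entry i covers the half-open offset range from the previous entry's offset up to (but not including) its own, so each probe must also look at the preceding entry. A standalone sketch of that search, using simplified types rather than the on-disk structures:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Offsets below vibe_offset, back to the previous entry, share vibe_txg. */
    struct birth_entry { uint64_t vibe_offset; uint64_t vibe_txg; };

    static uint64_t
    physbirth(const struct birth_entry *entries, size_t count, uint64_t offset)
    {
            const struct birth_entry *base = entries;
            const struct birth_entry *last = entries + count - 1;

            assert(count > 0 && offset < last->vibe_offset);

            while (last >= base) {
                    const struct birth_entry *p = base + (last - base) / 2;

                    if (offset >= p->vibe_offset)
                            base = p + 1;           /* strictly after p's range */
                    else if (p == entries || offset >= (p - 1)->vibe_offset)
                            return (p->vibe_txg);   /* inside p's range */
                    else
                            last = p - 1;           /* strictly before p's range */
            }
            assert(!"offset not found");
            return ((uint64_t)-1);
    }

    int
    main(void)
    {
            struct birth_entry e[] = { {0x1000, 10}, {0x3000, 12}, {0x8000, 15} };

            /* 0x2fff lies in [0x1000, 0x3000), so it was born in txg 12. */
            printf("txg %llu\n", (unsigned long long)physbirth(e, 3, 0x2fff));
            return (0);
    }
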
- */ - -#include -#include -#include -#include -#include -#include -#include - -static boolean_t -vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) -{ - ASSERT(vim != NULL); - - ASSERT(vim->vim_object != 0); - ASSERT(vim->vim_objset != NULL); - ASSERT(vim->vim_phys != NULL); - ASSERT(vim->vim_dbuf != NULL); - - EQUIV(vim->vim_phys->vimp_num_entries > 0, - vim->vim_entries != NULL); - if (vim->vim_phys->vimp_num_entries > 0) { - vdev_indirect_mapping_entry_phys_t *last_entry = - &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; - uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry); - uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst); - - ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); - } - if (vim->vim_havecounts) { - ASSERT(vim->vim_phys->vimp_counts_object != 0); - } - - return (B_TRUE); -} - -uint64_t -vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim->vim_phys->vimp_num_entries); -} - -uint64_t -vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim->vim_phys->vimp_max_offset); -} - -uint64_t -vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim->vim_object); -} - -uint64_t -vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim->vim_phys->vimp_bytes_mapped); -} - -/* - * The length (in bytes) of the mapping object array in memory and - * (logically) on disk. - * - * Note that unlike most of our accessor functions, - * we don't assert that the struct is consistent; therefore it can be - * called while there may be concurrent changes, if we don't care about - * the value being immediately stale (e.g. from spa_removal_get_stats()). - */ -uint64_t -vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) -{ - return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); -} - -/* - * Compare an offset with an indirect mapping entry; there are three - * possible scenarios: - * - * 1. The offset is "less than" the mapping entry; meaning the - * offset is less than the source offset of the mapping entry. In - * this case, there is no overlap between the offset and the - * mapping entry and -1 will be returned. - * - * 2. The offset is "greater than" the mapping entry; meaning the - * offset is greater than the mapping entry's source offset plus - * the entry's size. In this case, there is no overlap between - * the offset and the mapping entry and 1 will be returned. - * - * NOTE: If the offset is actually equal to the entry's offset - * plus size, this is considered to be "greater" than the entry, - * and this case applies (i.e. 1 will be returned). Thus, the - * entry's "range" can be considered to be inclusive at its - * start, but exclusive at its end: e.g. [src, src + size). - * - * 3. The last case to consider is if the offset actually falls - * within the mapping entry's range. If this is the case, the - * offset is considered to be "equal to" the mapping entry and - * 0 will be returned. - * - * NOTE: If the offset is equal to the entry's source offset, - * this case applies and 0 will be returned. If the offset is - * equal to the entry's source plus its size, this case does - * *not* apply (see "NOTE" above for scenario 2), and 1 will be - * returned. 
- */ -static int -dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) -{ - const uint64_t *key = v_key; - const vdev_indirect_mapping_entry_phys_t *array_elem = - v_array_elem; - uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); - - if (*key < src_offset) { - return (-1); - } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { - return (0); - } else { - return (1); - } -} - -/* - * Returns the mapping entry for the given offset. - * - * It's possible that the given offset will not be in the mapping table - * (i.e. no mapping entries contain this offset), in which case, the - * return value value depends on the "next_if_missing" parameter. - * - * If the offset is not found in the table and "next_if_missing" is - * B_FALSE, then NULL will always be returned. The behavior is intended - * to allow consumers to get the entry corresponding to the offset - * parameter, iff the offset overlaps with an entry in the table. - * - * If the offset is not found in the table and "next_if_missing" is - * B_TRUE, then the entry nearest to the given offset will be returned, - * such that the entry's source offset is greater than the offset - * passed in (i.e. the "next" mapping entry in the table is returned, if - * the offset is missing from the table). If there are no entries whose - * source offset is greater than the passed in offset, NULL is returned. - */ -static vdev_indirect_mapping_entry_phys_t * -vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim, - uint64_t offset, boolean_t next_if_missing) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - ASSERT(vim->vim_phys->vimp_num_entries > 0); - - vdev_indirect_mapping_entry_phys_t *entry = NULL; - - uint64_t last = vim->vim_phys->vimp_num_entries - 1; - uint64_t base = 0; - - /* - * We don't define these inside of the while loop because we use - * their value in the case that offset isn't in the mapping. - */ - uint64_t mid; - int result; - - while (last >= base) { - mid = base + ((last - base) >> 1); - - result = dva_mapping_overlap_compare(&offset, - &vim->vim_entries[mid]); - - if (result == 0) { - entry = &vim->vim_entries[mid]; - break; - } else if (result < 0) { - last = mid - 1; - } else { - base = mid + 1; - } - } - - if (entry == NULL && next_if_missing) { - ASSERT3U(base, ==, last + 1); - ASSERT(mid == base || mid == last); - ASSERT3S(result, !=, 0); - - /* - * The offset we're looking for isn't actually contained - * in the mapping table, thus we need to return the - * closest mapping entry that is greater than the - * offset. We reuse the result of the last comparison, - * comparing the mapping entry at index "mid" and the - * offset. The offset is guaranteed to lie between - * indices one less than "mid", and one greater than - * "mid"; we just need to determine if offset is greater - * than, or less than the mapping entry contained at - * index "mid". - */ - - uint64_t index; - if (result < 0) - index = mid; - else - index = mid + 1; - - ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries); - - if (index == vim->vim_phys->vimp_num_entries) { - /* - * If "index" is past the end of the entries - * array, then not only is the offset not in the - * mapping table, but it's actually greater than - * all entries in the table. In this case, we - * can't return a mapping entry greater than the - * offset (since none exist), so we return NULL. 
- */ - - ASSERT3S(dva_mapping_overlap_compare(&offset, - &vim->vim_entries[index - 1]), >, 0); - - return (NULL); - } else { - /* - * Just to be safe, we verify the offset falls - * in between the mapping entries at index and - * one less than index. Since we know the offset - * doesn't overlap an entry, and we're supposed - * to return the entry just greater than the - * offset, both of the following tests must be - * true. - */ - ASSERT3S(dva_mapping_overlap_compare(&offset, - &vim->vim_entries[index]), <, 0); - IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, - &vim->vim_entries[index - 1]) > 0); - - return (&vim->vim_entries[index]); - } - } else { - return (entry); - } -} - -vdev_indirect_mapping_entry_phys_t * -vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, - uint64_t offset) -{ - return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, - B_FALSE)); -} - -vdev_indirect_mapping_entry_phys_t * -vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, - uint64_t offset) -{ - return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, - B_TRUE)); -} - -void -vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - if (vim->vim_phys->vimp_num_entries > 0) { - uint64_t map_size = vdev_indirect_mapping_size(vim); - kmem_free(vim->vim_entries, map_size); - vim->vim_entries = NULL; - } - - dmu_buf_rele(vim->vim_dbuf, vim); - - vim->vim_objset = NULL; - vim->vim_object = 0; - vim->vim_dbuf = NULL; - vim->vim_phys = NULL; - - kmem_free(vim, sizeof (*vim)); -} - -uint64_t -vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) -{ - uint64_t object; - ASSERT(dmu_tx_is_syncing(tx)); - uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; - - if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - bonus_size = sizeof (vdev_indirect_mapping_phys_t); - } - - object = dmu_object_alloc(os, - DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OTN_UINT64_METADATA, bonus_size, - tx); - - if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - dmu_buf_t *dbuf; - vdev_indirect_mapping_phys_t *vimp; - - VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); - vimp = dbuf->db_data; - vimp->vimp_counts_object = dmu_object_alloc(os, - DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OT_NONE, 0, tx); - spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - dmu_buf_rele(dbuf, FTAG); - } - - return (object); -} - - -vdev_indirect_mapping_t * -vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) -{ - vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); - dmu_object_info_t doi; - VERIFY0(dmu_object_info(os, mapping_object, &doi)); - - vim->vim_objset = os; - vim->vim_object = mapping_object; - - VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, - &vim->vim_dbuf)); - vim->vim_phys = vim->vim_dbuf->db_data; - - vim->vim_havecounts = - (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); - - if (vim->vim_phys->vimp_num_entries > 0) { - uint64_t map_size = vdev_indirect_mapping_size(vim); - vim->vim_entries = kmem_alloc(map_size, KM_SLEEP); - VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, - vim->vim_entries, DMU_READ_PREFETCH)); - } - - ASSERT(vdev_indirect_mapping_verify(vim)); - - return (vim); -} - -void -vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) -{ - vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); - if (vim->vim_havecounts) { - 
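
The lookup removed above combines a three-way comparator over half-open source ranges [src, src + size) with a "return the next entry if the offset falls in a gap" mode. A compact sketch of the same idea over a plain array; it recomputes the insertion point rather than reusing the last comparison as the original does, and the names are illustrative:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* A mapping entry covering the half-open source range [src, src + size). */
    struct map_entry { uint64_t src; uint64_t size; };

    /* <0: offset before entry, 0: inside [src, src+size), >0: at or past its end. */
    static int
    overlap_compare(uint64_t offset, const struct map_entry *e)
    {
            if (offset < e->src)
                    return (-1);
            if (offset < e->src + e->size)
                    return (0);
            return (1);
    }

    /*
     * Binary search for the entry containing 'offset'.  If none does and
     * next_if_missing is set, return the first entry whose src is greater
     * than 'offset' (or NULL when the offset lies past every entry).
     */
    static const struct map_entry *
    entry_for_offset(const struct map_entry *e, size_t n, uint64_t offset,
        int next_if_missing)
    {
            size_t lo = 0, hi = n;

            while (lo < hi) {
                    size_t mid = lo + (hi - lo) / 2;
                    int c = overlap_compare(offset, &e[mid]);

                    if (c == 0)
                            return (&e[mid]);
                    if (c < 0)
                            hi = mid;
                    else
                            lo = mid + 1;
            }
            /* 'lo' is now the index of the first entry starting past 'offset'. */
            if (next_if_missing && lo < n)
                    return (&e[lo]);
            return (NULL);
    }

    int
    main(void)
    {
            struct map_entry m[] = { {0x0, 0x100}, {0x200, 0x100}, {0x400, 0x80} };
            const struct map_entry *hit = entry_for_offset(m, 3, 0x150, 1);

            assert(hit == &m[1]);   /* 0x150 is in a gap; the next entry wins */
            printf("next entry src=0x%llx\n", (unsigned long long)hit->src);
            return (0);
    }
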
VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, - tx)); - spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - } - vdev_indirect_mapping_close(vim); - - VERIFY0(dmu_object_free(os, object, tx)); -} - -/* - * Append the list of vdev_indirect_mapping_entry_t's to the on-disk - * mapping object. Also remove the entries from the list and free them. - * This also implicitly extends the max_offset of the mapping (to the end - * of the last entry). - */ -void -vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, - list_t *list, dmu_tx_t *tx) -{ - vdev_indirect_mapping_entry_phys_t *mapbuf; - uint64_t old_size; - uint32_t *countbuf = NULL; - vdev_indirect_mapping_entry_phys_t *old_entries; - uint64_t old_count; - uint64_t entries_written = 0; - - ASSERT(vdev_indirect_mapping_verify(vim)); - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); - ASSERT(!list_is_empty(list)); - - old_size = vdev_indirect_mapping_size(vim); - old_entries = vim->vim_entries; - old_count = vim->vim_phys->vimp_num_entries; - - dmu_buf_will_dirty(vim->vim_dbuf, tx); - - mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); - if (vim->vim_havecounts) { - countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); - ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, - SPA_FEATURE_OBSOLETE_COUNTS)); - } - while (!list_is_empty(list)) { - uint64_t i; - /* - * Write entries from the list to the - * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. - */ - for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { - vdev_indirect_mapping_entry_t *entry = - list_remove_head(list); - if (entry == NULL) - break; - - uint64_t size = - DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); - uint64_t src_offset = - DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); - - /* - * We shouldn't be adding an entry which is fully - * obsolete. - */ - ASSERT3U(entry->vime_obsolete_count, <, size); - IMPLY(entry->vime_obsolete_count != 0, - vim->vim_havecounts); - - mapbuf[i] = entry->vime_mapping; - if (vim->vim_havecounts) - countbuf[i] = entry->vime_obsolete_count; - - vim->vim_phys->vimp_bytes_mapped += size; - ASSERT3U(src_offset, >=, - vim->vim_phys->vimp_max_offset); - vim->vim_phys->vimp_max_offset = src_offset + size; - - entries_written++; - - kmem_free(entry, sizeof (*entry)); - } - dmu_write(vim->vim_objset, vim->vim_object, - vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), - i * sizeof (*mapbuf), - mapbuf, tx); - if (vim->vim_havecounts) { - dmu_write(vim->vim_objset, - vim->vim_phys->vimp_counts_object, - vim->vim_phys->vimp_num_entries * - sizeof (*countbuf), - i * sizeof (*countbuf), countbuf, tx); - } - vim->vim_phys->vimp_num_entries += i; - } - zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); - if (vim->vim_havecounts) - zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE); - - /* - * Update the entry array to reflect the new entries. First, copy - * over any old entries then read back the new entries we just wrote. 
- */ - uint64_t new_size = vdev_indirect_mapping_size(vim); - ASSERT3U(new_size, >, old_size); - ASSERT3U(new_size - old_size, ==, - entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); - vim->vim_entries = kmem_alloc(new_size, KM_SLEEP); - if (old_size > 0) { - bcopy(old_entries, vim->vim_entries, old_size); - kmem_free(old_entries, old_size); - } - VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, - new_size - old_size, &vim->vim_entries[old_count], - DMU_READ_PREFETCH)); - - zfs_dbgmsg("txg %llu: wrote %llu entries to " - "indirect mapping obj %llu; max offset=0x%llx", - (u_longlong_t)dmu_tx_get_txg(tx), - (u_longlong_t)entries_written, - (u_longlong_t)vim->vim_object, - (u_longlong_t)vim->vim_phys->vimp_max_offset); -} - -/* - * Increment the relevant counts for the specified offset and length. - * The counts array must be obtained from - * vdev_indirect_mapping_load_obsolete_counts(). - */ -void -vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, - uint64_t offset, uint64_t length, uint32_t *counts) -{ - vdev_indirect_mapping_entry_phys_t *mapping; - uint64_t index; - - mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); - - ASSERT(length > 0); - ASSERT3P(mapping, !=, NULL); - - index = mapping - vim->vim_entries; - - while (length > 0) { - ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); - - uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); - uint64_t inner_offset = offset - - DVA_MAPPING_GET_SRC_OFFSET(mapping); - VERIFY3U(inner_offset, <, size); - uint64_t inner_size = MIN(length, size - inner_offset); - - VERIFY3U(counts[index] + inner_size, <=, size); - counts[index] += inner_size; - - offset += inner_size; - length -= inner_size; - mapping++; - index++; - } -} - -typedef struct load_obsolete_space_map_arg { - vdev_indirect_mapping_t *losma_vim; - uint32_t *losma_counts; -} load_obsolete_space_map_arg_t; - -static int -load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) -{ - load_obsolete_space_map_arg_t *losma = arg; - ASSERT3S(sme->sme_type, ==, SM_ALLOC); - - vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, - sme->sme_offset, sme->sme_run, losma->losma_counts); - - return (0); -} - -/* - * Modify the counts (increment them) based on the spacemap. - */ -void -vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, - uint32_t *counts, space_map_t *obsolete_space_sm) -{ - load_obsolete_space_map_arg_t losma; - losma.losma_counts = counts; - losma.losma_vim = vim; - VERIFY0(space_map_iterate(obsolete_space_sm, - space_map_length(obsolete_space_sm), - load_obsolete_sm_callback, &losma)); -} - -/* - * Read the obsolete counts from disk, returning them in an array. 
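
The obsolete-count update removed above walks consecutive mapping entries and charges each one MIN(remaining length, bytes left in the entry), advancing the offset as it goes. A standalone sketch of that walk with simplified types; the entry layout and names are illustrative:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct map_entry { uint64_t src; uint64_t size; }; /* [src, src + size) */

    #define MIN(a, b)   ((a) < (b) ? (a) : (b))

    /*
     * Charge 'length' obsolete bytes starting at 'offset' against consecutive
     * mapping entries.  'index' must be the entry containing 'offset'.
     */
    static void
    charge_obsolete(const struct map_entry *e, size_t n, uint32_t *counts,
        size_t index, uint64_t offset, uint64_t length)
    {
            while (length > 0) {
                    assert(index < n);

                    uint64_t inner_offset = offset - e[index].src;
                    uint64_t inner_size =
                        MIN(length, e[index].size - inner_offset);

                    assert(inner_offset < e[index].size);
                    counts[index] += inner_size;
                    assert(counts[index] <= e[index].size);

                    offset += inner_size;
                    length -= inner_size;
                    index++;
            }
    }

    int
    main(void)
    {
            /* Two adjacent entries; one freed run splits across both of them. */
            struct map_entry e[] = { {0x0, 0x100}, {0x100, 0x200} };
            uint32_t counts[2] = { 0, 0 };

            charge_obsolete(e, 2, counts, 0, 0x80, 0x180);
            printf("counts: %u %u\n", counts[0], counts[1]);  /* 128 256 */
            return (0);
    }
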
- */ -uint32_t * -vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - uint64_t counts_size = - vim->vim_phys->vimp_num_entries * sizeof (uint32_t); - uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP); - if (vim->vim_havecounts) { - VERIFY0(dmu_read(vim->vim_objset, - vim->vim_phys->vimp_counts_object, - 0, counts_size, - counts, DMU_READ_PREFETCH)); - } else { - bzero(counts, counts_size); - } - return (counts); -} - -extern void -vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim, - uint32_t *counts) -{ - ASSERT(vdev_indirect_mapping_verify(vim)); - - kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c +++ /dev/null @@ -1,782 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Maximum number of metaslabs per group that can be initialized - * simultaneously. - */ -int max_initialize_ms = 3; - -/* - * Value that is written to disk during initialization. - */ -uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL; - -/* maximum number of I/Os outstanding per leaf vdev */ -int zfs_initialize_limit = 1; - -/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ -uint64_t zfs_initialize_chunk_size = 1024 * 1024; - -static boolean_t -vdev_initialize_should_stop(vdev_t *vd) -{ - return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); -} - -static void -vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) -{ - /* - * We pass in the guid instead of the vdev_t since the vdev may - * have been freed prior to the sync task being processed. This - * happens when a vdev is detached as we call spa_config_vdev_exit(), - * stop the intializing thread, schedule the sync task, and free - * the vdev. Later when the scheduled sync task is invoked, it would - * find that the vdev has been freed. 
- */ - uint64_t guid = *(uint64_t *)arg; - uint64_t txg = dmu_tx_get_txg(tx); - kmem_free(arg, sizeof (uint64_t)); - - vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) - return; - - uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; - vd->vdev_initialize_offset[txg & TXG_MASK] = 0; - - VERIFY(vd->vdev_leaf_zap != 0); - - objset_t *mos = vd->vdev_spa->spa_meta_objset; - - if (last_offset > 0) { - vd->vdev_initialize_last_offset = last_offset; - VERIFY0(zap_update(mos, vd->vdev_leaf_zap, - VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, - sizeof (last_offset), 1, &last_offset, tx)); - } - if (vd->vdev_initialize_action_time > 0) { - uint64_t val = (uint64_t)vd->vdev_initialize_action_time; - VERIFY0(zap_update(mos, vd->vdev_leaf_zap, - VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), - 1, &val, tx)); - } - - uint64_t initialize_state = vd->vdev_initialize_state; - VERIFY0(zap_update(mos, vd->vdev_leaf_zap, - VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, - &initialize_state, tx)); -} - -static void -vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) -{ - ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); - spa_t *spa = vd->vdev_spa; - - if (new_state == vd->vdev_initialize_state) - return; - - /* - * Copy the vd's guid, this will be freed by the sync task. - */ - uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - *guid = vd->vdev_guid; - - /* - * If we're suspending, then preserving the original start time. - */ - if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { - vd->vdev_initialize_action_time = gethrestime_sec(); - } - vd->vdev_initialize_state = new_state; - - dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, - guid, 2, ZFS_SPACE_CHECK_RESERVED, tx); - - switch (new_state) { - case VDEV_INITIALIZE_ACTIVE: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s activated", vd->vdev_path); - break; - case VDEV_INITIALIZE_SUSPENDED: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s suspended", vd->vdev_path); - break; - case VDEV_INITIALIZE_CANCELED: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s canceled", vd->vdev_path); - break; - case VDEV_INITIALIZE_COMPLETE: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s complete", vd->vdev_path); - break; - default: - panic("invalid state %llu", (unsigned long long)new_state); - } - - dmu_tx_commit(tx); -} - -static void -vdev_initialize_cb(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - mutex_enter(&vd->vdev_initialize_io_lock); - if (zio->io_error == ENXIO && !vdev_writeable(vd)) { - /* - * The I/O failed because the vdev was unavailable; roll the - * last offset back. (This works because spa_sync waits on - * spa_txg_zio before it runs sync tasks.) - */ - uint64_t *off = - &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; - *off = MIN(*off, zio->io_offset); - } else { - /* - * Since initializing is best-effort, we ignore I/O errors and - * rely on vdev_probe to determine if the errors are more - * critical. 
- */ - if (zio->io_error != 0) - vd->vdev_stat.vs_initialize_errors++; - - vd->vdev_initialize_bytes_done += zio->io_orig_size; - } - ASSERT3U(vd->vdev_initialize_inflight, >, 0); - vd->vdev_initialize_inflight--; - cv_broadcast(&vd->vdev_initialize_io_cv); - mutex_exit(&vd->vdev_initialize_io_lock); - - spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); -} - -/* Takes care of physical writing and limiting # of concurrent ZIOs. */ -static int -vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) -{ - spa_t *spa = vd->vdev_spa; - - /* Limit inflight initializing I/Os */ - mutex_enter(&vd->vdev_initialize_io_lock); - while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { - cv_wait(&vd->vdev_initialize_io_cv, - &vd->vdev_initialize_io_lock); - } - vd->vdev_initialize_inflight++; - mutex_exit(&vd->vdev_initialize_io_lock); - - dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - uint64_t txg = dmu_tx_get_txg(tx); - - spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); - mutex_enter(&vd->vdev_initialize_lock); - - if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { - uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - *guid = vd->vdev_guid; - - /* This is the first write of this txg. */ - dsl_sync_task_nowait(spa_get_dsl(spa), - vdev_initialize_zap_update_sync, guid, 2, - ZFS_SPACE_CHECK_RESERVED, tx); - } - - /* - * We know the vdev struct will still be around since all - * consumers of vdev_free must stop the initialization first. - */ - if (vdev_initialize_should_stop(vd)) { - mutex_enter(&vd->vdev_initialize_io_lock); - ASSERT3U(vd->vdev_initialize_inflight, >, 0); - vd->vdev_initialize_inflight--; - mutex_exit(&vd->vdev_initialize_io_lock); - spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); - mutex_exit(&vd->vdev_initialize_lock); - dmu_tx_commit(tx); - return (SET_ERROR(EINTR)); - } - mutex_exit(&vd->vdev_initialize_lock); - - vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; - zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, - size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, - ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); - /* vdev_initialize_cb releases SCL_STATE_ALL */ - - dmu_tx_commit(tx); - - return (0); -} - -/* - * Translate a logical range to the physical range for the specified vdev_t. - * This function is initially called with a leaf vdev and will walk each - * parent vdev until it reaches a top-level vdev. Once the top-level is - * reached the physical range is initialized and the recursive function - * begins to unwind. As it unwinds it calls the parent's vdev specific - * translation function to do the real conversion. - */ -void -vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) -{ - /* - * Walk up the vdev tree - */ - if (vd != vd->vdev_top) { - vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); - } else { - /* - * We've reached the top-level vdev, initialize the - * physical range to the logical range and start to - * unwind. - */ - physical_rs->rs_start = logical_rs->rs_start; - physical_rs->rs_end = logical_rs->rs_end; - return; - } - - vdev_t *pvd = vd->vdev_parent; - ASSERT3P(pvd, !=, NULL); - ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); - - /* - * As this recursive function unwinds, translate the logical - * range into its physical components by calling the - * vdev specific translate function. 
- */ - range_seg_t intermediate = { 0 }; - pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); - - physical_rs->rs_start = intermediate.rs_start; - physical_rs->rs_end = intermediate.rs_end; -} - -/* - * Callback to fill each ABD chunk with zfs_initialize_value. len must be - * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD - * allocation will guarantee these for us. - */ -/* ARGSUSED */ -static int -vdev_initialize_block_fill(void *buf, size_t len, void *unused) -{ - ASSERT0(len % sizeof (uint64_t)); - for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { - *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; - } - return (0); -} - -static abd_t * -vdev_initialize_block_alloc() -{ - /* Allocate ABD for filler data */ - abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); - - ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); - (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, - vdev_initialize_block_fill, NULL); - - return (data); -} - -static void -vdev_initialize_block_free(abd_t *data) -{ - abd_free(data); -} - -static int -vdev_initialize_ranges(vdev_t *vd, abd_t *data) -{ - avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; - - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; - - /* Split range into legally-sized physical chunks */ - uint64_t writes_required = - ((size - 1) / zfs_initialize_chunk_size) + 1; - - for (uint64_t w = 0; w < writes_required; w++) { - int error; - - error = vdev_initialize_write(vd, - VDEV_LABEL_START_SIZE + rs->rs_start + - (w * zfs_initialize_chunk_size), - MIN(size - (w * zfs_initialize_chunk_size), - zfs_initialize_chunk_size), data); - if (error != 0) - return (error); - } - } - return (0); -} - -static void -vdev_initialize_mg_wait(metaslab_group_t *mg) -{ - ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); - while (mg->mg_initialize_updating) { - cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); - } -} - -static void -vdev_initialize_mg_mark(metaslab_group_t *mg) -{ - ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); - ASSERT(mg->mg_initialize_updating); - - while (mg->mg_ms_initializing >= max_initialize_ms) { - cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); - } - mg->mg_ms_initializing++; - ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms); -} - -/* - * Mark the metaslab as being initialized to prevent any allocations - * on this metaslab. We must also track how many metaslabs are currently - * being initialized within a metaslab group and limit them to prevent - * allocation failures from occurring because all metaslabs are being - * initialized. - */ -static void -vdev_initialize_ms_mark(metaslab_t *msp) -{ - ASSERT(!MUTEX_HELD(&msp->ms_lock)); - metaslab_group_t *mg = msp->ms_group; - - mutex_enter(&mg->mg_ms_initialize_lock); - - /* - * To keep an accurate count of how many threads are initializing - * a specific metaslab group, we only allow one thread to mark - * the metaslab group at a time. This ensures that the value of - * ms_initializing will be accurate when we decide to mark a metaslab - * group as being initialized. To do this we force all other threads - * to wait till the metaslab's mg_initialize_updating flag is no - * longer set. 
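
Initialization above fills every chunk with the 64-bit pattern 0xdeadbeefdeadbeef and splits each free segment into writes of at most zfs_initialize_chunk_size bytes. A standalone sketch of the fill callback and the chunking arithmetic; the constants mirror the defaults shown above, and the buffer handling is simplified (no ABD):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define CHUNK_SIZE  (1024 * 1024)       /* mirrors the 1 MiB default */
    #define MIN(a, b)   ((a) < (b) ? (a) : (b))

    static const uint64_t fill_pattern = 0xdeadbeefdeadbeefULL;

    /* Fill 'len' bytes (a multiple of 8) with the 64-bit initialize pattern. */
    static void
    block_fill(void *buf, size_t len)
    {
            assert(len % sizeof (uint64_t) == 0);
            for (size_t i = 0; i < len; i += sizeof (uint64_t))
                    memcpy((char *)buf + i, &fill_pattern, sizeof (fill_pattern));
    }

    int
    main(void)
    {
            uint64_t seg_start = 0, seg_size = (5 * CHUNK_SIZE) / 2; /* 2.5 MiB */
            void *buf = malloc(CHUNK_SIZE);

            if (buf == NULL)
                    return (1);
            block_fill(buf, CHUNK_SIZE);

            /* Split the segment into CHUNK_SIZE writes, as the removed code does. */
            uint64_t writes_required = ((seg_size - 1) / CHUNK_SIZE) + 1;
            for (uint64_t w = 0; w < writes_required; w++) {
                    uint64_t off = seg_start + w * CHUNK_SIZE;
                    uint64_t len = MIN(seg_size - w * CHUNK_SIZE, CHUNK_SIZE);
                    printf("write %llu bytes at offset %llu\n",
                        (unsigned long long)len, (unsigned long long)off);
            }
            free(buf);
            return (0);
    }

A 2.5 MiB segment produces three writes: two full 1 MiB chunks and a final 512 KiB remainder.
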
- */ - vdev_initialize_mg_wait(mg); - mg->mg_initialize_updating = B_TRUE; - if (msp->ms_initializing == 0) { - vdev_initialize_mg_mark(mg); - } - mutex_enter(&msp->ms_lock); - msp->ms_initializing++; - mutex_exit(&msp->ms_lock); - - mg->mg_initialize_updating = B_FALSE; - cv_broadcast(&mg->mg_ms_initialize_cv); - mutex_exit(&mg->mg_ms_initialize_lock); -} - -static void -vdev_initialize_ms_unmark(metaslab_t *msp) -{ - ASSERT(!MUTEX_HELD(&msp->ms_lock)); - metaslab_group_t *mg = msp->ms_group; - mutex_enter(&mg->mg_ms_initialize_lock); - mutex_enter(&msp->ms_lock); - if (--msp->ms_initializing == 0) { - mg->mg_ms_initializing--; - cv_broadcast(&mg->mg_ms_initialize_cv); - } - mutex_exit(&msp->ms_lock); - mutex_exit(&mg->mg_ms_initialize_lock); -} - -static void -vdev_initialize_calculate_progress(vdev_t *vd) -{ - ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || - spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); - ASSERT(vd->vdev_leaf_zap != 0); - - vd->vdev_initialize_bytes_est = 0; - vd->vdev_initialize_bytes_done = 0; - - for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { - metaslab_t *msp = vd->vdev_top->vdev_ms[i]; - mutex_enter(&msp->ms_lock); - - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; - - /* - * Convert the metaslab range to a physical range - * on our vdev. We use this to determine if we are - * in the middle of this metaslab range. - */ - range_seg_t logical_rs, physical_rs; - logical_rs.rs_start = msp->ms_start; - logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); - - if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { - vd->vdev_initialize_bytes_est += ms_free; - mutex_exit(&msp->ms_lock); - continue; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += ms_free; - vd->vdev_initialize_bytes_est += ms_free; - mutex_exit(&msp->ms_lock); - continue; - } - - /* - * If we get here, we're in the middle of initializing this - * metaslab. Load it and walk the free tree for more accurate - * progress estimation. 
- */ - VERIFY0(metaslab_load(msp)); - - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; - vdev_xlate(vd, &logical_rs, &physical_rs); - - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_initialize_bytes_est += size; - if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += size; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_start && - vd->vdev_initialize_last_offset < - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += - vd->vdev_initialize_last_offset - - physical_rs.rs_start; - } - } - mutex_exit(&msp->ms_lock); - } -} - -static void -vdev_initialize_load(vdev_t *vd) -{ - ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || - spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); - ASSERT(vd->vdev_leaf_zap != 0); - - if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || - vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { - int err = zap_lookup(vd->vdev_spa->spa_meta_objset, - vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, - sizeof (vd->vdev_initialize_last_offset), 1, - &vd->vdev_initialize_last_offset); - ASSERT(err == 0 || err == ENOENT); - } - - vdev_initialize_calculate_progress(vd); -} - - -/* - * Convert the logical range into a physcial range and add it to our - * avl tree. - */ -void -vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) -{ - vdev_t *vd = arg; - range_seg_t logical_rs, physical_rs; - logical_rs.rs_start = start; - logical_rs.rs_end = start + size; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); - - /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) - return; - - /* Pick up where we left off mid-range. */ - if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { - zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " - "(%llu, %llu)", vd->vdev_path, - (u_longlong_t)physical_rs.rs_start, - (u_longlong_t)physical_rs.rs_end, - (u_longlong_t)vd->vdev_initialize_last_offset, - (u_longlong_t)physical_rs.rs_end); - ASSERT3U(physical_rs.rs_end, >, - vd->vdev_initialize_last_offset); - physical_rs.rs_start = vd->vdev_initialize_last_offset; - } - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. 
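
The progress calculation removed above compares each free segment (after translation to physical addresses) with the last initialized offset: segments entirely behind the cursor count as done, segments ahead only grow the estimate, and a segment straddling the cursor contributes partially. A standalone sketch of the per-segment accounting, using the same comparisons as the original but illustrative types:

    #include <stdint.h>
    #include <stdio.h>

    struct range { uint64_t start, end; };          /* physical, half-open */
    struct progress { uint64_t bytes_done, bytes_est; };

    /* Account one free segment against the last initialized offset. */
    static void
    account_segment(struct progress *p, struct range seg, uint64_t last_offset)
    {
            uint64_t size = seg.end - seg.start;

            p->bytes_est += size;
            if (last_offset > seg.end)
                    p->bytes_done += size;          /* fully behind the cursor */
            else if (last_offset > seg.start && last_offset < seg.end)
                    p->bytes_done += last_offset - seg.start; /* straddles it */
            /* otherwise the segment is still entirely ahead of the cursor */
    }

    int
    main(void)
    {
            struct progress p = { 0, 0 };
            struct range segs[] = { {0, 100}, {150, 250}, {300, 400} };
            uint64_t last_offset = 200;

            for (int i = 0; i < 3; i++)
                    account_segment(&p, segs[i], last_offset);
            printf("done %llu of %llu\n", (unsigned long long)p.bytes_done,
                (unsigned long long)p.bytes_est);   /* done 150 of 300 */
            return (0);
    }
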
- */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } -} - -static void -vdev_initialize_thread(void *arg) -{ - vdev_t *vd = arg; - spa_t *spa = vd->vdev_spa; - int error = 0; - uint64_t ms_count = 0; - - ASSERT(vdev_is_concrete(vd)); - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - vd->vdev_initialize_last_offset = 0; - vdev_initialize_load(vd); - - abd_t *deadbeef = vdev_initialize_block_alloc(); - - vd->vdev_initialize_tree = range_tree_create(NULL, NULL); - - for (uint64_t i = 0; !vd->vdev_detached && - i < vd->vdev_top->vdev_ms_count; i++) { - metaslab_t *msp = vd->vdev_top->vdev_ms[i]; - - /* - * If we've expanded the top-level vdev or it's our - * first pass, calculate our progress. - */ - if (vd->vdev_top->vdev_ms_count != ms_count) { - vdev_initialize_calculate_progress(vd); - ms_count = vd->vdev_top->vdev_ms_count; - } - - vdev_initialize_ms_mark(msp); - mutex_enter(&msp->ms_lock); - VERIFY0(metaslab_load(msp)); - - range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, - vd); - mutex_exit(&msp->ms_lock); - - spa_config_exit(spa, SCL_CONFIG, FTAG); - error = vdev_initialize_ranges(vd, deadbeef); - vdev_initialize_ms_unmark(msp); - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - - range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); - if (error != 0) - break; - } - - spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_enter(&vd->vdev_initialize_io_lock); - while (vd->vdev_initialize_inflight > 0) { - cv_wait(&vd->vdev_initialize_io_cv, - &vd->vdev_initialize_io_lock); - } - mutex_exit(&vd->vdev_initialize_io_lock); - - range_tree_destroy(vd->vdev_initialize_tree); - vdev_initialize_block_free(deadbeef); - vd->vdev_initialize_tree = NULL; - - mutex_enter(&vd->vdev_initialize_lock); - if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { - vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); - } - ASSERT(vd->vdev_initialize_thread != NULL || - vd->vdev_initialize_inflight == 0); - - /* - * Drop the vdev_initialize_lock while we sync out the - * txg since it's possible that a device might be trying to - * come online and must check to see if it needs to restart an - * initialization. That thread will be holding the spa_config_lock - * which would prevent the txg_wait_synced from completing. - */ - mutex_exit(&vd->vdev_initialize_lock); - txg_wait_synced(spa_get_dsl(spa), 0); - mutex_enter(&vd->vdev_initialize_lock); - - vd->vdev_initialize_thread = NULL; - cv_broadcast(&vd->vdev_initialize_cv); - mutex_exit(&vd->vdev_initialize_lock); - thread_exit(); -} - -/* - * Initiates a device. Caller must hold vdev_initialize_lock. - * Device must be a leaf and not already be initializing. - */ -void -vdev_initialize(vdev_t *vd) -{ - ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); - ASSERT(vd->vdev_ops->vdev_op_leaf); - ASSERT(vdev_is_concrete(vd)); - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - ASSERT(!vd->vdev_detached); - ASSERT(!vd->vdev_initialize_exit_wanted); - ASSERT(!vd->vdev_top->vdev_removing); - - vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); - vd->vdev_initialize_thread = thread_create(NULL, 0, - vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); -} - -/* - * Stop initializng a device, with the resultant initialing state being - * tgt_state. Blocks until the initializing thread has exited. 
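
When ranges are re-added after a suspend, vdev_initialize_range_add() above skips segments already fully covered by the last initialized offset and clamps a straddling segment so work resumes mid-range. A small sketch of just that skip/clamp decision, with illustrative names:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    struct range { uint64_t start, end; };  /* physical, half-open */

    /*
     * Decide what part of 'seg' still needs initializing, given the offset we
     * had reached before suspending.  Returns 0 when nothing is left to do.
     */
    static int
    clamp_to_cursor(struct range *seg, uint64_t last_offset)
    {
            if (seg->end <= last_offset)
                    return (0);                     /* already fully visited */
            if (last_offset > seg->start)
                    seg->start = last_offset;       /* resume mid-range */
            assert(seg->end >= seg->start);
            return (seg->end > seg->start);
    }

    int
    main(void)
    {
            struct range a = { 0, 100 }, b = { 50, 200 }, c = { 300, 400 };
            uint64_t last_offset = 120;

            printf("a left: %d\n", clamp_to_cursor(&a, last_offset));      /* 0 */
            printf("b left: %d\n", clamp_to_cursor(&b, last_offset));      /* 1 */
            printf("b now starts at %llu\n", (unsigned long long)b.start); /* 120 */
            printf("c left: %d\n", clamp_to_cursor(&c, last_offset));      /* 1 */
            return (0);
    }
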
- * Caller must hold vdev_initialize_lock and must not be writing to the spa - * config, as the initializing thread may try to enter the config as a reader - * before exiting. - */ -void -vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state) -{ - spa_t *spa = vd->vdev_spa; - ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER)); - - ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); - ASSERT(vd->vdev_ops->vdev_op_leaf); - ASSERT(vdev_is_concrete(vd)); - - /* - * Allow cancel requests to proceed even if the initialize thread - * has stopped. - */ - if (vd->vdev_initialize_thread == NULL && - tgt_state != VDEV_INITIALIZE_CANCELED) { - return; - } - - vdev_initialize_change_state(vd, tgt_state); - vd->vdev_initialize_exit_wanted = B_TRUE; - while (vd->vdev_initialize_thread != NULL) - cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); - - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - vd->vdev_initialize_exit_wanted = B_FALSE; -} - -static void -vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state) -{ - if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { - mutex_enter(&vd->vdev_initialize_lock); - vdev_initialize_stop(vd, tgt_state); - mutex_exit(&vd->vdev_initialize_lock); - return; - } - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state); - } -} - -/* - * Convenience function to stop initializing of a vdev tree and set all - * initialize thread pointers to NULL. - */ -void -vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) -{ - vdev_initialize_stop_all_impl(vd, tgt_state); - - if (vd->vdev_spa->spa_sync_on) { - /* Make sure that our state has been synced to disk */ - txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); - } -} - -void -vdev_initialize_restart(vdev_t *vd) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); - - if (vd->vdev_leaf_zap != 0) { - mutex_enter(&vd->vdev_initialize_lock); - uint64_t initialize_state = VDEV_INITIALIZE_NONE; - int err = zap_lookup(vd->vdev_spa->spa_meta_objset, - vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, - sizeof (initialize_state), 1, &initialize_state); - ASSERT(err == 0 || err == ENOENT); - vd->vdev_initialize_state = initialize_state; - - uint64_t timestamp = 0; - err = zap_lookup(vd->vdev_spa->spa_meta_objset, - vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, - sizeof (timestamp), 1, ×tamp); - ASSERT(err == 0 || err == ENOENT); - vd->vdev_initialize_action_time = (time_t)timestamp; - - if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vd->vdev_offline) { - /* load progress for reporting, but don't resume */ - vdev_initialize_load(vd); - } else if (vd->vdev_initialize_state == - VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) { - vdev_initialize(vd); - } - - mutex_exit(&vd->vdev_initialize_lock); - } - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - vdev_initialize_restart(vd->vdev_child[i]); - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ /dev/null @@ -1,1559 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2017, Intel Corporation. - * Copyright 2019 Joyent, Inc. - */ - -/* - * Virtual Device Labels - * --------------------- - * - * The vdev label serves several distinct purposes: - * - * 1. Uniquely identify this device as part of a ZFS pool and confirm its - * identity within the pool. - * - * 2. Verify that all the devices given in a configuration are present - * within the pool. - * - * 3. Determine the uberblock for the pool. - * - * 4. In case of an import operation, determine the configuration of the - * toplevel vdev of which it is a part. - * - * 5. If an import operation cannot find all the devices in the pool, - * provide enough information to the administrator to determine which - * devices are missing. - * - * It is important to note that while the kernel is responsible for writing the - * label, it only consumes the information in the first three cases. The - * latter information is only consumed in userland when determining the - * configuration to import a pool. - * - * - * Label Organization - * ------------------ - * - * Before describing the contents of the label, it's important to understand how - * the labels are written and updated with respect to the uberblock. - * - * When the pool configuration is altered, either because it was newly created - * or a device was added, we want to update all the labels such that we can deal - * with fatal failure at any point. To this end, each disk has two labels which - * are updated before and after the uberblock is synced. Assuming we have - * labels and an uberblock with the following transaction groups: - * - * L1 UB L2 - * +------+ +------+ +------+ - * | | | | | | - * | t10 | | t10 | | t10 | - * | | | | | | - * +------+ +------+ +------+ - * - * In this stable state, the labels and the uberblock were all updated within - * the same transaction group (10). Each label is mirrored and checksummed, so - * that we can detect when we fail partway through writing the label. - * - * In order to identify which labels are valid, the labels are written in the - * following manner: - * - * 1. For each vdev, update 'L1' to the new label - * 2. Update the uberblock - * 3. For each vdev, update 'L2' to the new label - * - * Given arbitrary failure, we can determine the correct label to use based on - * the transaction group. If we fail after updating L1 but before updating the - * UB, we will notice that L1's transaction group is greater than the uberblock, - * so L2 must be valid. If we fail after writing the uberblock but before - * writing L2, we will notice that L2's transaction group is less than L1, and - * therefore L1 is valid. - * - * Another added complexity is that not every label is updated when the config - * is synced. 
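
The two failure cases described just above can be made concrete: after a partial three-step update (L1, uberblock, L2), exactly one of the paired labels carries a transaction group consistent with the uberblock, and that one is trusted. The following is only a model of that argument, not the real label-selection code, which scans all labels and uberblocks and also tolerates labels older than the uberblock, as the comment goes on to explain:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Pick which of the two per-disk labels to trust after a crash, given the
     * transaction groups found on disk.  A label written after the uberblock
     * (txg greater than the uberblock's) belongs to an interrupted update and
     * is ignored in favor of its partner.
     */
    static const char *
    valid_label(uint64_t l1_txg, uint64_t l2_txg, uint64_t ub_txg)
    {
            if (l1_txg > ub_txg)
                    return ("L2");  /* crashed between step 1 and step 2 */
            if (l2_txg < l1_txg)
                    return ("L1");  /* crashed between step 2 and step 3 */
            return ("either");      /* steady state: all in the same txg */
    }

    int
    main(void)
    {
            printf("%s\n", valid_label(11, 10, 10));        /* L2 */
            printf("%s\n", valid_label(11, 10, 11));        /* L1 */
            printf("%s\n", valid_label(10, 10, 10));        /* either */
            return (0);
    }
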
If we add a single device, we do not want to have to re-write - * every label for every device in the pool. This means that both L1 and L2 may - * be older than the pool uberblock, because the necessary information is stored - * on another vdev. - * - * - * On-disk Format - * -------------- - * - * The vdev label consists of two distinct parts, and is wrapped within the - * vdev_label_t structure. The label includes 8k of padding to permit legacy - * VTOC disk labels, but is otherwise ignored. - * - * The first half of the label is a packed nvlist which contains pool wide - * properties, per-vdev properties, and configuration information. It is - * described in more detail below. - * - * The latter half of the label consists of a redundant array of uberblocks. - * These uberblocks are updated whenever a transaction group is committed, - * or when the configuration is updated. When a pool is loaded, we scan each - * vdev for the 'best' uberblock. - * - * - * Configuration Information - * ------------------------- - * - * The nvlist describing the pool and vdev contains the following elements: - * - * version ZFS on-disk version - * name Pool name - * state Pool state - * txg Transaction group in which this label was written - * pool_guid Unique identifier for this pool - * vdev_tree An nvlist describing vdev tree. - * features_for_read - * An nvlist of the features necessary for reading the MOS. - * - * Each leaf device label also contains the following: - * - * top_guid Unique ID for top-level vdev in which this is contained - * guid Unique ID for the leaf vdev - * - * The 'vs' configuration follows the format described in 'spa_config.c'. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static boolean_t vdev_trim_on_init = B_TRUE; -SYSCTL_DECL(_vfs_zfs_vdev); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RWTUN, - &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); - -/* - * Basic routines to read and write from a vdev label. - * Used throughout the rest of this file. - */ -uint64_t -vdev_label_offset(uint64_t psize, int l, uint64_t offset) -{ - ASSERT(offset < sizeof (vdev_label_t)); - ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); - - return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? - 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); -} - -/* - * Returns back the vdev label associated with the passed in offset. - */ -int -vdev_label_number(uint64_t psize, uint64_t offset) -{ - int l; - - if (offset >= psize - VDEV_LABEL_END_SIZE) { - offset -= psize - VDEV_LABEL_END_SIZE; - offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); - } - l = offset / sizeof (vdev_label_t); - return (l < VDEV_LABELS ? 
l : -1); -} - -static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) -{ - ASSERT( - spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || - spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); - ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); - - zio_nowait(zio_read_phys(zio, vd, - vdev_label_offset(vd->vdev_psize, l, offset), - size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); -} - -void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) -{ - ASSERT( - spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || - spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); - ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); - - zio_nowait(zio_write_phys(zio, vd, - vdev_label_offset(vd->vdev_psize, l, offset), - size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); -} - -static void -root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) -{ - spa_t *spa = vd->vdev_spa; - - if (vd != spa->spa_root_vdev) - return; - - /* provide either current or previous scan information */ - pool_scan_stat_t ps; - if (spa_scan_get_stats(spa, &ps) == 0) { - fnvlist_add_uint64_array(nvl, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, - sizeof (pool_scan_stat_t) / sizeof (uint64_t)); - } - - pool_removal_stat_t prs; - if (spa_removal_get_stats(spa, &prs) == 0) { - fnvlist_add_uint64_array(nvl, - ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, - sizeof (prs) / sizeof (uint64_t)); - } - - pool_checkpoint_stat_t pcs; - if (spa_checkpoint_get_stats(spa, &pcs) == 0) { - fnvlist_add_uint64_array(nvl, - ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, - sizeof (pcs) / sizeof (uint64_t)); - } -} - -/* - * Generate the nvlist representing this vdev's config. - */ -nvlist_t * -vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - vdev_config_flag_t flags) -{ - nvlist_t *nv = NULL; - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - nv = fnvlist_alloc(); - - fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); - if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); - - if (vd->vdev_path != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); - - if (vd->vdev_devid != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); - - if (vd->vdev_physpath != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, - vd->vdev_physpath); - - if (vd->vdev_fru != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); - - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. - */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity <= 2 && - spa_version(spa) >= SPA_VERSION_RAIDZ2) || - (vd->vdev_nparity <= 3 && - spa_version(spa) >= SPA_VERSION_RAIDZ3)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. 
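
vdev_label_offset() and vdev_label_number() above place four labels per device, two at the front and two at the back, and map between label index and absolute device offset. A standalone sketch of that arithmetic; the 256 KiB label size is an assumption here (it matches sizeof (vdev_label_t) in this tree), and a real device size is always a multiple of it:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define LABEL_SIZE  (256ULL * 1024) /* sizeof (vdev_label_t), assumed */
    #define LABELS      4               /* two at the front, two at the back */

    /* Absolute device offset of byte 'offset' within label 'l'. */
    static uint64_t
    label_offset(uint64_t psize, int l, uint64_t offset)
    {
            assert(offset < LABEL_SIZE);
            return (offset + l * LABEL_SIZE +
                (l < LABELS / 2 ? 0 : psize - LABELS * LABEL_SIZE));
    }

    /* Inverse: which label does an absolute device offset fall in? (-1: none) */
    static int
    label_number(uint64_t psize, uint64_t offset)
    {
            if (offset >= psize - 2 * LABEL_SIZE) {
                    offset -= psize - 2 * LABEL_SIZE;
                    offset += (LABELS / 2) * LABEL_SIZE;
            }
            int l = (int)(offset / LABEL_SIZE);
            return (l < LABELS ? l : -1);
    }

    int
    main(void)
    {
            uint64_t psize = 16ULL * 1024 * 1024 * 1024;    /* a 16 GiB device */

            for (int l = 0; l < LABELS; l++) {
                    uint64_t off = label_offset(psize, l, 0);
                    printf("label %d at offset %llu -> maps back to %d\n",
                        l, (unsigned long long)off, label_number(psize, off));
            }
            return (0);
    }
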
- */ - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } - - if (vd->vdev_wholedisk != -1ULL) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - vd->vdev_wholedisk); - - if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); - - if (vd->vdev_isspare) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); - - if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && - vd == vd->vdev_top) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, - vd->vdev_ms_array); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, - vd->vdev_ms_shift); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, - vd->vdev_asize); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); - if (vd->vdev_removing) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, - vd->vdev_removing); - } - - /* zpool command expects alloc class data */ - if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { - const char *bias = NULL; - - switch (vd->vdev_alloc_bias) { - case VDEV_BIAS_LOG: - bias = VDEV_ALLOC_BIAS_LOG; - break; - case VDEV_BIAS_SPECIAL: - bias = VDEV_ALLOC_BIAS_SPECIAL; - break; - case VDEV_BIAS_DEDUP: - bias = VDEV_ALLOC_BIAS_DEDUP; - break; - default: - ASSERT3U(vd->vdev_alloc_bias, ==, - VDEV_BIAS_NONE); - } - fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, - bias); - } - } - - if (vd->vdev_dtl_sm != NULL) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, - space_map_object(vd->vdev_dtl_sm)); - } - - if (vic->vic_mapping_object != 0) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, - vic->vic_mapping_object); - } - - if (vic->vic_births_object != 0) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, - vic->vic_births_object); - } - - if (vic->vic_prev_indirect_vdev != UINT64_MAX) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, - vic->vic_prev_indirect_vdev); - } - - if (vd->vdev_crtxg) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); - - if (flags & VDEV_CONFIG_MOS) { - if (vd->vdev_leaf_zap != 0) { - ASSERT(vd->vdev_ops->vdev_op_leaf); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, - vd->vdev_leaf_zap); - } - - if (vd->vdev_top_zap != 0) { - ASSERT(vd == vd->vdev_top); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, - vd->vdev_top_zap); - } - } - - if (getstats) { - vdev_stat_t vs; - - vdev_get_stats(vd, &vs); - fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); - - root_vdev_actions_getprogress(vd, nv); - - /* - * Note: this can be called from open context - * (spa_get_stats()), so we need the rwlock to prevent - * the mapping from being changed by condensing. - */ - rw_enter(&vd->vdev_indirect_rwlock, RW_READER); - if (vd->vdev_indirect_mapping != NULL) { - ASSERT(vd->vdev_indirect_births != NULL); - vdev_indirect_mapping_t *vim = - vd->vdev_indirect_mapping; - fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, - vdev_indirect_mapping_size(vim)); - } - rw_exit(&vd->vdev_indirect_rwlock); - if (vd->vdev_mg != NULL && - vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { - /* - * Compute approximately how much memory would be used - * for the indirect mapping if this device were to - * be removed. - * - * Note: If the frag metric is invalid, then not - * enough metaslabs have been converted to have - * histograms. 
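/*
 * root_vdev_actions_getprogress() above publishes the scan, removal and
 * checkpoint statistics as flat uint64_t arrays whose element count is simply
 * sizeof (struct) / sizeof (uint64_t). A small userland sketch of that
 * convention; the sk_scan_stat_t fields are hypothetical stand-ins, not the
 * real pool_scan_stat_t layout.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

typedef struct sk_scan_stat {
	uint64_t ss_state;
	uint64_t ss_examined;
	uint64_t ss_to_examine;
	uint64_t ss_errors;
} sk_scan_stat_t;

int
main(void)
{
	sk_scan_stat_t ps = { 1, 1000, 5000, 0 };
	uint64_t packed[sizeof (ps) / sizeof (uint64_t)];
	sk_scan_stat_t out;

	memcpy(packed, &ps, sizeof (ps));	/* producer: struct -> uint64 array */
	memcpy(&out, packed, sizeof (out));	/* consumer: uint64 array -> struct */

	printf("examined %llu of %llu\n",
	    (unsigned long long)out.ss_examined,
	    (unsigned long long)out.ss_to_examine);
	return (0);
}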
- */ - uint64_t seg_count = 0; - uint64_t to_alloc = vd->vdev_stat.vs_alloc; - - /* - * There are the same number of allocated segments - * as free segments, so we will have at least one - * entry per free segment. However, small free - * segments (smaller than vdev_removal_max_span) - * will be combined with adjacent allocated segments - * as a single mapping. - */ - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - if (1ULL << (i + 1) < vdev_removal_max_span) { - to_alloc += - vd->vdev_mg->mg_histogram[i] << - i + 1; - } else { - seg_count += - vd->vdev_mg->mg_histogram[i]; - } - } - - /* - * The maximum length of a mapping is - * zfs_remove_max_segment, so we need at least one entry - * per zfs_remove_max_segment of allocated data. - */ - seg_count += to_alloc / zfs_remove_max_segment; - - fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, - seg_count * - sizeof (vdev_indirect_mapping_entry_phys_t)); - } - } - - if (!vd->vdev_ops->vdev_op_leaf) { - nvlist_t **child; - int c, idx; - - ASSERT(!vd->vdev_ishole); - - child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), - KM_SLEEP); - - for (c = 0, idx = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - /* - * If we're generating an nvlist of removing - * vdevs then skip over any device which is - * not being removed. - */ - if ((flags & VDEV_CONFIG_REMOVING) && - !cvd->vdev_removing) - continue; - - child[idx++] = vdev_config_generate(spa, cvd, - getstats, flags); - } - - if (idx) { - fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - child, idx); - } - - for (c = 0; c < idx; c++) - nvlist_free(child[c]); - - kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); - - } else { - const char *aux = NULL; - - if (vd->vdev_offline && !vd->vdev_tmpoffline) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); - if (vd->vdev_resilver_txg != 0) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, - vd->vdev_resilver_txg); - if (vd->vdev_faulted) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); - if (vd->vdev_degraded) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); - if (vd->vdev_removed) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); - if (vd->vdev_unspare) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); - if (vd->vdev_ishole) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); - - switch (vd->vdev_stat.vs_aux) { - case VDEV_AUX_ERR_EXCEEDED: - aux = "err_exceeded"; - break; - - case VDEV_AUX_EXTERNAL: - aux = "external"; - break; - } - - if (aux != NULL) - fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); - - if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { - fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, - vd->vdev_orig_guid); - } - } - - return (nv); -} - -/* - * Generate a view of the top-level vdevs. If we currently have holes - * in the namespace, then generate an array which contains a list of holey - * vdevs. Additionally, add the number of top-level children that currently - * exist. 
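/*
 * A standalone sketch of the removal-mapping memory estimate above: small
 * free segments are folded into the data that must be mapped, large free
 * segments each terminate a mapping entry, and every zfs_remove_max_segment
 * bytes of mapped data needs at least one more entry. The histogram size,
 * tunables and per-entry size below are illustrative values, not the
 * kernel's definitions.
 */
#include <stdint.h>
#include <stdio.h>

#define	SK_HISTOGRAM_SIZE	37
#define	SK_REMOVAL_MAX_SPAN	(32ULL * 1024)		/* 32 KiB */
#define	SK_REMOVE_MAX_SEGMENT	(1ULL * 1024 * 1024)	/* 1 MiB */
#define	SK_ENTRY_SIZE		16			/* bytes per mapping entry */

static uint64_t
sk_indirect_size_estimate(const uint64_t hist[SK_HISTOGRAM_SIZE],
    uint64_t alloced)
{
	uint64_t seg_count = 0;
	uint64_t to_alloc = alloced;

	for (int i = 0; i < SK_HISTOGRAM_SIZE; i++) {
		if (1ULL << (i + 1) < SK_REMOVAL_MAX_SPAN) {
			/* Small free segments merge with their neighbours. */
			to_alloc += hist[i] << (i + 1);
		} else {
			/* Large free segments each end a mapping entry. */
			seg_count += hist[i];
		}
	}
	seg_count += to_alloc / SK_REMOVE_MAX_SEGMENT;

	return (seg_count * SK_ENTRY_SIZE);
}

int
main(void)
{
	uint64_t hist[SK_HISTOGRAM_SIZE] = { 0 };

	hist[12] = 100;	/* ~8 KiB free segments: folded into to_alloc */
	hist[20] = 10;	/* ~2 MiB free segments: one entry each */
	printf("estimated mapping size: %llu bytes\n", (unsigned long long)
	    sk_indirect_size_estimate(hist, 1ULL << 30));
	return (0);
}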
- */ -void -vdev_top_config_generate(spa_t *spa, nvlist_t *config) -{ - vdev_t *rvd = spa->spa_root_vdev; - uint64_t *array; - uint_t c, idx; - - array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); - - for (c = 0, idx = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - - if (tvd->vdev_ishole) { - array[idx++] = c; - } - } - - if (idx) { - VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, - array, idx) == 0); - } - - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, - rvd->vdev_children) == 0); - - kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); -} - -/* - * Returns the configuration from the label of the given vdev. For vdevs - * which don't have a txg value stored on their label (i.e. spares/cache) - * or have not been completely initialized (txg = 0) just return - * the configuration from the first valid label we find. Otherwise, - * find the most up-to-date label that does not exceed the specified - * 'txg' value. - */ -nvlist_t * -vdev_label_read_config(vdev_t *vd, uint64_t txg) -{ - spa_t *spa = vd->vdev_spa; - nvlist_t *config = NULL; - vdev_phys_t *vp; - abd_t *vp_abd; - zio_t *zio; - uint64_t best_txg = 0; - uint64_t label_txg = 0; - int error = 0; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE; - - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - - if (!vdev_readable(vd)) - return (NULL); - - vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); - vp = abd_to_buf(vp_abd); - -retry: - for (int l = 0; l < VDEV_LABELS; l++) { - nvlist_t *label = NULL; - - zio = zio_root(spa, NULL, NULL, flags); - - vdev_label_read(zio, vd, l, vp_abd, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL, flags); - - if (zio_wait(zio) == 0 && - nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), - &label, 0) == 0) { - /* - * Auxiliary vdevs won't have txg values in their - * labels and newly added vdevs may not have been - * completely initialized so just return the - * configuration from the first valid label we - * encounter. - */ - error = nvlist_lookup_uint64(label, - ZPOOL_CONFIG_POOL_TXG, &label_txg); - if ((error || label_txg == 0) && !config) { - config = label; - break; - } else if (label_txg <= txg && label_txg > best_txg) { - best_txg = label_txg; - nvlist_free(config); - config = fnvlist_dup(label); - } - } - - if (label != NULL) { - nvlist_free(label); - label = NULL; - } - } - - if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } - - /* - * We found a valid label but it didn't pass txg restrictions. - */ - if (config == NULL && label_txg != 0) { - vdev_dbgmsg(vd, "label discarded as txg is too large " - "(%llu > %llu)", (u_longlong_t)label_txg, - (u_longlong_t)txg); - } - - abd_free(vp_abd); - - return (config); -} - -/* - * Determine if a device is in use. The 'spare_guid' parameter will be filled - * in with the device guid if this spare is active elsewhere on the system. - */ -static boolean_t -vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, - uint64_t *spare_guid, uint64_t *l2cache_guid) -{ - spa_t *spa = vd->vdev_spa; - uint64_t state, pool_guid, device_guid, txg, spare_pool; - uint64_t vdtxg = 0; - nvlist_t *label; - - if (spare_guid) - *spare_guid = 0ULL; - if (l2cache_guid) - *l2cache_guid = 0ULL; - - /* - * Read the label, if any, and perform some basic sanity checks. 
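/*
 * vdev_label_read_config() above prefers the label with the highest txg that
 * does not exceed the caller's limit, and falls back to the first valid
 * label when a label carries no txg (spares, caches, freshly added vdevs).
 * A compact sketch of that selection rule; the sk_label_t type and its
 * fields are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct sk_label {
	int	 lbl_valid;	/* read and unpacked successfully */
	int	 lbl_has_txg;	/* a pool txg is recorded in the nvlist */
	uint64_t lbl_txg;
} sk_label_t;

static int
sk_pick_label(const sk_label_t lbl[4], uint64_t max_txg)
{
	uint64_t best_txg = 0;
	int best = -1;

	for (int l = 0; l < 4; l++) {
		if (!lbl[l].lbl_valid)
			continue;
		if (!lbl[l].lbl_has_txg || lbl[l].lbl_txg == 0) {
			/* No txg on this label: first valid one wins,
			 * but only if nothing better was found already. */
			if (best == -1)
				return (l);
			continue;
		}
		if (lbl[l].lbl_txg <= max_txg && lbl[l].lbl_txg > best_txg) {
			best_txg = lbl[l].lbl_txg;
			best = l;
		}
	}
	return (best);	/* -1: no usable label */
}

int
main(void)
{
	sk_label_t lbl[4] = {
		{ 1, 1, 90 }, { 1, 1, 100 }, { 0, 0, 0 }, { 1, 1, 120 }
	};

	printf("picked label %d\n", sk_pick_label(lbl, 110));	/* label 1 */
	return (0);
}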
- */ - if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) - return (B_FALSE); - - (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, - &vdtxg); - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, - &device_guid) != 0) { - nvlist_free(label); - return (B_FALSE); - } - - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, - &pool_guid) != 0 || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, - &txg) != 0)) { - nvlist_free(label); - return (B_FALSE); - } - - nvlist_free(label); - - /* - * Check to see if this device indeed belongs to the pool it claims to - * be a part of. The only way this is allowed is if the device is a hot - * spare (which we check for later on). - */ - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - !spa_guid_exists(pool_guid, device_guid) && - !spa_spare_exists(device_guid, NULL, NULL) && - !spa_l2cache_exists(device_guid, NULL)) - return (B_FALSE); - - /* - * If the transaction group is zero, then this an initialized (but - * unused) label. This is only an error if the create transaction - * on-disk is the same as the one we're using now, in which case the - * user has attempted to add the same vdev multiple times in the same - * transaction. - */ - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - txg == 0 && vdtxg == crtxg) - return (B_TRUE); - - /* - * Check to see if this is a spare device. We do an explicit check for - * spa_has_spare() here because it may be on our pending list of spares - * to add. We also check if it is an l2cache device. - */ - if (spa_spare_exists(device_guid, &spare_pool, NULL) || - spa_has_spare(spa, device_guid)) { - if (spare_guid) - *spare_guid = device_guid; - - switch (reason) { - case VDEV_LABEL_CREATE: - case VDEV_LABEL_L2CACHE: - return (B_TRUE); - - case VDEV_LABEL_REPLACE: - return (!spa_has_spare(spa, device_guid) || - spare_pool != 0ULL); - - case VDEV_LABEL_SPARE: - return (spa_has_spare(spa, device_guid)); - } - } - - /* - * Check to see if this is an l2cache device. - */ - if (spa_l2cache_exists(device_guid, NULL)) - return (B_TRUE); - - /* - * We can't rely on a pool's state if it's been imported - * read-only. Instead we look to see if the pools is marked - * read-only in the namespace and set the state to active. - */ - if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && - (spa = spa_by_guid(pool_guid, device_guid)) != NULL && - spa_mode(spa) == FREAD) - state = POOL_STATE_ACTIVE; - - /* - * If the device is marked ACTIVE, then this device is in use by another - * pool on the system. - */ - return (state == POOL_STATE_ACTIVE); -} - -/* - * Initialize a vdev label. We check to make sure each leaf device is not in - * use, and writable. We put down an initial label which we will later - * overwrite with a complete label. Note that it's important to do this - * sequentially, not in parallel, so that we catch cases of multiple use of the - * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with - * itself. 
- */ -int -vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) -{ - spa_t *spa = vd->vdev_spa; - nvlist_t *label; - vdev_phys_t *vp; - abd_t *vp_abd; - abd_t *pad2; - uberblock_t *ub; - abd_t *ub_abd; - zio_t *zio; - char *buf; - size_t buflen; - int error; - uint64_t spare_guid, l2cache_guid; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - for (int c = 0; c < vd->vdev_children; c++) - if ((error = vdev_label_init(vd->vdev_child[c], - crtxg, reason)) != 0) - return (error); - - /* Track the creation time for this vdev */ - vd->vdev_crtxg = crtxg; - - if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa)) - return (0); - - /* - * Dead vdevs cannot be initialized. - */ - if (vdev_is_dead(vd)) - return (SET_ERROR(EIO)); - - /* - * Determine if the vdev is in use. - */ - if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && - vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) - return (SET_ERROR(EBUSY)); - - /* - * If this is a request to add or replace a spare or l2cache device - * that is in use elsewhere on the system, then we must update the - * guid (which was initialized to a random value) to reflect the - * actual GUID (which is shared between multiple pools). - */ - if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && - spare_guid != 0ULL) { - uint64_t guid_delta = spare_guid - vd->vdev_guid; - - vd->vdev_guid += guid_delta; - - for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum += guid_delta; - - /* - * If this is a replacement, then we want to fallthrough to the - * rest of the code. If we're adding a spare, then it's already - * labeled appropriately and we can just return. - */ - if (reason == VDEV_LABEL_SPARE) - return (0); - ASSERT(reason == VDEV_LABEL_REPLACE || - reason == VDEV_LABEL_SPLIT); - } - - if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && - l2cache_guid != 0ULL) { - uint64_t guid_delta = l2cache_guid - vd->vdev_guid; - - vd->vdev_guid += guid_delta; - - for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - pvd->vdev_guid_sum += guid_delta; - - /* - * If this is a replacement, then we want to fallthrough to the - * rest of the code. If we're adding an l2cache, then it's - * already labeled appropriately and we can just return. - */ - if (reason == VDEV_LABEL_L2CACHE) - return (0); - ASSERT(reason == VDEV_LABEL_REPLACE); - } - - /* - * TRIM the whole thing, excluding the blank space and boot header - * as specified by ZFS On-Disk Specification (section 1.3), so that - * we start with a clean slate. - * It's just an optimization, so we don't care if it fails. - * Don't TRIM if removing so that we don't interfere with zpool - * disaster recovery. - */ - if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim && - (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE || - reason == VDEV_LABEL_L2CACHE)) - zio_wait(zio_trim(NULL, spa, vd, VDEV_SKIP_SIZE, - vd->vdev_psize - VDEV_SKIP_SIZE)); - - /* - * Initialize its label. - */ - vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); - abd_zero(vp_abd, sizeof (vdev_phys_t)); - vp = abd_to_buf(vp_abd); - - /* - * Generate a label describing the pool and our top-level vdev. - * We mark it as being from txg 0 to indicate that it's not - * really part of an active pool just yet. The labels will - * be written again with a meaningful txg by spa_sync(). 
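/*
 * vdev_label_init() above adopts a shared spare/l2cache guid by adding the
 * same (possibly wrapping) delta to the leaf's guid and to every ancestor's
 * guid_sum, so the tree's guid-sum check still balances afterwards. A
 * standalone sketch of that bookkeeping; sketch_vdev_t is a hypothetical,
 * flattened stand-in for vdev_t.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct sketch_vdev {
	struct sketch_vdev *v_parent;
	uint64_t v_guid;
	uint64_t v_guid_sum;	/* guid of this vdev plus all descendants */
} sketch_vdev_t;

static void
sketch_adopt_shared_guid(sketch_vdev_t *leaf, uint64_t shared_guid)
{
	/* Unsigned subtraction wraps, so the delta works even when the
	 * shared guid is numerically smaller than the current one. */
	uint64_t delta = shared_guid - leaf->v_guid;

	leaf->v_guid += delta;
	for (sketch_vdev_t *vd = leaf; vd != NULL; vd = vd->v_parent)
		vd->v_guid_sum += delta;
}

int
main(void)
{
	sketch_vdev_t root = { NULL, 100, 100 + 7 };
	sketch_vdev_t leaf = { &root, 7, 7 };

	sketch_adopt_shared_guid(&leaf, 3);
	printf("leaf guid %llu, root guid_sum %llu\n",
	    (unsigned long long)leaf.v_guid,
	    (unsigned long long)root.v_guid_sum);	/* 3 and 103 */
	return (0);
}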
- */ - if (reason == VDEV_LABEL_SPARE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { - /* - * For inactive hot spares, we generate a special label that - * identifies as a mutually shared hot spare. We write the - * label if we are adding a hot spare, or if we are removing an - * active hot spare (in which case we want to revert the - * labels). - */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_SPARE) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } else if (reason == VDEV_LABEL_L2CACHE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { - /* - * For level 2 ARC devices, add a special label. - */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_L2CACHE) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } else { - uint64_t txg = 0ULL; - - if (reason == VDEV_LABEL_SPLIT) - txg = spa->spa_uberblock.ub_txg; - label = spa_config_generate(spa, vd, txg, B_FALSE); - - /* - * Add our creation time. This allows us to detect multiple - * vdev uses as described above, and automatically expires if we - * fail. - */ - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, - crtxg) == 0); - } - - buf = vp->vp_nvlist; - buflen = sizeof (vp->vp_nvlist); - - error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); - if (error != 0) { - nvlist_free(label); - abd_free(vp_abd); - /* EFAULT means nvlist_pack ran out of room */ - return (error == EFAULT ? ENAMETOOLONG : EINVAL); - } - - /* - * Initialize uberblock template. - */ - ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_RING); - abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); - ub = abd_to_buf(ub_abd); - ub->ub_txg = 0; - - /* Initialize the 2nd padding area. */ - pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(pad2, VDEV_PAD_SIZE); - - /* - * Write everything in parallel. - */ -retry: - zio = zio_root(spa, NULL, NULL, flags); - - for (int l = 0; l < VDEV_LABELS; l++) { - - vdev_label_write(zio, vd, l, vp_abd, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL, flags); - - /* - * Skip the 1st padding area. - * Zero out the 2nd padding area where it might have - * left over data from previous filesystem format. - */ - vdev_label_write(zio, vd, l, pad2, - offsetof(vdev_label_t, vl_pad2), - VDEV_PAD_SIZE, NULL, NULL, flags); - - vdev_label_write(zio, vd, l, ub_abd, - offsetof(vdev_label_t, vl_uberblock), - VDEV_UBERBLOCK_RING, NULL, NULL, flags); - } - - error = zio_wait(zio); - - if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } - - nvlist_free(label); - abd_free(pad2); - abd_free(ub_abd); - abd_free(vp_abd); - - /* - * If this vdev hasn't been previously identified as a spare, then we - * mark it as such only if a) we are labeling it as a spare, or b) it - * exists as a spare elsewhere in the system. Do the same for - * level 2 ARC devices. 
- */ - if (error == 0 && !vd->vdev_isspare && - (reason == VDEV_LABEL_SPARE || - spa_spare_exists(vd->vdev_guid, NULL, NULL))) - spa_spare_add(vd); - - if (error == 0 && !vd->vdev_isl2cache && - (reason == VDEV_LABEL_L2CACHE || - spa_l2cache_exists(vd->vdev_guid, NULL))) - spa_l2cache_add(vd); - - return (error); -} - -int -vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) -{ - spa_t *spa = vd->vdev_spa; - zio_t *zio; - abd_t *pad2; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - int error; - - if (size > VDEV_PAD_SIZE) - return (EINVAL); - - if (!vd->vdev_ops->vdev_op_leaf) - return (ENODEV); - if (vdev_is_dead(vd)) - return (ENXIO); - - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(pad2, VDEV_PAD_SIZE); - abd_copy_from_buf(pad2, buf, size); - -retry: - zio = zio_root(spa, NULL, NULL, flags); - vdev_label_write(zio, vd, 0, pad2, - offsetof(vdev_label_t, vl_pad2), - VDEV_PAD_SIZE, NULL, NULL, flags); - error = zio_wait(zio); - if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { - flags |= ZIO_FLAG_TRYHARD; - goto retry; - } - - abd_free(pad2); - return (error); -} - -/* - * ========================================================================== - * uberblock load/sync - * ========================================================================== - */ - -/* - * Consider the following situation: txg is safely synced to disk. We've - * written the first uberblock for txg + 1, and then we lose power. When we - * come back up, we fail to see the uberblock for txg + 1 because, say, - * it was on a mirrored device and the replica to which we wrote txg + 1 - * is now offline. If we then make some changes and sync txg + 1, and then - * the missing replica comes back, then for a few seconds we'll have two - * conflicting uberblocks on disk with the same txg. The solution is simple: - * among uberblocks with equal txg, choose the one with the latest timestamp. - */ -static int -vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) -{ - int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); - - if (likely(cmp)) - return (cmp); - - cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); - if (likely(cmp)) - return (cmp); - - /* - * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware - * ZFS, e.g. zfsonlinux >= 0.7. - * - * If one ub has MMP and the other does not, they were written by - * different hosts, which matters for MMP. So we treat no MMP/no SEQ as - * a 0 value. - * - * Since timestamp and txg are the same if we get this far, either is - * acceptable for importing the pool. - */ - unsigned int seq1 = 0; - unsigned int seq2 = 0; - - if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) - seq1 = MMP_SEQ(ub1); - - if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) - seq2 = MMP_SEQ(ub2); - - return (AVL_CMP(seq1, seq2)); -} - -struct ubl_cbdata { - uberblock_t *ubl_ubbest; /* Best uberblock */ - vdev_t *ubl_vd; /* vdev associated with the above */ -}; - -static void -vdev_uberblock_load_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - spa_t *spa = zio->io_spa; - zio_t *rio = zio->io_private; - uberblock_t *ub = abd_to_buf(zio->io_abd); - struct ubl_cbdata *cbp = rio->io_private; - - ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); - - if (zio->io_error == 0 && uberblock_verify(ub) == 0) { - mutex_enter(&rio->io_lock); - if (ub->ub_txg <= spa->spa_load_max_txg && - vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { - /* - * Keep track of the vdev in which this uberblock - * was found. 
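/*
 * A standalone sketch of the three-level comparison vdev_uberblock_compare()
 * above describes: txg first, then timestamp, then the MMP sequence number,
 * with non-MMP-aware writers treated as sequence 0. The flattened sk_ub_t
 * type is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct sk_ub {
	uint64_t ub_txg;
	uint64_t ub_timestamp;
	int	 ub_mmp_valid;	/* writer recorded an MMP sequence */
	unsigned ub_mmp_seq;
} sk_ub_t;

#define	SK_CMP(a, b)	(((a) > (b)) - ((a) < (b)))

static int
sk_ub_compare(const sk_ub_t *u1, const sk_ub_t *u2)
{
	int cmp = SK_CMP(u1->ub_txg, u2->ub_txg);
	if (cmp != 0)
		return (cmp);

	cmp = SK_CMP(u1->ub_timestamp, u2->ub_timestamp);
	if (cmp != 0)
		return (cmp);

	/* Equal txg and timestamp: fall back to the MMP sequence, if any. */
	unsigned seq1 = u1->ub_mmp_valid ? u1->ub_mmp_seq : 0;
	unsigned seq2 = u2->ub_mmp_valid ? u2->ub_mmp_seq : 0;
	return (SK_CMP(seq1, seq2));
}

int
main(void)
{
	sk_ub_t a = { 100, 5000, 1, 3 }, b = { 100, 5000, 1, 7 };

	printf("%s wins\n", sk_ub_compare(&a, &b) > 0 ? "a" : "b");
	return (0);
}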
We will use this information later - * to obtain the config nvlist associated with - * this uberblock. - */ - *cbp->ubl_ubbest = *ub; - cbp->ubl_vd = vd; - } - mutex_exit(&rio->io_lock); - } - - abd_free(zio->io_abd); -} - -static void -vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, - struct ubl_cbdata *cbp) -{ - for (int c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { - for (int l = 0; l < VDEV_LABELS; l++) { - for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_read(zio, vd, l, - abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), - B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_load_done, zio, flags); - } - } - } -} - -/* - * Reads the 'best' uberblock from disk along with its associated - * configuration. First, we read the uberblock array of each label of each - * vdev, keeping track of the uberblock with the highest txg in each array. - * Then, we read the configuration from the same vdev as the best uberblock. - */ -void -vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) -{ - zio_t *zio; - spa_t *spa = rvd->vdev_spa; - struct ubl_cbdata cb; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; - - ASSERT(ub); - ASSERT(config); - - bzero(ub, sizeof (uberblock_t)); - *config = NULL; - - cb.ubl_ubbest = ub; - cb.ubl_vd = NULL; - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - zio = zio_root(spa, NULL, &cb, flags); - vdev_uberblock_load_impl(zio, rvd, flags, &cb); - (void) zio_wait(zio); - - /* - * It's possible that the best uberblock was discovered on a label - * that has a configuration which was written in a future txg. - * Search all labels on this vdev to find the configuration that - * matches the txg for our uberblock. - */ - if (cb.ubl_vd != NULL) { - vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " - "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); - - *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); - if (*config == NULL && spa->spa_extreme_rewind) { - vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " - "Trying again without txg restrictions."); - *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); - } - if (*config == NULL) { - vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); - } - } - spa_config_exit(spa, SCL_ALL, FTAG); -} - -/* - * On success, increment root zio's count of good writes. - * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). - */ -static void -vdev_uberblock_sync_done(zio_t *zio) -{ - uint64_t *good_writes = zio->io_private; - - if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) - atomic_inc_64(good_writes); -} - -/* - * Write the uberblock to all labels of all leaves of the specified vdev. - */ -static void -vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, - uberblock_t *ub, vdev_t *vd, int flags) -{ - for (uint64_t c = 0; c < vd->vdev_children; c++) { - vdev_uberblock_sync(zio, good_writes, - ub, vd->vdev_child[c], flags); - } - - if (!vd->vdev_ops->vdev_op_leaf) - return; - - if (!vdev_writeable(vd)) - return; - - int m = spa_multihost(vd->vdev_spa) ? 
MMP_BLOCKS_PER_LABEL : 0; - int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); - - /* Copy the uberblock_t into the ABD */ - abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); - abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); - - for (int l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ub_abd, - VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_sync_done, good_writes, - flags | ZIO_FLAG_DONT_PROPAGATE); - - abd_free(ub_abd); -} - -/* Sync the uberblocks to all vdevs in svd[] */ -int -vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) -{ - spa_t *spa = svd[0]->vdev_spa; - zio_t *zio; - uint64_t good_writes = 0; - - zio = zio_root(spa, NULL, NULL, flags); - - for (int v = 0; v < svdcount; v++) - vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); - - (void) zio_wait(zio); - - /* - * Flush the uberblocks to disk. This ensures that the odd labels - * are no longer needed (because the new uberblocks and the even - * labels are safely on disk), so it is safe to overwrite them. - */ - zio = zio_root(spa, NULL, NULL, flags); - - for (int v = 0; v < svdcount; v++) { - if (vdev_writeable(svd[v])) { - zio_flush(zio, svd[v]); - } - } - - (void) zio_wait(zio); - - return (good_writes >= 1 ? 0 : EIO); -} - -/* - * On success, increment the count of good writes for our top-level vdev. - */ -static void -vdev_label_sync_done(zio_t *zio) -{ - uint64_t *good_writes = zio->io_private; - - if (zio->io_error == 0) - atomic_inc_64(good_writes); -} - -/* - * If there weren't enough good writes, indicate failure to the parent. - */ -static void -vdev_label_sync_top_done(zio_t *zio) -{ - uint64_t *good_writes = zio->io_private; - - if (*good_writes == 0) - zio->io_error = SET_ERROR(EIO); - - kmem_free(good_writes, sizeof (uint64_t)); -} - -/* - * We ignore errors for log and cache devices, simply free the private data. - */ -static void -vdev_label_sync_ignore_done(zio_t *zio) -{ - kmem_free(zio->io_private, sizeof (uint64_t)); -} - -/* - * Write all even or odd labels to all leaves of the specified vdev. - */ -static void -vdev_label_sync(zio_t *zio, uint64_t *good_writes, - vdev_t *vd, int l, uint64_t txg, int flags) -{ - nvlist_t *label; - vdev_phys_t *vp; - abd_t *vp_abd; - char *buf; - size_t buflen; - - for (int c = 0; c < vd->vdev_children; c++) { - vdev_label_sync(zio, good_writes, - vd->vdev_child[c], l, txg, flags); - } - - if (!vd->vdev_ops->vdev_op_leaf) - return; - - if (!vdev_writeable(vd)) - return; - - /* - * Generate a label describing the top-level config to which we belong. - */ - label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - - vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); - abd_zero(vp_abd, sizeof (vdev_phys_t)); - vp = abd_to_buf(vp_abd); - - buf = vp->vp_nvlist; - buflen = sizeof (vp->vp_nvlist); - - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) { - for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp_abd, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), - vdev_label_sync_done, good_writes, - flags | ZIO_FLAG_DONT_PROPAGATE); - } - } - - abd_free(vp_abd); - nvlist_free(label); -} - -int -vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) -{ - list_t *dl = &spa->spa_config_dirty_list; - vdev_t *vd; - zio_t *zio; - int error; - - /* - * Write the new labels to disk. 
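/*
 * vdev_uberblock_sync() above picks the ring slot for a txg as ub_txg modulo
 * the number of usable slots, keeping the last MMP_BLOCKS_PER_LABEL slots
 * out of normal rotation when multihost is enabled. A sketch with
 * illustrative constants (128-entry ring, one reserved MMP slot); these are
 * not the kernel's derived values.
 */
#include <stdint.h>
#include <stdio.h>

#define	SK_UB_COUNT		128	/* uberblocks per label ring */
#define	SK_MMP_BLOCKS_PER_LABEL	1

static int
sk_ub_slot(uint64_t txg, int multihost)
{
	int m = multihost ? SK_MMP_BLOCKS_PER_LABEL : 0;

	return ((int)(txg % (SK_UB_COUNT - m)));
}

int
main(void)
{
	for (uint64_t txg = 125; txg < 130; txg++)
		printf("txg %llu -> slot %d\n", (unsigned long long)txg,
		    sk_ub_slot(txg, 1));
	return (0);
}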
- */ - zio = zio_root(spa, NULL, NULL, flags); - - for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { - uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), - KM_SLEEP); - - ASSERT(!vd->vdev_ishole); - - zio_t *vio = zio_null(zio, spa, NULL, - (vd->vdev_islog || vd->vdev_aux != NULL) ? - vdev_label_sync_ignore_done : vdev_label_sync_top_done, - good_writes, flags); - vdev_label_sync(vio, good_writes, vd, l, txg, flags); - zio_nowait(vio); - } - - error = zio_wait(zio); - - /* - * Flush the new labels to disk. - */ - zio = zio_root(spa, NULL, NULL, flags); - - for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) - zio_flush(zio, vd); - - (void) zio_wait(zio); - - return (error); -} - -/* - * Sync the uberblock and any changes to the vdev configuration. - * - * The order of operations is carefully crafted to ensure that - * if the system panics or loses power at any time, the state on disk - * is still transactionally consistent. The in-line comments below - * describe the failure semantics at each stage. - * - * Moreover, vdev_config_sync() is designed to be idempotent: if it fails - * at any time, you can just call it again, and it will resume its work. - */ -int -vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) -{ - spa_t *spa = svd[0]->vdev_spa; - uberblock_t *ub = &spa->spa_uberblock; - int error = 0; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; - - ASSERT(svdcount != 0); -retry: - /* - * Normally, we don't want to try too hard to write every label and - * uberblock. If there is a flaky disk, we don't want the rest of the - * sync process to block while we retry. But if we can't write a - * single label out, we should retry with ZIO_FLAG_TRYHARD before - * bailing out and declaring the pool faulted. - */ - if (error != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) - return (error); - flags |= ZIO_FLAG_TRYHARD; - } - - ASSERT(ub->ub_txg <= txg); - - /* - * If this isn't a resync due to I/O errors, - * and nothing changed in this transaction group, - * and the vdev configuration hasn't changed, - * then there's nothing to do. - */ - if (ub->ub_txg < txg) { - boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, - txg, spa->spa_mmp.mmp_delay); - - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) - return (0); - } - - if (txg > spa_freeze_txg(spa)) - return (0); - - ASSERT(txg <= spa->spa_final_txg); - - /* - * Flush the write cache of every disk that's been written to - * in this transaction group. This ensures that all blocks - * written in this txg will be committed to stable storage - * before any uberblock that references them. - */ - zio_t *zio = zio_root(spa, NULL, NULL, flags); - - for (vdev_t *vd = - txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; - vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) - zio_flush(zio, vd); - - (void) zio_wait(zio); - - /* - * Sync out the even labels (L0, L2) for every dirty vdev. If the - * system dies in the middle of this process, that's OK: all of the - * even labels that made it to disk will be newer than any uberblock, - * and will therefore be considered invalid. The odd labels (L1, L3), - * which have not yet been touched, will still be valid. We flush - * the new labels to disk to ensure that all even-label updates - * are committed to stable storage before the uberblock update. 
- */ - if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) { - zfs_dbgmsg("vdev_label_sync_list() returned error %d " - "for pool '%s' when syncing out the even labels " - "of dirty vdevs", error, spa_name(spa)); - } - goto retry; - } - - /* - * Sync the uberblocks to all vdevs in svd[]. - * If the system dies in the middle of this step, there are two cases - * to consider, and the on-disk state is consistent either way: - * - * (1) If none of the new uberblocks made it to disk, then the - * previous uberblock will be the newest, and the odd labels - * (which had not yet been touched) will be valid with respect - * to that uberblock. - * - * (2) If one or more new uberblocks made it to disk, then they - * will be the newest, and the even labels (which had all - * been successfully committed) will be valid with respect - * to the new uberblocks. - */ - if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) { - zfs_dbgmsg("vdev_uberblock_sync_list() returned error " - "%d for pool '%s'", error, spa_name(spa)); - } - goto retry; - } - - if (spa_multihost(spa)) - mmp_update_uberblock(spa, ub); - - /* - * Sync out odd labels for every dirty vdev. If the system dies - * in the middle of this process, the even labels and the new - * uberblocks will suffice to open the pool. The next time - * the pool is opened, the first thing we'll do -- before any - * user data is modified -- is mark every vdev dirty so that - * all labels will be brought up to date. We flush the new labels - * to disk to ensure that all odd-label updates are committed to - * stable storage before the next transaction group begins. - */ - if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { - if ((flags & ZIO_FLAG_TRYHARD) != 0) { - zfs_dbgmsg("vdev_label_sync_list() returned error %d " - "for pool '%s' when syncing out the odd labels of " - "dirty vdevs", error, spa_name(spa)); - } - goto retry;; - } - - trim_thread_wakeup(spa); - - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ /dev/null @@ -1,779 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for mirroring. 
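/*
 * A condensed sketch of the ordering vdev_config_sync() above enforces:
 * flush dirty data, write the even labels, write the uberblocks, then write
 * the odd labels, retrying the whole sequence once with TRYHARD if any stage
 * fails. The sk_* stage helpers are hypothetical stand-ins for the real zio
 * calls and are assumed to flush their own writes before returning.
 */
#include <stdio.h>

#define	SK_FLAG_TRYHARD	0x1

static int sk_flush_dirty_data(int flags) { (void)flags; return (0); }
static int sk_sync_even_labels(int flags) { (void)flags; return (0); }
static int sk_sync_uberblocks(int flags)  { (void)flags; return (0); }
static int sk_sync_odd_labels(int flags)  { (void)flags; return (0); }

static int
sk_config_sync(void)
{
	int flags = 0;
	int error = 0;

retry:
	if (error != 0) {
		if (flags & SK_FLAG_TRYHARD)
			return (error);		/* second failure: give up */
		flags |= SK_FLAG_TRYHARD;
	}

	/* 1. Data blocks of this txg must be stable before any label. */
	(void) sk_flush_dirty_data(flags);

	/* 2. Even labels (L0, L2): if we die here, the odd labels still
	 *    match the previous uberblock. */
	if ((error = sk_sync_even_labels(flags)) != 0)
		goto retry;

	/* 3. Uberblocks: whichever survives, a matching label set exists. */
	if ((error = sk_sync_uberblocks(flags)) != 0)
		goto retry;

	/* 4. Odd labels (L1, L3): bring the remaining labels up to date. */
	if ((error = sk_sync_odd_labels(flags)) != 0)
		goto retry;

	return (0);
}

int
main(void)
{
	printf("config sync: %d\n", sk_config_sync());
	return (0);
}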
- */ - -typedef struct mirror_child { - vdev_t *mc_vd; - uint64_t mc_offset; - int mc_error; - int mc_load; - uint8_t mc_tried; - uint8_t mc_skipped; - uint8_t mc_speculative; -} mirror_child_t; - -typedef struct mirror_map { - int *mm_preferred; - int mm_preferred_cnt; - int mm_children; - boolean_t mm_resilvering; - boolean_t mm_root; - mirror_child_t mm_child[]; -} mirror_map_t; - -static int vdev_mirror_shift = 21; - -#ifdef _KERNEL -SYSCTL_DECL(_vfs_zfs_vdev); -static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, - CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "ZFS VDEV Mirror"); -#endif - -/* - * The load configuration settings below are tuned by default for - * the case where all devices are of the same rotational type. - * - * If there is a mixture of rotating and non-rotating media, setting - * non_rotating_seek_inc to 0 may well provide better results as it - * will direct more reads to the non-rotating vdevs which are more - * likely to have a higher performance. - */ - -/* Rotating media load calculation configuration. */ -static int rotating_inc = 0; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN, - &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's"); -#endif - -static int rotating_seek_inc = 5; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN, - &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's"); -#endif - -static int rotating_seek_offset = 1 * 1024 * 1024; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN, - &rotating_seek_offset, 0, "Offset in bytes from the last I/O which " - "triggers a reduced rotating media seek increment"); -#endif - -/* Non-rotating media load calculation configuration. */ -static int non_rotating_inc = 0; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN, - &non_rotating_inc, 0, - "Non-rotating media load increment for non-seeking I/O's"); -#endif - -static int non_rotating_seek_inc = 1; -#ifdef _KERNEL -SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN, - &non_rotating_seek_inc, 0, - "Non-rotating media load increment for seeking I/O's"); -#endif - - -static inline size_t -vdev_mirror_map_size(int children) -{ - return (offsetof(mirror_map_t, mm_child[children]) + - sizeof(int) * children); -} - -static inline mirror_map_t * -vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) -{ - mirror_map_t *mm; - - mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); - mm->mm_children = children; - mm->mm_resilvering = resilvering; - mm->mm_root = root; - mm->mm_preferred = (int *)((uintptr_t)mm + - offsetof(mirror_map_t, mm_child[children])); - - return mm; -} - -static void -vdev_mirror_map_free(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - - kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); -} - -static const zio_vsd_ops_t vdev_mirror_vsd_ops = { - vdev_mirror_map_free, - zio_vsd_default_cksum_report -}; - -static int -vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) -{ - uint64_t lastoffset; - int load; - - /* All DVAs have equal weight at the root. */ - if (mm->mm_root) - return (INT_MAX); - - /* - * We don't return INT_MAX if the device is resilvering i.e. - * vdev_resilver_txg != 0 as when tested performance was slightly - * worse overall when resilvering with compared to without. - */ - - /* Standard load based on pending queue length. 
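/*
 * A simplified userland sketch of the single-allocation layout used by
 * vdev_mirror_map_alloc() above: one block holds the map header, the
 * flexible child array, and the preferred-index int array right behind it.
 * The sk_* types are hypothetical stand-ins for mirror_map_t/mirror_child_t.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

typedef struct sk_child {
	uint64_t c_offset;
	int	 c_error;
} sk_child_t;

typedef struct sk_map {
	int		*m_preferred;
	int		 m_children;
	sk_child_t	 m_child[];	/* flexible array member */
} sk_map_t;

static size_t
sk_map_size(int children)
{
	/* Header + child array, then the trailing preferred-index array. */
	return (offsetof(sk_map_t, m_child) +
	    sizeof (sk_child_t) * children + sizeof (int) * children);
}

static sk_map_t *
sk_map_alloc(int children)
{
	sk_map_t *mm = calloc(1, sk_map_size(children));

	if (mm == NULL)
		return (NULL);
	mm->m_children = children;
	/* m_preferred points just past the last child slot. */
	mm->m_preferred = (int *)(mm->m_child + children);
	return (mm);
}

int
main(void)
{
	sk_map_t *mm = sk_map_alloc(3);

	if (mm != NULL) {
		printf("map for %d children uses %zu bytes\n",
		    mm->m_children, sk_map_size(mm->m_children));
		free(mm);
	}
	return (0);
}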
*/ - load = vdev_queue_length(vd); - lastoffset = vdev_queue_lastoffset(vd); - - if (vd->vdev_nonrot) { - /* Non-rotating media. */ - if (lastoffset == zio_offset) - return (load + non_rotating_inc); - - /* - * Apply a seek penalty even for non-rotating devices as - * sequential I/O'a can be aggregated into fewer operations - * on the device, thus avoiding unnecessary per-command - * overhead and boosting performance. - */ - return (load + non_rotating_seek_inc); - } - - /* Rotating media I/O's which directly follow the last I/O. */ - if (lastoffset == zio_offset) - return (load + rotating_inc); - - /* - * Apply half the seek increment to I/O's within seek offset - * of the last I/O queued to this vdev as they should incure less - * of a seek increment. - */ - if (ABS(lastoffset - zio_offset) < rotating_seek_offset) - return (load + (rotating_seek_inc / 2)); - - /* Apply the full seek increment to all other I/O's. */ - return (load + rotating_seek_inc); -} - - -static mirror_map_t * -vdev_mirror_map_init(zio_t *zio) -{ - mirror_map_t *mm = NULL; - mirror_child_t *mc; - vdev_t *vd = zio->io_vd; - int c; - - if (vd == NULL) { - dva_t *dva = zio->io_bp->blk_dva; - spa_t *spa = zio->io_spa; - dva_t dva_copy[SPA_DVAS_PER_BP]; - - c = BP_GET_NDVAS(zio->io_bp); - - /* - * If we do not trust the pool config, some DVAs might be - * invalid or point to vdevs that do not exist. We skip them. - */ - if (!spa_trust_config(spa)) { - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - int j = 0; - for (int i = 0; i < c; i++) { - if (zfs_dva_valid(spa, &dva[i], zio->io_bp)) - dva_copy[j++] = dva[i]; - } - if (j == 0) { - zio->io_vsd = NULL; - zio->io_error = ENXIO; - return (NULL); - } - if (j < c) { - dva = dva_copy; - c = j; - } - } - - mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); - - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); - mc->mc_offset = DVA_GET_OFFSET(&dva[c]); - } - } else { - /* - * If we are resilvering, then we should handle scrub reads - * differently; we shouldn't issue them to the resilvering - * device because it might not have those blocks. - * - * We are resilvering iff: - * 1) We are a replacing vdev (ie our name is "replacing-1" or - * "spare-1" or something like that), and - * 2) The pool is currently being resilvered. - * - * We cannot simply check vd->vdev_resilver_txg, because it's - * not set in this path. - * - * Nor can we just check our vdev_ops; there are cases (such as - * when a user types "zpool replace pool odev spare_dev" and - * spare_dev is in the spare list, or when a spare device is - * automatically used to replace a DEGRADED device) when - * resilvering is complete but both the original vdev and the - * spare vdev remain in the pool. That behavior is intentional. - * It helps implement the policy that a spare should be - * automatically removed from the pool after the user replaces - * the device that originally failed. - * - * If a spa load is in progress, then spa_dsl_pool may be - * uninitialized. But we shouldn't be resilvering during a spa - * load anyway. 
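/*
 * A standalone sketch of the per-child load score computed by
 * vdev_mirror_load() above: start from the pending queue length and add an
 * increment that depends on whether the device rotates and how far the new
 * I/O sits from the previous one. The defaults mirror the sysctls above;
 * the queue length and offsets are hypothetical inputs.
 */
#include <stdint.h>
#include <stdio.h>

#define	SK_ROTATING_INC		0
#define	SK_ROTATING_SEEK_INC	5
#define	SK_ROTATING_SEEK_OFF	(1 * 1024 * 1024)
#define	SK_NONROT_INC		0
#define	SK_NONROT_SEEK_INC	1

static int
sk_mirror_load(int nonrot, int queue_len, uint64_t last_off, uint64_t off)
{
	uint64_t dist = (last_off > off) ? last_off - off : off - last_off;

	if (nonrot) {
		/* SSDs: no penalty for sequential I/O, a small one otherwise
		 * so sequential runs can still be aggregated. */
		return (queue_len +
		    (dist == 0 ? SK_NONROT_INC : SK_NONROT_SEEK_INC));
	}
	if (dist == 0)
		return (queue_len + SK_ROTATING_INC);
	if (dist < SK_ROTATING_SEEK_OFF)
		return (queue_len + SK_ROTATING_SEEK_INC / 2);	/* short seek */
	return (queue_len + SK_ROTATING_SEEK_INC);		/* full seek */
}

int
main(void)
{
	printf("hdd, short seek: %d\n", sk_mirror_load(0, 4, 1000, 5000));
	printf("ssd, same seek:  %d\n", sk_mirror_load(1, 4, 1000, 5000));
	return (0);
}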
- */ - boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops) && - spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && - dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); - mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, - B_FALSE); - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - mc->mc_vd = vd->vdev_child[c]; - mc->mc_offset = zio->io_offset; - } - } - - zio->io_vsd = mm; - zio->io_vsd_ops = &vdev_mirror_vsd_ops; - return (mm); -} - -static int -vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - int numerrors = 0; - int lasterror = 0; - - if (vd->vdev_children == 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - vdev_open_children(vd); - - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - if (cvd->vdev_open_error) { - lasterror = cvd->vdev_open_error; - numerrors++; - continue; - } - - *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); - } - - if (numerrors == vd->vdev_children) { - if (vdev_children_are_offline(vd)) - vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE; - else - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } - - return (0); -} - -static void -vdev_mirror_close(vdev_t *vd) -{ - for (int c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} - -static void -vdev_mirror_child_done(zio_t *zio) -{ - mirror_child_t *mc = zio->io_private; - - mc->mc_error = zio->io_error; - mc->mc_tried = 1; - mc->mc_skipped = 0; -} - -static void -vdev_mirror_scrub_done(zio_t *zio) -{ - mirror_child_t *mc = zio->io_private; - - if (zio->io_error == 0) { - zio_t *pio; - zio_link_t *zl = NULL; - - mutex_enter(&zio->io_lock); - while ((pio = zio_walk_parents(zio, &zl)) != NULL) { - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - abd_copy(pio->io_abd, zio->io_abd, pio->io_size); - mutex_exit(&pio->io_lock); - } - mutex_exit(&zio->io_lock); - } - abd_free(zio->io_abd); - - mc->mc_error = zio->io_error; - mc->mc_tried = 1; - mc->mc_skipped = 0; -} - -/* - * Check the other, lower-index DVAs to see if they're on the same - * vdev as the child we picked. If they are, use them since they - * are likely to have been allocated from the primary metaslab in - * use at the time, and hence are more likely to have locality with - * single-copy data. - */ -static int -vdev_mirror_dva_select(zio_t *zio, int p) -{ - dva_t *dva = zio->io_bp->blk_dva; - mirror_map_t *mm = zio->io_vsd; - int preferred; - int c; - - preferred = mm->mm_preferred[p]; - for (p-- ; p >= 0; p--) { - c = mm->mm_preferred[p]; - if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) - preferred = c; - } - return (preferred); -} - -static int -vdev_mirror_preferred_child_randomize(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - int p; - - if (mm->mm_root) { - p = spa_get_random(mm->mm_preferred_cnt); - return (vdev_mirror_dva_select(zio, p)); - } - - /* - * To ensure we don't always favour the first matching vdev, - * which could lead to wear leveling issues on SSD's, we - * use the I/O offset as a pseudo random seed into the vdevs - * which have the lowest load. 
- */ - p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; - return (mm->mm_preferred[p]); -} - -/* - * Try to find a vdev whose DTL doesn't contain the block we want to read - * prefering vdevs based on determined load. - * - * If we can't, try the read on any vdev we haven't already tried. - */ -static int -vdev_mirror_child_select(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - uint64_t txg = zio->io_txg; - int c, lowest_load; - - ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); - - lowest_load = INT_MAX; - mm->mm_preferred_cnt = 0; - for (c = 0; c < mm->mm_children; c++) { - mirror_child_t *mc; - - mc = &mm->mm_child[c]; - if (mc->mc_tried || mc->mc_skipped) - continue; - - if (!vdev_readable(mc->mc_vd)) { - mc->mc_error = SET_ERROR(ENXIO); - mc->mc_tried = 1; /* don't even try */ - mc->mc_skipped = 1; - continue; - } - - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { - mc->mc_error = SET_ERROR(ESTALE); - mc->mc_skipped = 1; - mc->mc_speculative = 1; - continue; - } - - mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); - if (mc->mc_load > lowest_load) - continue; - - if (mc->mc_load < lowest_load) { - lowest_load = mc->mc_load; - mm->mm_preferred_cnt = 0; - } - mm->mm_preferred[mm->mm_preferred_cnt] = c; - mm->mm_preferred_cnt++; - } - - if (mm->mm_preferred_cnt == 1) { - vdev_queue_register_lastoffset( - mm->mm_child[mm->mm_preferred[0]].mc_vd, zio); - return (mm->mm_preferred[0]); - } - - if (mm->mm_preferred_cnt > 1) { - int c = vdev_mirror_preferred_child_randomize(zio); - - vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio); - return (c); - } - - /* - * Every device is either missing or has this txg in its DTL. - * Look for any child we haven't already tried before giving up. - */ - for (c = 0; c < mm->mm_children; c++) { - if (!mm->mm_child[c].mc_tried) { - vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, - zio); - return (c); - } - } - - /* - * Every child failed. There's no place left to look. - */ - return (-1); -} - -static void -vdev_mirror_io_start(zio_t *zio) -{ - mirror_map_t *mm; - mirror_child_t *mc; - int c, children; - - mm = vdev_mirror_map_init(zio); - - if (mm == NULL) { - ASSERT(!spa_trust_config(zio->io_spa)); - ASSERT(zio->io_type == ZIO_TYPE_READ); - zio_execute(zio); - return; - } - - if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_bp != NULL && - (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering && - mm->mm_children > 1) { - /* - * For scrubbing reads (if we can verify the - * checksum here, as indicated by io_bp being - * non-NULL) we need to allocate a read buffer for - * each child and issue reads to all children. If - * any child succeeds, it will copy its data into - * zio->io_data in vdev_mirror_scrub_done. - */ - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, - abd_alloc_sametype(zio->io_abd, - zio->io_size), zio->io_size, - zio->io_type, zio->io_priority, 0, - vdev_mirror_scrub_done, mc)); - } - zio_execute(zio); - return; - } - /* - * For normal reads just pick one child. - */ - c = vdev_mirror_child_select(zio); - children = (c >= 0); - } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE || - zio->io_type == ZIO_TYPE_FREE); - - /* - * Writes and frees go to all children. 
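/*
 * A sketch of the tie-breaker used by vdev_mirror_preferred_child_randomize()
 * above for non-root reads: the I/O offset shifted by vdev_mirror_shift
 * (21 by default, i.e. 2 MiB regions) indexes the equally loaded children,
 * so reads spread deterministically instead of always hitting the first
 * child. Names and inputs are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define	SK_MIRROR_SHIFT	21

static int
sk_pick_preferred(uint64_t io_offset, const int *preferred, int cnt)
{
	return (preferred[(io_offset >> SK_MIRROR_SHIFT) % cnt]);
}

int
main(void)
{
	int preferred[2] = { 0, 2 };	/* children 0 and 2 tied on lowest load */

	for (uint64_t off = 0; off < (4ULL << SK_MIRROR_SHIFT);
	    off += 1ULL << SK_MIRROR_SHIFT)
		printf("offset %llu -> child %d\n", (unsigned long long)off,
		    sk_pick_preferred(off, preferred, 2));
	return (0);
}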
- */ - c = 0; - children = mm->mm_children; - } - - while (children--) { - mc = &mm->mm_child[c]; - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, - zio->io_type, zio->io_priority, 0, - vdev_mirror_child_done, mc)); - c++; - } - - zio_execute(zio); -} - -static int -vdev_mirror_worst_error(mirror_map_t *mm) -{ - int error[2] = { 0, 0 }; - - for (int c = 0; c < mm->mm_children; c++) { - mirror_child_t *mc = &mm->mm_child[c]; - int s = mc->mc_speculative; - error[s] = zio_worst_error(error[s], mc->mc_error); - } - - return (error[0] ? error[0] : error[1]); -} - -static void -vdev_mirror_io_done(zio_t *zio) -{ - mirror_map_t *mm = zio->io_vsd; - mirror_child_t *mc; - int c; - int good_copies = 0; - int unexpected_errors = 0; - - if (mm == NULL) - return; - - for (c = 0; c < mm->mm_children; c++) { - mc = &mm->mm_child[c]; - - if (mc->mc_error) { - if (!mc->mc_skipped) - unexpected_errors++; - } else if (mc->mc_tried) { - good_copies++; - } - } - - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * XXX -- for now, treat partial writes as success. - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. - */ - /* XXPOLICY */ - if (good_copies != mm->mm_children) { - /* - * Always require at least one good copy. - * - * For ditto blocks (io_vd == NULL), require - * all copies to be good. - * - * XXX -- for replacing vdevs, there's no great answer. - * If the old device is really dead, we may not even - * be able to access it -- so we only want to - * require good writes to the new device. But if - * the new device turns out to be flaky, we want - * to be able to detach it -- which requires all - * writes to the old device to have succeeded. - */ - if (good_copies == 0 || zio->io_vd == NULL) - zio->io_error = vdev_mirror_worst_error(mm); - } - return; - } else if (zio->io_type == ZIO_TYPE_FREE) { - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); - - /* - * If we don't have a good copy yet, keep trying other children. - */ - /* XXPOLICY */ - if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { - ASSERT(c >= 0 && c < mm->mm_children); - mc = &mm->mm_child[c]; - zio_vdev_io_redone(zio); - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, - ZIO_TYPE_READ, zio->io_priority, 0, - vdev_mirror_child_done, mc)); - return; - } - - /* XXPOLICY */ - if (good_copies == 0) { - zio->io_error = vdev_mirror_worst_error(mm); - ASSERT(zio->io_error != 0); - } - - if (good_copies && spa_writeable(zio->io_spa) && - (unexpected_errors || - (zio->io_flags & ZIO_FLAG_RESILVER) || - ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) { - /* - * Use the good data we have in hand to repair damaged children. - */ - for (c = 0; c < mm->mm_children; c++) { - /* - * Don't rewrite known good children. - * Not only is it unnecessary, it could - * actually be harmful: if the system lost - * power while rewriting the only good copy, - * there would be no good copies left! - */ - mc = &mm->mm_child[c]; - - if (mc->mc_error == 0) { - if (mc->mc_tried) - continue; - /* - * We didn't try this child. We need to - * repair it if: - * 1. it's a scrub (in which case we have - * tried everything that was healthy) - * - or - - * 2. 
it's an indirect vdev (in which case - * it could point to any other vdev, which - * might have a bad DTL) - * - or - - * 3. the DTL indicates that this data is - * missing from this vdev - */ - if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - mc->mc_vd->vdev_ops != &vdev_indirect_ops && - !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, - zio->io_txg, 1)) - continue; - mc->mc_error = SET_ERROR(ESTALE); - } - - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, - zio->io_abd, zio->io_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | (unexpected_errors ? - ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); - } - } -} - -static void -vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (faulted == vd->vdev_children) { - if (vdev_children_are_offline(vd)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE, - VDEV_AUX_CHILDREN_OFFLINE); - } else { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - } - } else if (degraded + faulted != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - } else { - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); - } -} - -vdev_ops_t vdev_mirror_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_MIRROR, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; - -vdev_ops_t vdev_replacing_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_REPLACING, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; - -vdev_ops_t vdev_spare_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_SPARE, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - */ - -/* - * The 'missing' vdev is a special vdev type used only during import. It - * signifies a placeholder in the root vdev for some vdev that we know is - * missing. 
We pass it down to the kernel to allow the rest of the - * configuration to parsed and an attempt made to open all available devices. - * Because its GUID is always 0, we know that the guid sum will mismatch and we - * won't be able to open the pool anyway. - */ - -#include -#include -#include -#include -#include - -/* ARGSUSED */ -static int -vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - /* - * Really this should just fail. But then the root vdev will be in the - * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is - * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we - * will fail the GUID sum check before ever trying to open the pool. - */ - *psize = 0; - *max_psize = 0; - *logical_ashift = 0; - *physical_ashift = 0; - return (0); -} - -/* ARGSUSED */ -static void -vdev_missing_close(vdev_t *vd) -{ -} - -/* ARGSUSED */ -static void -vdev_missing_io_start(zio_t *zio) -{ - zio->io_error = SET_ERROR(ENOTSUP); - zio_execute(zio); -} - -/* ARGSUSED */ -static void -vdev_missing_io_done(zio_t *zio) -{ -} - -vdev_ops_t vdev_missing_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_MISSING, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; - -vdev_ops_t vdev_hole_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_HOLE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ -}; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ /dev/null @@ -1,1047 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ZFS I/O Scheduler - * --------------- - * - * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The - * I/O scheduler determines when and in what order those operations are - * issued. The I/O scheduler divides operations into six I/O classes - * prioritized in the following order: sync read, sync write, async read, - * async write, scrub/resilver and trim. 
Each queue defines the minimum and - * maximum number of concurrent operations that may be issued to the device. - * In addition, the device has an aggregate maximum. Note that the sum of the - * per-queue minimums must not exceed the aggregate maximum, and if the - * aggregate maximum is equal to or greater than the sum of the per-queue - * maximums, the per-queue minimum has no effect. - * - * For many physical devices, throughput increases with the number of - * concurrent operations, but latency typically suffers. Further, physical - * devices typically have a limit at which more concurrent operations have no - * effect on throughput or can actually cause it to decrease. - * - * The scheduler selects the next operation to issue by first looking for an - * I/O class whose minimum has not been satisfied. Once all are satisfied and - * the aggregate maximum has not been hit, the scheduler looks for classes - * whose maximum has not been satisfied. Iteration through the I/O classes is - * done in the order specified above. No further operations are issued if the - * aggregate maximum number of concurrent operations has been hit or if there - * are no operations queued for an I/O class that has not hit its maximum. - * Every time an I/O is queued or an operation completes, the I/O scheduler - * looks for new operations to issue. - * - * All I/O classes have a fixed maximum number of outstanding operations - * except for the async write class. Asynchronous writes represent the data - * that is committed to stable storage during the syncing stage for - * transaction groups (see txg.c). Transaction groups enter the syncing state - * periodically so the number of queued async writes will quickly burst up and - * then bleed down to zero. Rather than servicing them as quickly as possible, - * the I/O scheduler changes the maximum number of active async write I/Os - * according to the amount of dirty data in the pool (see dsl_pool.c). Since - * both throughput and latency typically increase with the number of - * concurrent operations issued to physical devices, reducing the burstiness - * in the number of concurrent operations also stabilizes the response time of - * operations from other -- and in particular synchronous -- queues. In broad - * strokes, the I/O scheduler will issue more concurrent operations from the - * async write queue as there's more dirty data in the pool. - * - * Async Writes - * - * The number of concurrent operations issued for the async write I/O class - * follows a piece-wise linear function defined by a few adjustable points. - * - * | o---------| <-- zfs_vdev_async_write_max_active - * ^ | /^ | - * | | / | | - * active | / | | - * I/O | / | | - * count | / | | - * | / | | - * |------------o | | <-- zfs_vdev_async_write_min_active - * 0|____________^______|_________| - * 0% | | 100% of zfs_dirty_data_max - * | | - * | `-- zfs_vdev_async_write_active_max_dirty_percent - * `--------- zfs_vdev_async_write_active_min_dirty_percent - * - * Until the amount of dirty data exceeds a minimum percentage of the dirty - * data allowed in the pool, the I/O scheduler will limit the number of - * concurrent operations to the minimum. As that threshold is crossed, the - * number of concurrent operations issued increases linearly to the maximum at - * the specified maximum percentage of the dirty data allowed in the pool. 
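(Aside for readers of this hunk: the scaling described in the paragraph above is easy to check with a standalone sketch. The helper below is not part of the patch; the function name async_write_active and the concrete numbers in main() are illustrative assumptions, while the behaviour follows the zfs_vdev_async_write_* tunables declared later in this file.)

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative sketch only: the piece-wise linear scaling described above.
 * Dirty data below the minimum threshold clamps to min_active, above the
 * maximum threshold clamps to max_active, and everything in between is
 * linearly interpolated.
 */
static uint32_t
async_write_active(uint64_t dirty, uint64_t dirty_max, int min_pct,
    int max_pct, uint32_t min_active, uint32_t max_active)
{
	uint64_t min_bytes = dirty_max * min_pct / 100;
	uint64_t max_bytes = dirty_max * max_pct / 100;

	if (dirty <= min_bytes)
		return (min_active);
	if (dirty >= max_bytes)
		return (max_active);
	return ((dirty - min_bytes) * (max_active - min_active) /
	    (max_bytes - min_bytes) + min_active);
}

int
main(void)
{
	/* Assumed example: dirty data at 50% of zfs_dirty_data_max. */
	printf("%u\n", async_write_active(500, 1000, 30, 60, 1, 10));
	return (0);
}

With the default-like values used here (min_active 1, max_active 10, thresholds 30% and 60%), dirty data two thirds of the way between the thresholds yields 1 + (2/3) * 9 = 7 concurrent async writes, which is what the sketch prints.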
- * - * Ideally, the amount of dirty data on a busy pool will stay in the sloped - * part of the function between zfs_vdev_async_write_active_min_dirty_percent - * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the - * maximum percentage, this indicates that the rate of incoming data is - * greater than the rate that the backend storage can handle. In this case, we - * must further throttle incoming writes (see dmu_tx_delay() for details). - */ - -/* - * The maximum number of I/Os active to each device. Ideally, this will be >= - * the sum of each queue's max_active. It must be at least the sum of each - * queue's min_active. - */ -uint32_t zfs_vdev_max_active = 1000; - -/* - * Per-queue limits on the number of I/Os active to each device. If the - * sum of the queue's max_active is < zfs_vdev_max_active, then the - * min_active comes into play. We will send min_active from each queue, - * and then select from queues in the order defined by zio_priority_t. - * - * In general, smaller max_active's will lead to lower latency of synchronous - * operations. Larger max_active's may lead to higher overall throughput, - * depending on underlying storage. - * - * The ratio of the queues' max_actives determines the balance of performance - * between reads, writes, and scrubs. E.g., increasing - * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete - * more quickly, but reads and writes to have higher latency and lower - * throughput. - */ -uint32_t zfs_vdev_sync_read_min_active = 10; -uint32_t zfs_vdev_sync_read_max_active = 10; -uint32_t zfs_vdev_sync_write_min_active = 10; -uint32_t zfs_vdev_sync_write_max_active = 10; -uint32_t zfs_vdev_async_read_min_active = 1; -uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 1; -uint32_t zfs_vdev_async_write_max_active = 10; -uint32_t zfs_vdev_scrub_min_active = 1; -uint32_t zfs_vdev_scrub_max_active = 2; -uint32_t zfs_vdev_trim_min_active = 1; -/* - * TRIM max active is large in comparison to the other values due to the fact - * that TRIM IOs are coalesced at the device layer. This value is set such - * that a typical SSD can process the queued IOs in a single request. - */ -uint32_t zfs_vdev_trim_max_active = 64; -uint32_t zfs_vdev_removal_min_active = 1; -uint32_t zfs_vdev_removal_max_active = 2; -uint32_t zfs_vdev_initializing_min_active = 1; -uint32_t zfs_vdev_initializing_max_active = 1; - - -/* - * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent - * dirty data, use zfs_vdev_async_write_min_active. When it has more than - * zfs_vdev_async_write_active_max_dirty_percent, use - * zfs_vdev_async_write_max_active. The value is linearly interpolated - * between min and max. - */ -int zfs_vdev_async_write_active_min_dirty_percent = 30; -int zfs_vdev_async_write_active_max_dirty_percent = 60; - -/* - * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. - * For read I/Os, we also aggregate across small adjacency gaps; for writes - * we include spans of optional I/Os to aid aggregation at the disk even when - * they aren't able to help us aggregate at this level. - */ -int zfs_vdev_aggregation_limit = 1 << 20; -int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; -int zfs_vdev_read_gap_limit = 32 << 10; -int zfs_vdev_write_gap_limit = 4 << 10; - -/* - * Define the queue depth percentage for each top-level. 
This percentage is - * used in conjunction with zfs_vdev_async_max_active to determine how many - * allocations a specific top-level vdev should handle. Once the queue depth - * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 - * then allocator will stop allocating blocks on that top-level device. - * The default kernel setting is 1000% which will yield 100 allocations per - * device. For userland testing, the default setting is 300% which equates - * to 30 allocations per device. - */ -#ifdef _KERNEL -int zfs_vdev_queue_depth_pct = 1000; -#else -int zfs_vdev_queue_depth_pct = 300; -#endif - -/* - * When performing allocations for a given metaslab, we want to make sure that - * there are enough IOs to aggregate together to improve throughput. We want to - * ensure that there are at least 128k worth of IOs that can be aggregated, and - * we assume that the average allocation size is 4k, so we need the queue depth - * to be 32 per allocator to get good aggregation of sequential writes. - */ -int zfs_vdev_def_queue_depth = 32; - -#ifdef __FreeBSD__ -#ifdef _KERNEL -SYSCTL_DECL(_vfs_zfs_vdev); - -static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS); -SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), - sysctl_zfs_async_write_active_min_dirty_percent, "I", - "Percentage of async write dirty data below which " - "async_write_min_active is used."); - -static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS); -SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent, - CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), - sysctl_zfs_async_write_active_max_dirty_percent, "I", - "Percentage of async write dirty data above which " - "async_write_max_active is used."); - -SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN, - &zfs_vdev_max_active, 0, - "The maximum number of I/Os of all types active for each device."); - -#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ -SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ - &zfs_vdev_ ## name ## _min_active, 0, \ - "Initial number of I/O requests of type " #name \ - " active for each device"); - -#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ -SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\ - &zfs_vdev_ ## name ## _max_active, 0, \ - "Maximum number of I/O requests of type " #name \ - " active for each device"); - -ZFS_VDEV_QUEUE_KNOB_MIN(sync_read); -ZFS_VDEV_QUEUE_KNOB_MAX(sync_read); -ZFS_VDEV_QUEUE_KNOB_MIN(sync_write); -ZFS_VDEV_QUEUE_KNOB_MAX(sync_write); -ZFS_VDEV_QUEUE_KNOB_MIN(async_read); -ZFS_VDEV_QUEUE_KNOB_MAX(async_read); -ZFS_VDEV_QUEUE_KNOB_MIN(async_write); -ZFS_VDEV_QUEUE_KNOB_MAX(async_write); -ZFS_VDEV_QUEUE_KNOB_MIN(scrub); -ZFS_VDEV_QUEUE_KNOB_MAX(scrub); -ZFS_VDEV_QUEUE_KNOB_MIN(trim); -ZFS_VDEV_QUEUE_KNOB_MAX(trim); -ZFS_VDEV_QUEUE_KNOB_MIN(removal); -ZFS_VDEV_QUEUE_KNOB_MAX(removal); -ZFS_VDEV_QUEUE_KNOB_MIN(initializing); -ZFS_VDEV_QUEUE_KNOB_MAX(initializing); - -#undef ZFS_VDEV_QUEUE_KNOB - -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN, - &zfs_vdev_aggregation_limit, 0, - "I/O requests are aggregated up to this size"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit_non_rotating, CTLFLAG_RWTUN, - &zfs_vdev_aggregation_limit_non_rotating, 0, - "I/O requests are aggregated up to this size for non-rotating media"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, 
read_gap_limit, CTLFLAG_RWTUN, - &zfs_vdev_read_gap_limit, 0, - "Acceptable gap between two reads being aggregated"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN, - &zfs_vdev_write_gap_limit, 0, - "Acceptable gap between two writes being aggregated"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN, - &zfs_vdev_queue_depth_pct, 0, - "Queue depth percentage for each top-level"); -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, - &zfs_vdev_def_queue_depth, 0, - "Default queue depth for each allocator"); - -static int -sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS) -{ - int val, err; - - val = zfs_vdev_async_write_active_min_dirty_percent; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < 0 || val > 100 || - val >= zfs_vdev_async_write_active_max_dirty_percent) - return (EINVAL); - - zfs_vdev_async_write_active_min_dirty_percent = val; - - return (0); -} - -static int -sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) -{ - int val, err; - - val = zfs_vdev_async_write_active_max_dirty_percent; - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == NULL) - return (err); - - if (val < 0 || val > 100 || - val <= zfs_vdev_async_write_active_min_dirty_percent) - return (EINVAL); - - zfs_vdev_async_write_active_max_dirty_percent = val; - - return (0); -} -#endif -#endif - -int -vdev_queue_offset_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = (const zio_t *)x1; - const zio_t *z2 = (const zio_t *)x2; - - int cmp = AVL_CMP(z1->io_offset, z2->io_offset); - - if (likely(cmp)) - return (cmp); - - return (AVL_PCMP(z1, z2)); -} - -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (NULL); -} - -int -vdev_queue_timestamp_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = (const zio_t *)x1; - const zio_t *z2 = (const zio_t *)x2; - - int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp); - - if (likely(cmp)) - return (cmp); - - return (AVL_PCMP(z1, z2)); -} - -void -vdev_queue_init(vdev_t *vd) -{ - vdev_queue_t *vq = &vd->vdev_queue; - - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); - vq->vq_vdev = vd; - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. 
- */ - if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) - compfn = vdev_queue_timestamp_compare; - else - compfn = vdev_queue_offset_compare; - - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - } - - vq->vq_lastoffset = 0; -} - -void -vdev_queue_fini(vdev_t *vd) -{ - vdev_queue_t *vq = &vd->vdev_queue; - - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - - mutex_destroy(&vq->vq_lock); -} - -static void -vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) -{ - spa_t *spa = zio->io_spa; - avl_tree_t *qtt; - - ASSERT(MUTEX_HELD(&vq->vq_lock)); - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - qtt = vdev_queue_type_tree(vq, zio->io_type); - if (qtt) - avl_add(qtt, zio); - -#ifdef illumos - mutex_enter(&spa->spa_iokstat_lock); - spa->spa_queue_stats[zio->io_priority].spa_queued++; - if (spa->spa_iokstat != NULL) - kstat_waitq_enter(spa->spa_iokstat->ks_data); - mutex_exit(&spa->spa_iokstat_lock); -#endif -} - -static void -vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) -{ - spa_t *spa = zio->io_spa; - avl_tree_t *qtt; - - ASSERT(MUTEX_HELD(&vq->vq_lock)); - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - qtt = vdev_queue_type_tree(vq, zio->io_type); - if (qtt) - avl_remove(qtt, zio); - -#ifdef illumos - mutex_enter(&spa->spa_iokstat_lock); - ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0); - spa->spa_queue_stats[zio->io_priority].spa_queued--; - if (spa->spa_iokstat != NULL) - kstat_waitq_exit(spa->spa_iokstat->ks_data); - mutex_exit(&spa->spa_iokstat_lock); -#endif -} - -static void -vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) -{ - spa_t *spa = zio->io_spa; - ASSERT(MUTEX_HELD(&vq->vq_lock)); - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; - avl_add(&vq->vq_active_tree, zio); - -#ifdef illumos - mutex_enter(&spa->spa_iokstat_lock); - spa->spa_queue_stats[zio->io_priority].spa_active++; - if (spa->spa_iokstat != NULL) - kstat_runq_enter(spa->spa_iokstat->ks_data); - mutex_exit(&spa->spa_iokstat_lock); -#endif -} - -static void -vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) -{ - spa_t *spa = zio->io_spa; - ASSERT(MUTEX_HELD(&vq->vq_lock)); - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; - avl_remove(&vq->vq_active_tree, zio); - -#ifdef illumos - mutex_enter(&spa->spa_iokstat_lock); - ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0); - spa->spa_queue_stats[zio->io_priority].spa_active--; - if (spa->spa_iokstat != NULL) { - kstat_io_t *ksio = spa->spa_iokstat->ks_data; - - kstat_runq_exit(spa->spa_iokstat->ks_data); - if (zio->io_type == ZIO_TYPE_READ) { - ksio->reads++; - ksio->nread += zio->io_size; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - ksio->writes++; - ksio->nwritten += zio->io_size; - } - } - mutex_exit(&spa->spa_iokstat_lock); -#endif -} - -static void -vdev_queue_agg_io_done(zio_t *aio) -{ - if (aio->io_type == ZIO_TYPE_READ) { - zio_t *pio; - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(aio, &zl)) != NULL) { - abd_copy_off(pio->io_abd, aio->io_abd, - 0, pio->io_offset 
- aio->io_offset, pio->io_size); - } - } - - abd_free(aio->io_abd); -} - -static int -vdev_queue_class_min_active(zio_priority_t p) -{ - switch (p) { - case ZIO_PRIORITY_SYNC_READ: - return (zfs_vdev_sync_read_min_active); - case ZIO_PRIORITY_SYNC_WRITE: - return (zfs_vdev_sync_write_min_active); - case ZIO_PRIORITY_ASYNC_READ: - return (zfs_vdev_async_read_min_active); - case ZIO_PRIORITY_ASYNC_WRITE: - return (zfs_vdev_async_write_min_active); - case ZIO_PRIORITY_SCRUB: - return (zfs_vdev_scrub_min_active); - case ZIO_PRIORITY_TRIM: - return (zfs_vdev_trim_min_active); - case ZIO_PRIORITY_REMOVAL: - return (zfs_vdev_removal_min_active); - case ZIO_PRIORITY_INITIALIZING: - return (zfs_vdev_initializing_min_active); - default: - panic("invalid priority %u", p); - return (0); - } -} - -static __noinline int -vdev_queue_max_async_writes(spa_t *spa) -{ - int writes; - uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; - uint64_t min_bytes = zfs_dirty_data_max * - zfs_vdev_async_write_active_min_dirty_percent / 100; - uint64_t max_bytes = zfs_dirty_data_max * - zfs_vdev_async_write_active_max_dirty_percent / 100; - - /* - * Sync tasks correspond to interactive user actions. To reduce the - * execution time of those actions we push data out as fast as possible. - */ - if (spa_has_pending_synctask(spa)) { - return (zfs_vdev_async_write_max_active); - } - - if (dirty < min_bytes) - return (zfs_vdev_async_write_min_active); - if (dirty > max_bytes) - return (zfs_vdev_async_write_max_active); - - /* - * linear interpolation: - * slope = (max_writes - min_writes) / (max_bytes - min_bytes) - * move right by min_bytes - * move up by min_writes - */ - writes = (dirty - min_bytes) * - (zfs_vdev_async_write_max_active - - zfs_vdev_async_write_min_active) / - (max_bytes - min_bytes) + - zfs_vdev_async_write_min_active; - ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); - ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); - return (writes); -} - -static int -vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) -{ - switch (p) { - case ZIO_PRIORITY_SYNC_READ: - return (zfs_vdev_sync_read_max_active); - case ZIO_PRIORITY_SYNC_WRITE: - return (zfs_vdev_sync_write_max_active); - case ZIO_PRIORITY_ASYNC_READ: - return (zfs_vdev_async_read_max_active); - case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); - case ZIO_PRIORITY_SCRUB: - return (zfs_vdev_scrub_max_active); - case ZIO_PRIORITY_TRIM: - return (zfs_vdev_trim_max_active); - case ZIO_PRIORITY_REMOVAL: - return (zfs_vdev_removal_max_active); - case ZIO_PRIORITY_INITIALIZING: - return (zfs_vdev_initializing_max_active); - default: - panic("invalid priority %u", p); - return (0); - } -} - -/* - * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if - * there is no eligible class. - */ -static zio_priority_t -vdev_queue_class_to_issue(vdev_queue_t *vq) -{ - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p; - - ASSERT(MUTEX_HELD(&vq->vq_lock)); - - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) - return (ZIO_PRIORITY_NUM_QUEUEABLE); - - /* find a queue that has not reached its minimum # outstanding i/os */ - for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(p)) - return (p); - } - - /* - * If we haven't found a queue, look for one that hasn't reached its - * maximum # outstanding i/os. 
- */ - for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, p)) - return (p); - } - - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); -} - -/* - * Compute the range spanned by two i/os, which is the endpoint of the last - * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). - * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); - * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. - */ -#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) -#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) - -static zio_t * -vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) -{ - zio_t *first, *last, *aio, *dio, *mandatory, *nio; - zio_link_t *zl = NULL; - uint64_t maxgap = 0; - uint64_t size; - uint64_t limit; - int maxblocksize; - boolean_t stretch; - avl_tree_t *t; - enum zio_flag flags; - - ASSERT(MUTEX_HELD(&vq->vq_lock)); - - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); - if (vq->vq_vdev->vdev_nonrot) - limit = zfs_vdev_aggregation_limit_non_rotating; - else - limit = zfs_vdev_aggregation_limit; - limit = MAX(MIN(limit, maxblocksize), 0); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - first = last = zio; - - if (zio->io_type == ZIO_TYPE_READ) - maxgap = zfs_vdev_read_gap_limit; - - /* - * We can aggregate I/Os that are sufficiently adjacent and of - * the same flavor, as expressed by the AGG_INHERIT flags. - * The latter requirement is necessary so that certain - * attributes of the I/O, such as whether it's a normal I/O - * or a scrub/resilver, can be preserved in the aggregate. - * We can include optional I/Os, but don't allow them - * to begin a range as they add no benefit in that situation. - */ - - /* - * We keep track of the last non-optional I/O. - */ - mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; - - /* - * Walk backwards through sufficiently contiguous I/Os - * recording the last non-optional I/O. - */ - flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; - t = vdev_queue_type_tree(vq, zio->io_type); - while (t != NULL && (dio = AVL_PREV(t, first)) != NULL && - (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(dio, last) <= limit && - IO_GAP(dio, first) <= maxgap && - dio->io_type == zio->io_type) { - first = dio; - if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) - mandatory = first; - } - - /* - * Skip any initial optional I/Os. - */ - while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { - first = AVL_NEXT(t, first); - ASSERT(first != NULL); - } - - /* - * Walk forward through sufficiently contiguous I/Os. - * The aggregation limit does not apply to optional i/os, so that - * we can issue contiguous writes even if they are larger than the - * aggregation limit. - */ - while ((dio = AVL_NEXT(t, last)) != NULL && - (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - (IO_SPAN(first, dio) <= limit || - (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && - IO_GAP(last, dio) <= maxgap && - dio->io_type == zio->io_type) { - last = dio; - if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) - mandatory = last; - } - - /* - * Now that we've established the range of the I/O aggregation - * we must decide what to do with trailing optional I/Os. - * For reads, there's nothing to do. 
While we are unable to - * aggregate further, it's possible that a trailing optional - * I/O would allow the underlying device to aggregate with - * subsequent I/Os. We must therefore determine if the next - * non-optional I/O is close enough to make aggregation - * worthwhile. - */ - stretch = B_FALSE; - if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { - zio_t *nio = last; - while ((dio = AVL_NEXT(t, nio)) != NULL && - IO_GAP(nio, dio) == 0 && - IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { - nio = dio; - if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { - stretch = B_TRUE; - break; - } - } - } - - if (stretch) { - /* - * We are going to include an optional io in our aggregated - * span, thus closing the write gap. Only mandatory i/os can - * start aggregated spans, so make sure that the next i/o - * after our span is mandatory. - */ - dio = AVL_NEXT(t, last); - dio->io_flags &= ~ZIO_FLAG_OPTIONAL; - } else { - /* do not include the optional i/o */ - while (last != mandatory && last != first) { - ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); - last = AVL_PREV(t, last); - ASSERT(last != NULL); - } - } - - if (first == last) - return (NULL); - - size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); - - aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - abd_alloc_for_io(size, B_TRUE), size, first->io_type, - zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); - aio->io_timestamp = first->io_timestamp; - - nio = first; - do { - dio = nio; - nio = AVL_NEXT(t, dio); - zio_add_child(dio, aio); - vdev_queue_io_remove(vq, dio); - } while (dio != last); - - /* - * We need to drop the vdev queue's lock during zio_execute() to - * avoid a deadlock that we could encounter due to lock order - * reversal between vq_lock and io_lock in zio_change_priority(). - * Use the dropped lock to do memory copy without congestion. - */ - mutex_exit(&vq->vq_lock); - while ((dio = zio_walk_parents(aio, &zl)) != NULL) { - ASSERT3U(dio->io_type, ==, aio->io_type); - - if (dio->io_flags & ZIO_FLAG_NODATA) { - ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - abd_zero_off(aio->io_abd, - dio->io_offset - aio->io_offset, dio->io_size); - } else if (dio->io_type == ZIO_TYPE_WRITE) { - abd_copy_off(aio->io_abd, dio->io_abd, - dio->io_offset - aio->io_offset, 0, dio->io_size); - } - - zio_vdev_io_bypass(dio); - zio_execute(dio); - } - mutex_enter(&vq->vq_lock); - - return (aio); -} - -static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq) -{ - zio_t *zio, *aio; - zio_priority_t p; - avl_index_t idx; - avl_tree_t *tree; - zio_t search; - -again: - ASSERT(MUTEX_HELD(&vq->vq_lock)); - - p = vdev_queue_class_to_issue(vq); - - if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { - /* No eligible queued i/os */ - return (NULL); - } - - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync), issue the i/o with the lowest timestamp. - */ - tree = vdev_queue_class_tree(vq, p); - search.io_timestamp = 0; - search.io_offset = vq->vq_last_offset + 1; - VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); - ASSERT3U(zio->io_priority, ==, p); - - aio = vdev_queue_aggregate(vq, zio); - if (aio != NULL) - zio = aio; - else - vdev_queue_io_remove(vq, zio); - - /* - * If the I/O is or was optional and therefore has no data, we need to - * simply discard it. 
We need to drop the vdev queue's lock to avoid a - * deadlock that we could encounter since this I/O will complete - * immediately. - */ - if (zio->io_flags & ZIO_FLAG_NODATA) { - mutex_exit(&vq->vq_lock); - zio_vdev_io_bypass(zio); - zio_execute(zio); - mutex_enter(&vq->vq_lock); - goto again; - } - - vdev_queue_pending_add(vq, zio); - vq->vq_last_offset = zio->io_offset; - - return (zio); -} - -zio_t * -vdev_queue_io(zio_t *zio) -{ - vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; - - if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) - return (zio); - - /* - * Children i/os inherent their parent's priority, which might - * not match the child's i/o type. Fix it up here. - */ - if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && - zio->io_priority != ZIO_PRIORITY_ASYNC_READ && - zio->io_priority != ZIO_PRIORITY_SCRUB && - zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) - zio->io_priority = ZIO_PRIORITY_ASYNC_READ; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_REMOVAL && - zio->io_priority != ZIO_PRIORITY_INITIALIZING) - zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; - } else { - ASSERT(zio->io_type == ZIO_TYPE_FREE); - zio->io_priority = ZIO_PRIORITY_TRIM; - } - - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; - - mutex_enter(&vq->vq_lock); - zio->io_timestamp = gethrtime(); - vdev_queue_io_add(vq, zio); - nio = vdev_queue_io_to_issue(vq); - mutex_exit(&vq->vq_lock); - - if (nio == NULL) - return (NULL); - - if (nio->io_done == vdev_queue_agg_io_done) { - zio_nowait(nio); - return (NULL); - } - - return (nio); -} - -void -vdev_queue_io_done(zio_t *zio) -{ - vdev_queue_t *vq = &zio->io_vd->vdev_queue; - zio_t *nio; - - mutex_enter(&vq->vq_lock); - - vdev_queue_pending_remove(vq, zio); - - vq->vq_io_complete_ts = gethrtime(); - - while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { - mutex_exit(&vq->vq_lock); - if (nio->io_done == vdev_queue_agg_io_done) { - zio_nowait(nio); - } else { - zio_vdev_io_reissue(nio); - zio_execute(nio); - } - mutex_enter(&vq->vq_lock); - } - - mutex_exit(&vq->vq_lock); -} - -void -vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) -{ - vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; - - /* - * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio - * code to issue IOs without adding them to the vdev queue. In this - * case, the zio is already going to be issued as quickly as possible - * and so it doesn't need any reprioitization to help. - */ - if (zio->io_priority == ZIO_PRIORITY_NOW) - return; - - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - - if (zio->io_type == ZIO_TYPE_READ) { - if (priority != ZIO_PRIORITY_SYNC_READ && - priority != ZIO_PRIORITY_ASYNC_READ && - priority != ZIO_PRIORITY_SCRUB) - priority = ZIO_PRIORITY_ASYNC_READ; - } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - if (priority != ZIO_PRIORITY_SYNC_WRITE && - priority != ZIO_PRIORITY_ASYNC_WRITE) - priority = ZIO_PRIORITY_ASYNC_WRITE; - } - - mutex_enter(&vq->vq_lock); - - /* - * If the zio is in none of the queues we can simply change - * the priority. If the zio is waiting to be submitted we must - * remove it from the queue and re-insert it with the new priority. 
- * Otherwise, the zio is currently active and we cannot change its - * priority. - */ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { - zio->io_priority = priority; - } - - mutex_exit(&vq->vq_lock); -} - -/* - * As these three methods are only used for load calculations we're not concerned - * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex - * use here, instead we prefer to keep it lock free for performance. - */ -int -vdev_queue_length(vdev_t *vd) -{ - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); -} - -uint64_t -vdev_queue_lastoffset(vdev_t *vd) -{ - return (vd->vdev_queue.vq_lastoffset); -} - -void -vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio) -{ - vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size; -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ /dev/null @@ -1,2707 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#ifdef illumos -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef ZFS_DEBUG -#include /* vdev_xlate testing */ -#endif - -/* - * Virtual device vector for RAID-Z. - * - * This vdev supports single, double, and triple parity. For single parity, - * we use a simple XOR of all the data columns. For double or triple parity, - * we use a special case of Reed-Solomon coding. This extends the - * technique described in "The mathematics of RAID-6" by H. Peter Anvin by - * drawing on the system described in "A Tutorial on Reed-Solomon Coding for - * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the - * former is also based. The latter is designed to provide higher performance - * for writes. - * - * Note that the Plank paper claimed to support arbitrary N+M, but was then - * amended six years later identifying a critical flaw that invalidates its - * claims. Nevertheless, the technique can be adapted to work for up to - * triple parity. For additional parity, the amendment "Note: Correction to - * the 1997 Tutorial on Reed-Solomon Coding" by James S. 
Plank and Ying Ding - * is viable, but the additional complexity means that write performance will - * suffer. - * - * All of the methods above operate on a Galois field, defined over the - * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements - * can be expressed with a single byte. Briefly, the operations on the - * field are defined as follows: - * - * o addition (+) is represented by a bitwise XOR - * o subtraction (-) is therefore identical to addition: A + B = A - B - * o multiplication of A by 2 is defined by the following bitwise expression: - * - * (A * 2)_7 = A_6 - * (A * 2)_6 = A_5 - * (A * 2)_5 = A_4 - * (A * 2)_4 = A_3 + A_7 - * (A * 2)_3 = A_2 + A_7 - * (A * 2)_2 = A_1 + A_7 - * (A * 2)_1 = A_0 - * (A * 2)_0 = A_7 - * - * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). - * As an aside, this multiplication is derived from the error correcting - * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. - * - * Observe that any number in the field (except for 0) can be expressed as a - * power of 2 -- a generator for the field. We store a table of the powers of - * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can - * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather - * than field addition). The inverse of a field element A (A^-1) is therefore - * A ^ (255 - 1) = A^254. - * - * The up-to-three parity columns, P, Q, R over several data columns, - * D_0, ... D_n-1, can be expressed by field operations: - * - * P = D_0 + D_1 + ... + D_n-2 + D_n-1 - * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 - * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 - * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 - * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 - * - * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival - * XOR operation, and 2 and 4 can be computed quickly and generate linearly- - * independent coefficients. (There are no additional coefficients that have - * this property which is why the uncorrected Plank method breaks down.) - * - * See the reconstruction code below for how P, Q and R can used individually - * or in concert to recover missing data columns. - */ - -typedef struct raidz_col { - uint64_t rc_devidx; /* child device index for I/O */ - uint64_t rc_offset; /* device offset */ - uint64_t rc_size; /* I/O size */ - abd_t *rc_abd; /* I/O data */ - void *rc_gdata; /* used to store the "good" version */ - int rc_error; /* I/O error for this device */ - uint8_t rc_tried; /* Did we attempt this I/O column? */ - uint8_t rc_skipped; /* Did we skip this I/O column? 
*/ -} raidz_col_t; - -typedef struct raidz_map { - uint64_t rm_cols; /* Regular column count */ - uint64_t rm_scols; /* Count including skipped columns */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ - abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ - uintptr_t rm_reports; /* # of referencing checksum reports */ - uint8_t rm_freed; /* map no longer has referencing ZIO */ - uint8_t rm_ecksuminjected; /* checksum error was injected */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ -} raidz_map_t; - -#define VDEV_RAIDZ_P 0 -#define VDEV_RAIDZ_Q 1 -#define VDEV_RAIDZ_R 2 - -#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) -#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) - -/* - * We provide a mechanism to perform the field multiplication operation on a - * 64-bit value all at once rather than a byte at a time. This works by - * creating a mask from the top bit in each byte and using that to - * conditionally apply the XOR of 0x1d. - */ -#define VDEV_RAIDZ_64MUL_2(x, mask) \ -{ \ - (mask) = (x) & 0x8080808080808080ULL; \ - (mask) = ((mask) << 1) - ((mask) >> 7); \ - (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ - ((mask) & 0x1d1d1d1d1d1d1d1d); \ -} - -#define VDEV_RAIDZ_64MUL_4(x, mask) \ -{ \ - VDEV_RAIDZ_64MUL_2((x), mask); \ - VDEV_RAIDZ_64MUL_2((x), mask); \ -} - -#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) - -/* - * Force reconstruction to use the general purpose method. - */ -int vdev_raidz_default_to_general; - -/* Powers of 2 in the Galois field defined above. 
*/ -static const uint8_t vdev_raidz_pow2[256] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, - 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, - 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, - 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, - 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, - 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, - 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, - 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, - 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, - 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, - 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, - 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, - 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, - 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, - 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, - 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, - 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, - 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, - 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, - 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, - 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, - 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, - 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, - 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, - 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, - 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, - 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, - 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, - 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, - 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, - 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 -}; -/* Logs of 2 in the Galois field defined above. */ -static const uint8_t vdev_raidz_log2[256] = { - 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, - 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, - 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, - 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, - 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, - 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, - 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, - 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, - 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, - 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, - 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, - 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, - 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, - 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, - 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, - 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, - 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, - 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, - 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, - 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, - 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, - 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, - 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, - 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, - 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, - 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, - 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, - 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, - 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, - 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, - 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, - 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, -}; - -static void vdev_raidz_generate_parity(raidz_map_t *rm); - -/* - * Multiply a given number by 2 raised to the given power. 
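(Aside: before the removed vdev_raidz_exp2() below, a brief standalone illustration of the field arithmetic may help. The sketch is not part of the patch; the gf_* names are hypothetical. It regenerates equivalent power/log tables at run time from the multiply-by-2 expression given in the header comment, then uses the log/exp identity for general GF(2^8) multiplication.)

#include <stdio.h>
#include <stdint.h>

static uint8_t gf_pow2[256];
static uint8_t gf_log2[256];

/* Multiply by 2 using the primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

/* Build the power and log tables by repeated multiplication by 2. */
static void
gf_init(void)
{
	uint8_t x = 1;

	for (int i = 0; i < 255; i++) {
		gf_pow2[i] = x;
		gf_log2[x] = i;
		x = gf_mul2(x);
	}
	gf_pow2[255] = gf_pow2[0];	/* the cycle wraps: 2^255 == 1 */
}

/* A * B == 2^(log2(A) + log2(B)), with 0 as the absorbing element. */
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);
	return (gf_pow2[(gf_log2[a] + gf_log2[b]) % 255]);
}

int
main(void)
{
	gf_init();
	/* Multiplying by 2 three times is the same as multiplying by 8. */
	uint8_t a = 0xd7;
	printf("%d\n", gf_mul2(gf_mul2(gf_mul2(a))) == gf_mul(a, 8));
	return (0);
}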
- */ -static uint8_t -vdev_raidz_exp2(uint_t a, int exp) -{ - if (a == 0) - return (0); - - ASSERT(exp >= 0); - ASSERT(vdev_raidz_log2[a] > 0 || a == 1); - - exp += vdev_raidz_log2[a]; - if (exp > 255) - exp -= 255; - - return (vdev_raidz_pow2[exp]); -} - -static void -vdev_raidz_map_free(raidz_map_t *rm) -{ - int c; - - for (c = 0; c < rm->rm_firstdatacol; c++) { - if (rm->rm_col[c].rc_abd != NULL) - abd_free(rm->rm_col[c].rc_abd); - - if (rm->rm_col[c].rc_gdata != NULL) - zio_buf_free(rm->rm_col[c].rc_gdata, - rm->rm_col[c].rc_size); - } - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_abd != NULL) - abd_put(rm->rm_col[c].rc_abd); - } - - if (rm->rm_abd_copy != NULL) - abd_free(rm->rm_abd_copy); - - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); -} - -static void -vdev_raidz_map_free_vsd(zio_t *zio) -{ - raidz_map_t *rm = zio->io_vsd; - - ASSERT0(rm->rm_freed); - rm->rm_freed = 1; - - if (rm->rm_reports == 0) - vdev_raidz_map_free(rm); -} - -/*ARGSUSED*/ -static void -vdev_raidz_cksum_free(void *arg, size_t ignored) -{ - raidz_map_t *rm = arg; - - ASSERT3U(rm->rm_reports, >, 0); - - if (--rm->rm_reports == 0 && rm->rm_freed != 0) - vdev_raidz_map_free(rm); -} - -static void -vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) -{ - raidz_map_t *rm = zcr->zcr_cbdata; - size_t c = zcr->zcr_cbinfo; - size_t x; - - const char *good = NULL; - char *bad; - - if (good_data == NULL) { - zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); - return; - } - - if (c < rm->rm_firstdatacol) { - /* - * The first time through, calculate the parity blocks for - * the good data (this relies on the fact that the good - * data never changes for a given logical ZIO) - */ - if (rm->rm_col[0].rc_gdata == NULL) { - abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; - char *buf; - int offset; - - /* - * Set up the rm_col[]s to generate the parity for - * good_data, first saving the parity bufs and - * replacing them with buffers to hold the result. - */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_gdata = - zio_buf_alloc(rm->rm_col[x].rc_size); - rm->rm_col[x].rc_abd = - abd_get_from_buf(rm->rm_col[x].rc_gdata, - rm->rm_col[x].rc_size); - } - - /* fill in the data columns from good_data */ - buf = (char *)good_data; - for (; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_from_buf(buf, - rm->rm_col[x].rc_size); - buf += rm->rm_col[x].rc_size; - } - - /* - * Construct the parity from the good data. 
- */ - vdev_raidz_generate_parity(rm); - - /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = bad_parity[x]; - } - - offset = 0; - for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset( - rm->rm_abd_copy, offset); - offset += rm->rm_col[x].rc_size; - } - } - - ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = rm->rm_col[c].rc_gdata; - } else { - /* adjust good_data to point at the start of our column */ - good = good_data; - - for (x = rm->rm_firstdatacol; x < c; x++) - good += rm->rm_col[x].rc_size; - } - - bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); - /* we drop the ereport if it ends up that the data was good */ - zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); -} - -/* - * Invoked indirectly by zfs_ereport_start_checksum(), called - * below when our read operation fails completely. The main point - * is to keep a copy of everything we read from disk, so that at - * vdev_raidz_cksum_finish() time we can compare it with the good data. - */ -static void -vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) -{ - size_t c = (size_t)(uintptr_t)arg; - size_t offset; - - raidz_map_t *rm = zio->io_vsd; - size_t size; - - /* set up the report and bump the refcount */ - zcr->zcr_cbdata = rm; - zcr->zcr_cbinfo = c; - zcr->zcr_finish = vdev_raidz_cksum_finish; - zcr->zcr_free = vdev_raidz_cksum_free; - - rm->rm_reports++; - ASSERT3U(rm->rm_reports, >, 0); - - if (rm->rm_abd_copy != NULL) - return; - - /* - * It's the first time we're called for this raidz_map_t, so we need - * to copy the data aside; there's no guarantee that our zio's buffer - * won't be re-used for something else. - * - * Our parity data is already in separate buffers, so there's no need - * to copy them. - */ - - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; - - rm->rm_abd_copy = - abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); - - for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); - - abd_copy(tmp, col->rc_abd, col->rc_size); - abd_put(col->rc_abd); - col->rc_abd = tmp; - - offset += col->rc_size; - } - ASSERT3U(offset, ==, size); -} - -static const zio_vsd_ops_t vdev_raidz_vsd_ops = { - vdev_raidz_map_free_vsd, - vdev_raidz_cksum_report -}; - -/* - * Divides the IO evenly across all child vdevs; usually, dcols is - * the number of children in the target vdev. - */ -static raidz_map_t * -vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree, - uint64_t unit_shift, uint64_t dcols, uint64_t nparity) -{ - raidz_map_t *rm; - /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> unit_shift; - /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = size >> unit_shift; - /* The first column for this stripe. */ - uint64_t f = b % dcols; - /* The starting byte offset on each child vdev. */ - uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; - uint64_t off = 0; - - /* - * "Quotient": The number of data sectors for this stripe on all but - * the "big column" child vdevs that also contain "remainder" data. 
- */ - q = s / (dcols - nparity); - - /* - * "Remainder": The number of partial stripe data sectors in this I/O. - * This will add a sector to some, but not all, child vdevs. - */ - r = s - q * (dcols - nparity); - - /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); - - /* - * The total number of data and parity sectors associated with - * this I/O. - */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); - - /* acols: The columns that will be accessed. */ - /* scols: The columns that will be accessed or skipped. */ - if (q == 0) { - /* Our I/O request doesn't span all child vdevs. */ - acols = bc; - scols = MIN(dcols, roundup(bc, nparity + 1)); - } else { - acols = dcols; - scols = dcols; - } - - ASSERT3U(acols, <=, scols); - - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); - - rm->rm_cols = acols; - rm->rm_scols = scols; - rm->rm_bigcols = bc; - rm->rm_skipstart = bc; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - rm->rm_abd_copy = NULL; - rm->rm_reports = 0; - rm->rm_freed = 0; - rm->rm_ecksuminjected = 0; - - asize = 0; - - for (c = 0; c < scols; c++) { - col = f + c; - coff = o; - if (col >= dcols) { - col -= dcols; - coff += 1ULL << unit_shift; - } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_abd = NULL; - rm->rm_col[c].rc_gdata = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; - - if (c >= acols) - rm->rm_col[c].rc_size = 0; - else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << unit_shift; - else - rm->rm_col[c].rc_size = q << unit_shift; - - asize += rm->rm_col[c].rc_size; - } - - ASSERT3U(asize, ==, tot << unit_shift); - rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); - rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); - ASSERT3U(rm->rm_nskip, <=, nparity); - - if (!dofree) { - for (c = 0; c < rm->rm_firstdatacol; c++) { - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); - } - - for (off = 0, c = rm->rm_firstdatacol; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset(abd, off); - off += rm->rm_col[c].rc_size; - } - } - - /* - * If all data stored spans all columns, there's a danger that parity - * will always be on the same device and, since parity isn't read - * during normal operation, that that device's I/O bandwidth won't be - * used effectively. We therefore switch the parity every 1MB. - * - * ... at least that was, ostensibly, the theory. As a practical - * matter unless we juggle the parity between all devices evenly, we - * won't see any benefit. Further, occasional writes that aren't a - * multiple of the LCM of the number of children and the minimum - * stripe width are sufficient to avoid pessimal behavior. - * Unfortunately, this decision created an implicit on-disk format - * requirement that we need to support for all eternity, but only - * for single-parity RAID-Z. - * - * If we intend to skip a sector in the zeroth column for padding - * we must make sure to note this swap. We will never intend to - * skip the first column since at least one data and one parity - * column must appear in each row. 
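(Aside: to make the geometry above concrete, here is a small worked example that is not part of the patch. All parameters are assumptions, RAIDZ1 over five children with 512-byte sectors and a 5 KiB logical write, chosen so that both the remainder path and a skip sector appear.)

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t nparity = 1, dcols = 5, unit_shift = 9;
	uint64_t s = (5 * 1024) >> unit_shift;			/* 10 sectors */

	uint64_t q = s / (dcols - nparity);			/* 2 full rows */
	uint64_t r = s - q * (dcols - nparity);			/* 2 leftover */
	uint64_t bc = (r == 0 ? 0 : r + nparity);		/* 3 big columns */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));	/* 13 sectors */
	uint64_t nskip = ((tot + nparity) / (nparity + 1)) *
	    (nparity + 1) - tot;				/* 1 skip sector */

	/*
	 * Columns 0-2 get q + 1 = 3 sectors each and columns 3-4 get q = 2,
	 * 13 sectors in total; one skip sector pads the allocation up to a
	 * multiple of (nparity + 1) sectors.
	 */
	printf("q=%ju r=%ju bc=%ju tot=%ju nskip=%ju\n",
	    (uintmax_t)q, (uintmax_t)r, (uintmax_t)bc, (uintmax_t)tot,
	    (uintmax_t)nskip);
	return (0);
}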
- */ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - - if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; - - if (rm->rm_skipstart == 0) - rm->rm_skipstart = 1; - } - - return (rm); -} - -struct pqr_struct { - uint64_t *p; - uint64_t *q; - uint64_t *r; -}; - -static int -vdev_raidz_p_func(void *buf, size_t size, void *private) -{ - struct pqr_struct *pqr = private; - const uint64_t *src = buf; - int i, cnt = size / sizeof (src[0]); - - ASSERT(pqr->p && !pqr->q && !pqr->r); - - for (i = 0; i < cnt; i++, src++, pqr->p++) - *pqr->p ^= *src; - - return (0); -} - -static int -vdev_raidz_pq_func(void *buf, size_t size, void *private) -{ - struct pqr_struct *pqr = private; - const uint64_t *src = buf; - uint64_t mask; - int i, cnt = size / sizeof (src[0]); - - ASSERT(pqr->p && pqr->q && !pqr->r); - - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { - *pqr->p ^= *src; - VDEV_RAIDZ_64MUL_2(*pqr->q, mask); - *pqr->q ^= *src; - } - - return (0); -} - -static int -vdev_raidz_pqr_func(void *buf, size_t size, void *private) -{ - struct pqr_struct *pqr = private; - const uint64_t *src = buf; - uint64_t mask; - int i, cnt = size / sizeof (src[0]); - - ASSERT(pqr->p && pqr->q && pqr->r); - - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { - *pqr->p ^= *src; - VDEV_RAIDZ_64MUL_2(*pqr->q, mask); - *pqr->q ^= *src; - VDEV_RAIDZ_64MUL_4(*pqr->r, mask); - *pqr->r ^= *src; - } - - return (0); -} - -static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) -{ - uint64_t *p; - int c; - abd_t *src; - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - } else { - struct pqr_struct pqr = { p, NULL, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, - vdev_raidz_p_func, &pqr); - } - } -} - -static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) -{ - uint64_t *p, *q, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - } else { - struct pqr_struct pqr = { p, q, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, - vdev_raidz_pq_func, &pqr); - } - - if (c == rm->rm_firstdatacol) { - for (i = ccnt; i < pcnt; i++) { - p[i] = 0; - q[i] = 0; - } - } else { - /* - * Treat short columns as though they are full of 0s. - * Note that there's therefore nothing needed for P. 
- */ - for (i = ccnt; i < pcnt; i++) { - VDEV_RAIDZ_64MUL_2(q[i], mask); - } - } - } -} - -static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) -{ - uint64_t *p, *q, *r, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_R].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); - - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - (void) memcpy(r, p, rm->rm_col[c].rc_size); - } else { - struct pqr_struct pqr = { p, q, r }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, - vdev_raidz_pqr_func, &pqr); - } - - if (c == rm->rm_firstdatacol) { - for (i = ccnt; i < pcnt; i++) { - p[i] = 0; - q[i] = 0; - r[i] = 0; - } - } else { - /* - * Treat short columns as though they are full of 0s. - * Note that there's therefore nothing needed for P. - */ - for (i = ccnt; i < pcnt; i++) { - VDEV_RAIDZ_64MUL_2(q[i], mask); - VDEV_RAIDZ_64MUL_4(r[i], mask); - } - } - } -} - -/* - * Generate RAID parity in the first virtual columns according to the number of - * parity columns available. - */ -static void -vdev_raidz_generate_parity(raidz_map_t *rm) -{ - switch (rm->rm_firstdatacol) { - case 1: - vdev_raidz_generate_parity_p(rm); - break; - case 2: - vdev_raidz_generate_parity_pq(rm); - break; - case 3: - vdev_raidz_generate_parity_pqr(rm); - break; - default: - cmn_err(CE_PANIC, "invalid RAID-Z configuration"); - } -} - -/* ARGSUSED */ -static int -vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) -{ - uint64_t *dst = dbuf; - uint64_t *src = sbuf; - int cnt = size / sizeof (src[0]); - - for (int i = 0; i < cnt; i++) { - dst[i] ^= src[i]; - } - - return (0); -} - -/* ARGSUSED */ -static int -vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, - void *private) -{ - uint64_t *dst = dbuf; - uint64_t *src = sbuf; - uint64_t mask; - int cnt = size / sizeof (dst[0]); - - for (int i = 0; i < cnt; i++, dst++, src++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - *dst ^= *src; - } - - return (0); -} - -/* ARGSUSED */ -static int -vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) -{ - uint64_t *dst = buf; - uint64_t mask; - int cnt = size / sizeof (dst[0]); - - for (int i = 0; i < cnt; i++, dst++) { - /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ - VDEV_RAIDZ_64MUL_2(*dst, mask); - } - - return (0); -} - -struct reconst_q_struct { - uint64_t *q; - int exp; -}; - -static int -vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) -{ - struct reconst_q_struct *rq = private; - uint64_t *dst = buf; - int cnt = size / sizeof (dst[0]); - - for (int i = 0; i < cnt; i++, dst++, rq->q++) { - *dst ^= *rq->q; - - int j; - uint8_t *b; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, rq->exp); - } - } - - return (0); -} - -struct reconst_pq_struct { - uint8_t *p; - uint8_t *q; - uint8_t *pxy; - uint8_t *qxy; - int aexp; - int bexp; -}; - -static int -vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) -{ - struct reconst_pq_struct *rpq = private; - 
uint8_t *xd = xbuf; - uint8_t *yd = ybuf; - - for (int i = 0; i < size; - i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ - vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); - *yd = *rpq->p ^ *rpq->pxy ^ *xd; - } - - return (0); -} - -static int -vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) -{ - struct reconst_pq_struct *rpq = private; - uint8_t *xd = xbuf; - - for (int i = 0; i < size; - i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { - /* same operation as vdev_raidz_reconst_pq_func() on xd */ - *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ - vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); - } - - return (0); -} - -static int -vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) -{ - int x = tgts[0]; - int c; - abd_t *dst, *src; - - ASSERT(ntgts == 1); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(x < rm->rm_cols); - - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); - ASSERT(rm->rm_col[x].rc_size > 0); - - src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - dst = rm->rm_col[x].rc_abd; - - abd_copy(dst, src, rm->rm_col[x].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); - - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; - - if (c == x) - continue; - - (void) abd_iterate_func2(dst, src, 0, 0, size, - vdev_raidz_reconst_p_func, NULL); - } - - return (1 << VDEV_RAIDZ_P); -} - -static int -vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) -{ - int x = tgts[0]; - int c, exp; - abd_t *dst, *src; - - ASSERT(ntgts == 1); - - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); - - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; - - if (c == rm->rm_firstdatacol) { - abd_copy(dst, src, size); - if (rm->rm_col[x].rc_size > size) - abd_zero_off(dst, size, - rm->rm_col[x].rc_size - size); - } else { - ASSERT3U(size, <=, rm->rm_col[x].rc_size); - (void) abd_iterate_func2(dst, src, 0, 0, size, - vdev_raidz_reconst_q_pre_func, NULL); - (void) abd_iterate_func(dst, - size, rm->rm_col[x].rc_size - size, - vdev_raidz_reconst_q_pre_tail_func, NULL); - } - } - - src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - dst = rm->rm_col[x].rc_abd; - exp = 255 - (rm->rm_cols - 1 - x); - - struct reconst_q_struct rq = { abd_to_buf(src), exp }; - (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, - vdev_raidz_reconst_q_post_func, &rq); - - return (1 << VDEV_RAIDZ_Q); -} - -static int -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) -{ - uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; - abd_t *pdata, *qdata; - uint64_t xsize, ysize; - int x = tgts[0]; - int y = tgts[1]; - abd_t *xd, *yd; - - ASSERT(ntgts == 2); - ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); - - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); - - /* - * Move the parity data aside -- we're going to compute parity as - * though columns x and y were full of zeros -- Pxy and Qxy. We want to - * reuse the parity generation mechanism without trashing the actual - * parity so we make those columns appear to be full of zeros by - * setting their lengths to zero. 
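vdev_raidz_reconstruct_p() above is the single-erasure case: because P is the plain XOR of all data columns, the missing column falls out of XOR-ing P with every surviving column. A minimal byte-wise illustration (the data values are made up):

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint8_t d[4] = { 0x11, 0x22, 0x33, 0x44 };	/* data columns */
		uint8_t p = d[0] ^ d[1] ^ d[2] ^ d[3];		/* P parity */
		int x = 2;					/* the column we "lost" */
		uint8_t rebuilt = p;				/* start from P ... */

		for (int c = 0; c < 4; c++)
			if (c != x)
				rebuilt ^= d[c];		/* ... XOR the survivors */
		assert(rebuilt == d[x]);
		return (0);
	}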
- */ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; - - rm->rm_col[VDEV_RAIDZ_P].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; - - vdev_raidz_generate_parity_pq(rm); - - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; - - p = abd_to_buf(pdata); - q = abd_to_buf(qdata); - pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - xd = rm->rm_col[x].rc_abd; - yd = rm->rm_col[y].rc_abd; - - /* - * We now have: - * Pxy = P + D_x + D_y - * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y - * - * We can then solve for D_x: - * D_x = A * (P + Pxy) + B * (Q + Qxy) - * where - * A = 2^(x - y) * (2^(x - y) + 1)^-1 - * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 - * - * With D_x in hand, we can easily solve for D_y: - * D_y = P + Pxy + D_x - */ - - a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; - tmp = 255 - vdev_raidz_log2[a ^ 1]; - - aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; - bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - - ASSERT3U(xsize, >=, ysize); - struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; - (void) abd_iterate_func2(xd, yd, 0, 0, ysize, - vdev_raidz_reconst_pq_func, &rpq); - (void) abd_iterate_func(xd, ysize, xsize - ysize, - vdev_raidz_reconst_pq_tail_func, &rpq); - - abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - - /* - * Restore the saved parity data. - */ - rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; - - return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); -} - -/* BEGIN CSTYLED */ -/* - * In the general case of reconstruction, we must solve the system of linear - * equations defined by the coeffecients used to generate parity as well as - * the contents of the data and parity disks. This can be expressed with - * vectors for the original data (D) and the actual data (d) and parity (p) - * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): - * - * __ __ __ __ - * | | __ __ | p_0 | - * | V | | D_0 | | p_m-1 | - * | | x | : | = | d_0 | - * | I | | D_n-1 | | : | - * | | ~~ ~~ | d_n-1 | - * ~~ ~~ ~~ ~~ - * - * I is simply a square identity matrix of size n, and V is a vandermonde - * matrix defined by the coeffecients we chose for the various parity columns - * (1, 2, 4). Note that these values were chosen both for simplicity, speedy - * computation as well as linear separability. - * - * __ __ __ __ - * | 1 .. 1 1 1 | | p_0 | - * | 2^n-1 .. 4 2 1 | __ __ | : | - * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | - * | 1 .. 0 0 0 | | D_1 | | d_0 | - * | 0 .. 0 0 0 | x | D_2 | = | d_1 | - * | : : : : | | : | | d_2 | - * | 0 .. 1 0 0 | | D_n-1 | | : | - * | 0 .. 0 1 0 | ~~ ~~ | : | - * | 0 .. 0 0 1 | | d_n-1 | - * ~~ ~~ ~~ ~~ - * - * Note that I, V, d, and p are known. To compute D, we must invert the - * matrix and use the known data and parity values to reconstruct the unknown - * data values. We begin by removing the rows in V|I and d|p that correspond - * to failed or missing columns; we then make V|I square (n x n) and d|p - * sized n by removing rows corresponding to unused parity from the bottom up - * to generate (V|I)' and (d|p)'. 
We can then generate the inverse of (V|I)' - * using Gauss-Jordan elimination. In the example below we use m=3 parity - * columns, n=8 data columns, with errors in d_1, d_2, and p_1: - * __ __ - * | 1 1 1 1 1 1 1 1 | - * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks - * | 19 205 116 29 64 16 4 1 | / / - * | 1 0 0 0 0 0 0 0 | / / - * | 0 1 0 0 0 0 0 0 | <--' / - * (V|I) = | 0 0 1 0 0 0 0 0 | <---' - * | 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 1 1 1 1 1 1 1 | - * | 19 205 116 29 64 16 4 1 | - * | 1 0 0 0 0 0 0 0 | - * (V|I)' = | 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * - * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We - * have carefully chosen the seed values 1, 2, and 4 to ensure that this - * matrix is not singular. - * __ __ - * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | - * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | - * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | - * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | - * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | - * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | - * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | - * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | - * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * __ __ - * | 0 0 1 0 0 0 0 0 | - * | 167 100 5 41 159 169 217 208 | - * | 166 100 4 40 158 168 216 209 | - * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | - * | 0 0 0 0 1 0 0 0 | - * | 0 0 0 0 0 1 0 0 | - * | 0 0 0 0 0 0 1 0 | - * | 0 0 0 0 0 0 0 1 | - * ~~ ~~ - * - * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values - * of the missing data. 
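The row operations sketched in the matrices above are ordinary Gauss-Jordan steps, except that every multiply and divide happens in GF(2^8); the deleted file performs them with the vdev_raidz_pow2/vdev_raidz_log2 lookup tables referenced below, and the non-trivial rows of V are built from powers of 2 in that field. For reference, a table-free multiply in the same field (again assuming the 0x11d polynomial; this helper is illustrative and not part of the original file) looks like:

	#include <stdint.h>

	/* Carry-less "Russian peasant" multiply in GF(2^8) mod 0x11d. */
	static uint8_t
	gf256_mul(uint8_t a, uint8_t b)
	{
		uint8_t prod = 0;

		while (b != 0) {
			if (b & 1)
				prod ^= a;		/* add (XOR) this power of a */
			a = (uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00);
			b >>= 1;
		}
		return (prod);
	}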
- * - * As is apparent from the example above, the only non-trivial rows in the - * inverse matrix correspond to the data disks that we're trying to - * reconstruct. Indeed, those are the only rows we need as the others would - * only be useful for reconstructing data known or assumed to be valid. For - * that reason, we only build the coefficients in the rows that correspond to - * targeted columns. - */ -/* END CSTYLED */ - -static void -vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, - uint8_t **rows) -{ - int i, j; - int pow; - - ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); - - /* - * Fill in the missing rows of interest. - */ - for (i = 0; i < nmap; i++) { - ASSERT3S(0, <=, map[i]); - ASSERT3S(map[i], <=, 2); - - pow = map[i] * n; - if (pow > 255) - pow -= 255; - ASSERT(pow <= 255); - - for (j = 0; j < n; j++) { - pow -= map[i]; - if (pow < 0) - pow += 255; - rows[i][j] = vdev_raidz_pow2[pow]; - } - } -} - -static void -vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, - uint8_t **rows, uint8_t **invrows, const uint8_t *used) -{ - int i, j, ii, jj; - uint8_t log; - - /* - * Assert that the first nmissing entries from the array of used - * columns correspond to parity columns and that subsequent entries - * correspond to data columns. - */ - for (i = 0; i < nmissing; i++) { - ASSERT3S(used[i], <, rm->rm_firstdatacol); - } - for (; i < n; i++) { - ASSERT3S(used[i], >=, rm->rm_firstdatacol); - } - - /* - * First initialize the storage where we'll compute the inverse rows. - */ - for (i = 0; i < nmissing; i++) { - for (j = 0; j < n; j++) { - invrows[i][j] = (i == j) ? 1 : 0; - } - } - - /* - * Subtract all trivial rows from the rows of consequence. - */ - for (i = 0; i < nmissing; i++) { - for (j = nmissing; j < n; j++) { - ASSERT3U(used[j], >=, rm->rm_firstdatacol); - jj = used[j] - rm->rm_firstdatacol; - ASSERT3S(jj, <, n); - invrows[i][j] = rows[i][jj]; - rows[i][jj] = 0; - } - } - - /* - * For each of the rows of interest, we must normalize it and subtract - * a multiple of it from the other rows. - */ - for (i = 0; i < nmissing; i++) { - for (j = 0; j < missing[i]; j++) { - ASSERT0(rows[i][j]); - } - ASSERT3U(rows[i][missing[i]], !=, 0); - - /* - * Compute the inverse of the first element and multiply each - * element in the row by that value. - */ - log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; - - for (j = 0; j < n; j++) { - rows[i][j] = vdev_raidz_exp2(rows[i][j], log); - invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); - } - - for (ii = 0; ii < nmissing; ii++) { - if (i == ii) - continue; - - ASSERT3U(rows[ii][missing[i]], !=, 0); - - log = vdev_raidz_log2[rows[ii][missing[i]]]; - - for (j = 0; j < n; j++) { - rows[ii][j] ^= - vdev_raidz_exp2(rows[i][j], log); - invrows[ii][j] ^= - vdev_raidz_exp2(invrows[i][j], log); - } - } - } - - /* - * Verify that the data that is left in the rows are properly part of - * an identity matrix. 
- */ - for (i = 0; i < nmissing; i++) { - for (j = 0; j < n; j++) { - if (j == missing[i]) { - ASSERT3U(rows[i][j], ==, 1); - } else { - ASSERT0(rows[i][j]); - } - } - } -} - -static void -vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, - int *missing, uint8_t **invrows, const uint8_t *used) -{ - int i, j, x, cc, c; - uint8_t *src; - uint64_t ccount; - uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; - uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; - uint8_t log = 0; - uint8_t val; - int ll; - uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; - uint8_t *p, *pp; - size_t psize; - - psize = sizeof (invlog[0][0]) * n * nmissing; - p = kmem_alloc(psize, KM_SLEEP); - - for (pp = p, i = 0; i < nmissing; i++) { - invlog[i] = pp; - pp += n; - } - - for (i = 0; i < nmissing; i++) { - for (j = 0; j < n; j++) { - ASSERT3U(invrows[i][j], !=, 0); - invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; - } - } - - for (i = 0; i < n; i++) { - c = used[i]; - ASSERT3U(c, <, rm->rm_cols); - - src = abd_to_buf(rm->rm_col[c].rc_abd); - ccount = rm->rm_col[c].rc_size; - for (j = 0; j < nmissing; j++) { - cc = missing[j] + rm->rm_firstdatacol; - ASSERT3U(cc, >=, rm->rm_firstdatacol); - ASSERT3U(cc, <, rm->rm_cols); - ASSERT3U(cc, !=, c); - - dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); - dcount[j] = rm->rm_col[cc].rc_size; - } - - ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); - - for (x = 0; x < ccount; x++, src++) { - if (*src != 0) - log = vdev_raidz_log2[*src]; - - for (cc = 0; cc < nmissing; cc++) { - if (x >= dcount[cc]) - continue; - - if (*src == 0) { - val = 0; - } else { - if ((ll = log + invlog[cc][i]) >= 255) - ll -= 255; - val = vdev_raidz_pow2[ll]; - } - - if (i == 0) - dst[cc][x] = val; - else - dst[cc][x] ^= val; - } - } - } - - kmem_free(p, psize); -} - -static int -vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) -{ - int n, i, c, t, tt; - int nmissing_rows; - int missing_rows[VDEV_RAIDZ_MAXPARITY]; - int parity_map[VDEV_RAIDZ_MAXPARITY]; - - uint8_t *p, *pp; - size_t psize; - - uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; - uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; - uint8_t *used; - - abd_t **bufs = NULL; - - int code = 0; - - /* - * Matrix reconstruction can't use scatter ABDs yet, so we allocate - * temporary linear ABDs. - */ - if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { - bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - - bufs[c] = col->rc_abd; - col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); - } - } - - n = rm->rm_cols - rm->rm_firstdatacol; - - /* - * Figure out which data columns are missing. - */ - nmissing_rows = 0; - for (t = 0; t < ntgts; t++) { - if (tgts[t] >= rm->rm_firstdatacol) { - missing_rows[nmissing_rows++] = - tgts[t] - rm->rm_firstdatacol; - } - } - - /* - * Figure out which parity columns to use to help generate the missing - * data columns. - */ - for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { - ASSERT(tt < ntgts); - ASSERT(c < rm->rm_firstdatacol); - - /* - * Skip any targeted parity columns. 
- */ - if (c == tgts[tt]) { - tt++; - continue; - } - - code |= 1 << c; - - parity_map[i] = c; - i++; - } - - ASSERT(code != 0); - ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); - - psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * - nmissing_rows * n + sizeof (used[0]) * n; - p = kmem_alloc(psize, KM_SLEEP); - - for (pp = p, i = 0; i < nmissing_rows; i++) { - rows[i] = pp; - pp += n; - invrows[i] = pp; - pp += n; - } - used = pp; - - for (i = 0; i < nmissing_rows; i++) { - used[i] = parity_map[i]; - } - - for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - if (tt < nmissing_rows && - c == missing_rows[tt] + rm->rm_firstdatacol) { - tt++; - continue; - } - - ASSERT3S(i, <, n); - used[i] = c; - i++; - } - - /* - * Initialize the interesting rows of the matrix. - */ - vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); - - /* - * Invert the matrix. - */ - vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, - invrows, used); - - /* - * Reconstruct the missing data using the generated matrix. - */ - vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, - invrows, used); - - kmem_free(p, psize); - - /* - * copy back from temporary linear abds and free them - */ - if (bufs) { - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - - abd_copy(bufs[c], col->rc_abd, col->rc_size); - abd_free(col->rc_abd); - col->rc_abd = bufs[c]; - } - kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); - } - - return (code); -} - -static int -vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) -{ - int tgts[VDEV_RAIDZ_MAXPARITY], *dt; - int ntgts; - int i, c; - int code; - int nbadparity, nbaddata; - int parity_valid[VDEV_RAIDZ_MAXPARITY]; - - /* - * The tgts list must already be sorted. - */ - for (i = 1; i < nt; i++) { - ASSERT(t[i] > t[i - 1]); - } - - nbadparity = rm->rm_firstdatacol; - nbaddata = rm->rm_cols - nbadparity; - ntgts = 0; - for (i = 0, c = 0; c < rm->rm_cols; c++) { - if (c < rm->rm_firstdatacol) - parity_valid[c] = B_FALSE; - - if (i < nt && c == t[i]) { - tgts[ntgts++] = c; - i++; - } else if (rm->rm_col[c].rc_error != 0) { - tgts[ntgts++] = c; - } else if (c >= rm->rm_firstdatacol) { - nbaddata--; - } else { - parity_valid[c] = B_TRUE; - nbadparity--; - } - } - - ASSERT(ntgts >= nt); - ASSERT(nbaddata >= 0); - ASSERT(nbaddata + nbadparity == ntgts); - - dt = &tgts[nbadparity]; - - /* - * See if we can use any of our optimized reconstruction routines. 
- */ - if (!vdev_raidz_default_to_general) { - switch (nbaddata) { - case 1: - if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); - - ASSERT(rm->rm_firstdatacol > 1); - - if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); - - ASSERT(rm->rm_firstdatacol > 2); - break; - - case 2: - ASSERT(rm->rm_firstdatacol > 1); - - if (parity_valid[VDEV_RAIDZ_P] && - parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); - - ASSERT(rm->rm_firstdatacol > 2); - - break; - } - } - - code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); - ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); - ASSERT(code > 0); - return (code); -} - -static int -vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; - int c; - int lasterror = 0; - int numerrors = 0; - - ASSERT(nparity > 0); - - if (nparity > VDEV_RAIDZ_MAXPARITY || - vd->vdev_children < nparity + 1) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - vdev_open_children(vd); - - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; - - if (cvd->vdev_open_error != 0) { - lasterror = cvd->vdev_open_error; - numerrors++; - continue; - } - - *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); - } - - *asize *= vd->vdev_children; - *max_asize *= vd->vdev_children; - - if (numerrors > nparity) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } - - return (0); -} - -static void -vdev_raidz_close(vdev_t *vd) -{ - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} - -#ifdef illumos -/* - * Handle a read or write I/O to a RAID-Z dump device. - * - * The dump device is in a unique situation compared to other ZFS datasets: - * writing to this device should be as simple and fast as possible. In - * addition, durability matters much less since the dump will be extracted - * once the machine reboots. For that reason, this function eschews parity for - * performance and simplicity. The dump device uses the checksum setting - * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this - * dataset. - * - * Blocks of size 128 KB have been preallocated for this volume. I/Os less than - * 128 KB will not fill an entire block; in addition, they may not be properly - * aligned. In that case, this function uses the preallocated 128 KB block and - * omits reading or writing any "empty" portions of that block, as opposed to - * allocating a fresh appropriately-sized block. - * - * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: - * - * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) - * - * If this were a standard RAID-Z dataset, a block of at least 40 KB would be - * allocated which spans all five child vdevs. 8 KB of data would be written to - * each of four vdevs, with the fifth containing the parity bits. 
- * - * parity data data data data - * | PP | XX | XX | XX | XX | - * ^ ^ ^ ^ ^ - * | | | | | - * 8 KB parity ------8 KB data blocks------ - * - * However, when writing to the dump device, the behavior is different: - * - * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) - * - * Unlike the normal RAID-Z case in which the block is allocated based on the - * I/O size, reads and writes here always use a 128 KB logical I/O size. If the - * I/O size is less than 128 KB, only the actual portions of data are written. - * In this example the data is written to the third data vdev since that vdev - * contains the offset [64 KB, 96 KB). - * - * parity data data data data - * | | | | XX | | - * ^ - * | - * 32 KB data block - * - * As a result, an individual I/O may not span all child vdevs; moreover, a - * small I/O may only operate on a single child vdev. - * - * Note that since there are no parity bits calculated or written, this format - * remains the same no matter how many parity bits are used in a normal RAID-Z - * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above - * would look like: - * - * parity parity parity data data data data - * | | | | | | XX | | - * ^ - * | - * 32 KB data block - */ -int -vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, - uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) -{ - vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; - int c, err = 0; - - uint64_t start, end, colstart, colend; - uint64_t coloffset, colsize, colskip; - - int flags = doread ? BIO_READ : BIO_WRITE; - -#ifdef _KERNEL - - /* - * Don't write past the end of the block - */ - VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); - - start = offset; - end = start + size; - - /* - * Allocate a RAID-Z map for this block. Note that this block starts - * from the "original" offset, this is, the offset of the extent which - * contains the requisite offset of the data being read or written. - * - * Even if this I/O operation doesn't span the full block size, let's - * treat the on-disk format as if the only blocks are the complete 128 - * KB size. - */ - abd_t *abd = abd_get_from_buf(data - (offset - origoffset), - SPA_OLD_MAXBLOCKSIZE); - rm = vdev_raidz_map_alloc(abd, - SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, - vd->vdev_children, vd->vdev_nparity); - - coloffset = origoffset; - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; - c++, coloffset += rc->rc_size) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - /* - * Find the start and end of this column in the RAID-Z map, - * keeping in mind that the stated size and offset of the - * operation may not fill the entire column for this vdev. - * - * If any portion of the data spans this column, issue the - * appropriate operation to the vdev. - */ - if (coloffset + rc->rc_size <= start) - continue; - if (coloffset >= end) - continue; - - colstart = MAX(coloffset, start); - colend = MIN(end, coloffset + rc->rc_size); - colsize = colend - colstart; - colskip = colstart - coloffset; - - VERIFY3U(colsize, <=, rc->rc_size); - VERIFY3U(colskip, <=, rc->rc_size); - - /* - * Note that the child vdev will have a vdev label at the start - * of its range of offsets, hence the need for - * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another - * example of why this calculation is needed. 
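The colstart/colend/colskip computation in the physio loop above is plain interval intersection between the caller's [start, end) range and the column's [coloffset, coloffset + rc_size) range. A standalone sketch, using the 32 KB-at-64 KB example from the comment (the helper name and byte values are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	/* Clip an I/O of [start, end) against one column's byte range. */
	static void
	clip_column(uint64_t coloffset, uint64_t colsize_total,
	    uint64_t start, uint64_t end)
	{
		if (coloffset + colsize_total <= start || coloffset >= end) {
			printf("column not touched\n");
			return;
		}
		uint64_t colstart = coloffset > start ? coloffset : start;
		uint64_t colend = end < coloffset + colsize_total ?
		    end : coloffset + colsize_total;

		printf("%ju bytes at offset %ju within the column\n",
		    (uintmax_t)(colend - colstart),
		    (uintmax_t)(colstart - coloffset));
	}

	int
	main(void)
	{
		/* third data column of a 128 KB block, full 32 KB overlap */
		clip_column(64 * 1024, 32 * 1024, 64 * 1024, 96 * 1024);
		return (0);
	}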
- */ - if ((err = vdev_disk_physio(cvd, - ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize, - VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, - flags, isdump)) != 0) - break; - } - - vdev_raidz_map_free(rm); - abd_put(abd); -#endif /* KERNEL */ - - return (err); -} -#endif - -static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) -{ - uint64_t asize; - uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; - - asize = ((psize - 1) >> ashift) + 1; - asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); - asize = roundup(asize, nparity + 1) << ashift; - - return (asize); -} - -static void -vdev_raidz_child_done(zio_t *zio) -{ - raidz_col_t *rc = zio->io_private; - - rc->rc_error = zio->io_error; - rc->rc_tried = 1; - rc->rc_skipped = 0; -} - -static void -vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) -{ -#ifdef ZFS_DEBUG - vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; - - range_seg_t logical_rs, physical_rs; - logical_rs.rs_start = zio->io_offset; - logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(zio->io_vd, zio->io_size); - - raidz_col_t *rc = &rm->rm_col[col]; - vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - - vdev_xlate(cvd, &logical_rs, &physical_rs); - ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); - ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); - /* - * It would be nice to assert that rs_end is equal - * to rc_offset + rc_size but there might be an - * optional I/O at the end that is not accounted in - * rc_size. - */ - if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { - ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + - rc->rc_size + (1 << tvd->vdev_ashift)); - } else { - ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); - } -#endif -} - -/* - * Start an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Generate the parity data - * 2. Create child zio write operations to each column's vdev, for both - * data and parity. - * 3. If the column skips any sectors for padding, create optional dummy - * write zio children for those areas to improve aggregation continuity. - * - For read operations: - * 1. Create child zio read operations to each data column's vdev to read - * the range of data required for zio. - * 2. If this is a scrub or resilver operation, or if any of the data - * vdevs have had errors, then create zio read operations to the parity - * columns' VDevs as well. - */ -static void -vdev_raidz_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; - int c, i; - - rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, - zio->io_type == ZIO_TYPE_FREE, - tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); - - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; - - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); - - if (zio->io_type == ZIO_TYPE_FREE) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } - - zio_execute(zio); - return; - } - - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_generate_parity(rm); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - /* - * Verify physical to logical translation. 
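vdev_raidz_asize() above converts a logical block size into allocated size by adding one parity sector per (cols - nparity) data sectors and then padding to a multiple of (nparity + 1) sectors. The sketch below restates the same arithmetic outside the kernel; the sample figures (a 16 KB block on a 6-wide raidz2 with 4 KB sectors) are chosen for illustration and give 24 KB of allocated space.

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t
	raidz_asize(uint64_t psize, uint64_t ashift, uint64_t cols,
	    uint64_t nparity)
	{
		uint64_t asize = ((psize - 1) >> ashift) + 1;	/* data sectors */

		/* one parity sector per (cols - nparity) data sectors */
		asize += nparity * ((asize + cols - nparity - 1) /
		    (cols - nparity));
		/* pad to a multiple of (nparity + 1) sectors, back to bytes */
		asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1);
		return (asize << ashift);
	}

	int
	main(void)
	{
		/* 16 KB block, 4 KB sectors, 6-wide raidz2 -> 24576 bytes */
		printf("%ju\n", (uintmax_t)raidz_asize(16384, 12, 6, 2));
		return (0);
	}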
- */ - vdev_raidz_io_verify(zio, rm, c); - - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } - - /* - * Generate optional I/Os for any skipped sectors to improve - * aggregation contiguity. - */ - for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { - ASSERT(c <= rm->rm_scols); - if (c == rm->rm_scols) - c = 0; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset + rc->rc_size, NULL, - 1 << tvd->vdev_ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); - } - - zio_execute(zio); - return; - } - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - /* - * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity. - */ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (!vdev_readable(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ENXIO); - rc->rc_tried = 1; /* don't even try */ - rc->rc_skipped = 1; - continue; - } - if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ESTALE); - rc->rc_skipped = 1; - continue; - } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || - (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } - } - - zio_execute(zio); -} - - -/* - * Report a checksum error for a child of a RAID-Z device. - */ -static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) -{ - void *buf; - vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - zio_bad_cksum_t zbc; - raidz_map_t *rm = zio->io_vsd; - - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = rm->rm_ecksuminjected; - - buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); - zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, buf, bad_data, - &zbc); - abd_return_buf(rc->rc_abd, buf, rc->rc_size); - } -} - -/* - * We keep track of whether or not there were any injected errors, so that - * any ereports we generate can note it. - */ -static int -raidz_checksum_verify(zio_t *zio) -{ - zio_bad_cksum_t zbc; - raidz_map_t *rm = zio->io_vsd; - - int ret = zio_checksum_error(zio, &zbc); - if (ret != 0 && zbc.zbc_injected != 0) - rm->rm_ecksuminjected = 1; - - return (ret); -} - -/* - * Generate the parity from the data columns. If we tried and were able to - * read the parity without error, verify that the generated parity matches the - * data we read. If it doesn't, we fire off a checksum error. Return the - * number such failures. - */ -static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) -{ - void *orig[VDEV_RAIDZ_MAXPARITY]; - int c, ret = 0; - raidz_col_t *rc; - - blkptr_t *bp = zio->io_bp; - enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? 
ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - - if (checksum == ZIO_CHECKSUM_NOPARITY) - return (ret); - - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; - if (!rc->rc_tried || rc->rc_error != 0) - continue; - orig[c] = zio_buf_alloc(rc->rc_size); - abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); - } - - vdev_raidz_generate_parity(rm); - - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; - if (!rc->rc_tried || rc->rc_error != 0) - continue; - if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) { - raidz_checksum_error(zio, rc, orig[c]); - rc->rc_error = SET_ERROR(ECKSUM); - ret++; - } - zio_buf_free(orig[c], rc->rc_size); - } - - return (ret); -} - -/* - * Keep statistics on all the ways that we used parity to correct data. - */ -static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; - -static int -vdev_raidz_worst_error(raidz_map_t *rm) -{ - int error = 0; - - for (int c = 0; c < rm->rm_cols; c++) - error = zio_worst_error(error, rm->rm_col[c].rc_error); - - return (error); -} - -/* - * Iterate over all combinations of bad data and attempt a reconstruction. - * Note that the algorithm below is non-optimal because it doesn't take into - * account how reconstruction is actually performed. For example, with - * triple-parity RAID-Z the reconstruction procedure is the same if column 4 - * is targeted as invalid as if columns 1 and 4 are targeted since in both - * cases we'd only use parity information in column 0. - */ -static int -vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) -{ - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - void *orig[VDEV_RAIDZ_MAXPARITY]; - int tstore[VDEV_RAIDZ_MAXPARITY + 2]; - int *tgts = &tstore[1]; - int current, next, i, c, n; - int code, ret = 0; - - ASSERT(total_errors < rm->rm_firstdatacol); - - /* - * This simplifies one edge condition. - */ - tgts[-1] = -1; - - for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { - /* - * Initialize the targets array by finding the first n columns - * that contain no error. - * - * If there were no data errors, we need to ensure that we're - * always explicitly attempting to reconstruct at least one - * data column. To do this, we simply push the highest target - * up into the data columns. - */ - for (c = 0, i = 0; i < n; i++) { - if (i == n - 1 && data_errors == 0 && - c < rm->rm_firstdatacol) { - c = rm->rm_firstdatacol; - } - - while (rm->rm_col[c].rc_error != 0) { - c++; - ASSERT3S(c, <, rm->rm_cols); - } - - tgts[i] = c++; - } - - /* - * Setting tgts[n] simplifies the other edge condition. - */ - tgts[n] = rm->rm_cols; - - /* - * These buffers were allocated in previous iterations. - */ - for (i = 0; i < n - 1; i++) { - ASSERT(orig[i] != NULL); - } - - orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); - - current = 0; - next = tgts[current]; - - while (current != n) { - tgts[current] = next; - current = 0; - - /* - * Save off the original data that we're going to - * attempt to reconstruct. - */ - for (i = 0; i < n; i++) { - ASSERT(orig[i] != NULL); - c = tgts[i]; - ASSERT3S(c, >=, 0); - ASSERT3S(c, <, rm->rm_cols); - rc = &rm->rm_col[c]; - abd_copy_to_buf(orig[i], rc->rc_abd, - rc->rc_size); - } - - /* - * Attempt a reconstruction and exit the outer loop on - * success. 
- */ - code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(zio) == 0) { - atomic_inc_64(&raidz_corrected[code]); - - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - ASSERT(rc->rc_error == 0); - if (rc->rc_tried) - raidz_checksum_error(zio, rc, - orig[i]); - rc->rc_error = SET_ERROR(ECKSUM); - } - - ret = code; - goto done; - } - - /* - * Restore the original data. - */ - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - abd_copy_from_buf(rc->rc_abd, orig[i], - rc->rc_size); - } - - do { - /* - * Find the next valid column after the current - * position.. - */ - for (next = tgts[current] + 1; - next < rm->rm_cols && - rm->rm_col[next].rc_error != 0; next++) - continue; - - ASSERT(next <= tgts[current + 1]); - - /* - * If that spot is available, we're done here. - */ - if (next != tgts[current + 1]) - break; - - /* - * Otherwise, find the next valid column after - * the previous position. - */ - for (c = tgts[current - 1] + 1; - rm->rm_col[c].rc_error != 0; c++) - continue; - - tgts[current] = c; - current++; - - } while (current != n); - } - } - n--; -done: - for (i = 0; i < n; i++) { - zio_buf_free(orig[i], rm->rm_col[0].rc_size); - } - - return (ret); -} - -/* - * Complete an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Check for errors on the child IOs. - * 2. Return, setting an error code if too few child VDevs were written - * to reconstruct the data later. Note that partial writes are - * considered successful if they can be reconstructed at all. - * - For read operations: - * 1. Check for errors on the child IOs. - * 2. If data errors occurred: - * a. Try to reassemble the data from the parity available. - * b. If we haven't yet read the parity drives, read them now. - * c. If all parity drives have been read but the data still doesn't - * reassemble with a correct checksum, then try combinatorial - * reconstruction. - * d. If that doesn't work, return an error. - * 3. If there were unexpected errors or this is a resilver operation, - * rewrite the vdevs that had errors. - */ -static void -vdev_raidz_io_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_t *cvd; - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - int unexpected_errors = 0; - int parity_errors = 0; - int parity_untried = 0; - int data_errors = 0; - int total_errors = 0; - int n, c; - int tgts[VDEV_RAIDZ_MAXPARITY]; - int code; - - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ - - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - - if (rc->rc_error) { - ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - if (c < rm->rm_firstdatacol) - parity_errors++; - else - data_errors++; - - if (!rc->rc_skipped) - unexpected_errors++; - - total_errors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { - parity_untried++; - } - } - - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * XXX -- for now, treat partial writes as a success. - * (If we couldn't write enough columns to reconstruct - * the data, the I/O failed. Otherwise, good enough.) - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. 
- */ - /* XXPOLICY */ - if (total_errors > rm->rm_firstdatacol) - zio->io_error = vdev_raidz_worst_error(rm); - - return; - } else if (zio->io_type == ZIO_TYPE_FREE) { - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); - /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. perform combinatorial reconstruction - * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. - */ - - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. Naturally, this case applies in the absence of - * any errors. - */ - if (total_errors <= rm->rm_firstdatacol - parity_untried) { - if (data_errors == 0) { - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. - */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; - } - } else { - /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. - */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); - - /* - * Identify the data columns that reported an error. - */ - n = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) { - ASSERT(n < VDEV_RAIDZ_MAXPARITY); - tgts[n++] = c; - } - } - - ASSERT(rm->rm_firstdatacol >= n); - - code = vdev_raidz_reconstruct(rm, tgts, n); - - if (raidz_checksum_verify(zio) == 0) { - atomic_inc_64(&raidz_corrected[code]); - - /* - * If we read more parity disks than were used - * for reconstruction, confirm that the other - * parity disks produced correct data. This - * routine is suboptimal in that it regenerates - * the parity that we already used in addition - * to the parity that we're attempting to - * verify, but this should be a relatively - * uncommon case, and can be optimized if it - * becomes a problem. Note that we regenerate - * parity when resilvering so we can write it - * out to failed devices later. - */ - if (parity_errors < rm->rm_firstdatacol - n || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - - goto done; - } - } - } - - /* - * This isn't a typical situation -- either we got a read error or - * a child silently returned bad data. Read every block so we can - * try again with as much data and parity as we can track down. If - * we've already been through once before, all children will be marked - * as tried so we'll proceed to combinatorial reconstruction. 
- */ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - - for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) - continue; - - zio_vdev_io_redone(zio); - do { - rc = &rm->rm_col[c]; - if (rc->rc_tried) - continue; - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } while (++c < rm->rm_cols); - - return; - } - - /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. We check if there is enough additional data to - * possibly reconstruct the data and then perform combinatorial - * reconstruction over all possible combinations. If that fails, - * we're cooked. - */ - if (total_errors > rm->rm_firstdatacol) { - zio->io_error = vdev_raidz_worst_error(rm); - - } else if (total_errors < rm->rm_firstdatacol && - (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { - /* - * If we didn't use all the available parity for the - * combinatorial reconstruction, verify that the remaining - * parity is correct. - */ - if (code != (1 << rm->rm_firstdatacol) - 1) - (void) raidz_parity_verify(zio, rm); - } else { - /* - * We're here because either: - * - * total_errors == rm_firstdatacol, or - * vdev_raidz_combrec() failed - * - * In either case, there is enough bad data to prevent - * reconstruction. - * - * Start checksum ereports for all children which haven't - * failed, and the IO wasn't speculative. - */ - zio->io_error = SET_ERROR(ECKSUM); - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error == 0) { - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = - rm->rm_ecksuminjected; - - zfs_ereport_start_checksum( - zio->io_spa, - vd->vdev_child[rc->rc_devidx], - zio, rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); - } - } - } - } - -done: - zio_checksum_verified(zio); - - if (zio->io_error == 0 && spa_writeable(zio->io_spa) && - (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { - /* - * Use the good data we have in hand to repair damaged children. - */ - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - if (rc->rc_error == 0) - continue; - - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | (unexpected_errors ? - ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); - } - } -} - -static void -vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (faulted > vd->vdev_nparity) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded + faulted != 0) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); -} - -/* - * Determine if any portion of the provided block resides on a child vdev - * with a dirty DTL and therefore needs to be resilvered. The function - * assumes that at least one DTL is dirty which imples that full stripe - * width blocks must be resilvered. 
- */ -static boolean_t -vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) -{ - uint64_t dcols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; - uint64_t ashift = vd->vdev_top->vdev_ashift; - /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> ashift; - /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = ((psize - 1) >> ashift) + 1; - /* The first column for this stripe. */ - uint64_t f = b % dcols; - - if (s + nparity >= dcols) - return (B_TRUE); - - for (uint64_t c = 0; c < s + nparity; c++) { - uint64_t devidx = (f + c) % dcols; - vdev_t *cvd = vd->vdev_child[devidx]; - - /* - * dsl_scan_need_resilver() already checked vd with - * vdev_dtl_contains(). So here just check cvd with - * vdev_dtl_empty(), cheaper and a good approximation. - */ - if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) - return (B_TRUE); - } - - return (B_FALSE); -} - -static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) -{ - vdev_t *raidvd = cvd->vdev_parent; - ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); - - uint64_t width = raidvd->vdev_children; - uint64_t tgt_col = cvd->vdev_id; - uint64_t ashift = raidvd->vdev_top->vdev_ashift; - - /* make sure the offsets are block-aligned */ - ASSERT0(in->rs_start % (1 << ashift)); - ASSERT0(in->rs_end % (1 << ashift)); - uint64_t b_start = in->rs_start >> ashift; - uint64_t b_end = in->rs_end >> ashift; - - uint64_t start_row = 0; - if (b_start > tgt_col) /* avoid underflow */ - start_row = ((b_start - tgt_col - 1) / width) + 1; - - uint64_t end_row = 0; - if (b_end > tgt_col) - end_row = ((b_end - tgt_col - 1) / width) + 1; - - res->rs_start = start_row << ashift; - res->rs_end = end_row << ashift; - - ASSERT3U(res->rs_start, <=, in->rs_start); - ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); -} - -vdev_ops_t vdev_raidz_ops = { - vdev_raidz_open, - vdev_raidz_close, - vdev_raidz_asize, - vdev_raidz_io_start, - vdev_raidz_io_done, - vdev_raidz_state_change, - vdev_raidz_need_resilver, - NULL, - NULL, - NULL, - vdev_raidz_xlate, - VDEV_TYPE_RAIDZ, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c +++ /dev/null @@ -1,2156 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * This file contains the necessary logic to remove vdevs from a - * storage pool. Currently, the only devices that can be removed - * are log, cache, and spare devices; and top level vdevs from a pool - * w/o raidz. (Note that members of a mirror can also be removed - * by the detach operation.) - * - * Log vdevs are removed by evacuating them and then turning the vdev - * into a hole vdev while holding spa config locks. - * - * Top level vdevs are removed and converted into an indirect vdev via - * a multi-step process: - * - * - Disable allocations from this device (spa_vdev_remove_top). - * - * - From a new thread (spa_vdev_remove_thread), copy data from - * the removing vdev to a different vdev. The copy happens in open - * context (spa_vdev_copy_impl) and issues a sync task - * (vdev_mapping_sync) so the sync thread can update the partial - * indirect mappings in core and on disk. - * - * - If a free happens during a removal, it is freed from the - * removing vdev, and if it has already been copied, from the new - * location as well (free_from_removing_vdev). - * - * - After the removal is completed, the copy thread converts the vdev - * into an indirect vdev (vdev_remove_complete) before instructing - * the sync thread to destroy the space maps and finish the removal - * (spa_finish_removal). - */ - -typedef struct vdev_copy_arg { - metaslab_t *vca_msp; - uint64_t vca_outstanding_bytes; - kcondvar_t vca_cv; - kmutex_t vca_lock; -} vdev_copy_arg_t; - -/* - * The maximum amount of memory we can use for outstanding i/o while - * doing a device removal. This determines how much i/o we can have - * in flight concurrently. - */ -int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; - -/* - * The largest contiguous segment that we will attempt to allocate when - * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If - * there is a performance problem with attempting to allocate large blocks, - * consider decreasing this. - * - * Note: we will issue I/Os of up to this size. The mpt driver does not - * respond well to I/Os larger than 1MB, so we set this to 1MB. (When - * mpt processes an I/O larger than 1MB, it needs to do an allocation of - * 2 physically contiguous pages; if this allocation fails, mpt will drop - * the I/O and hang the device.) - */ -int zfs_remove_max_segment = 1024 * 1024; - -/* - * Allow a remap segment to span free chunks of at most this size. The main - * impact of a larger span is that we will read and write larger, more - * contiguous chunks, with more "unnecessary" data -- trading off bandwidth - * for iops. The value here was chosen to align with - * zfs_vdev_read_gap_limit, which is a similar concept when doing regular - * reads (but there's no reason it has to be the same). - * - * Additionally, a higher span will have the following relatively minor - * effects: - * - the mapping will be smaller, since one entry can cover more allocated - * segments - * - more of the fragmentation in the removing device will be preserved - * - we'll do larger allocations, which may fail and fall back on smaller - * allocations - */ -int vdev_removal_max_span = 32 * 1024; - -/* - * This is used by the test suite so that it can ensure that certain - * actions happen while in the middle of a removal. 
- */ -uint64_t zfs_remove_max_bytes_pause = UINT64_MAX; - -#define VDEV_REMOVAL_ZAP_OBJS "lzap" - -static void spa_vdev_remove_thread(void *arg); - -static void -spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) -{ - VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_REMOVING, sizeof (uint64_t), - sizeof (spa->spa_removing_phys) / sizeof (uint64_t), - &spa->spa_removing_phys, tx)); -} - -static nvlist_t * -spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) -{ - for (int i = 0; i < count; i++) { - uint64_t guid = - fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); - - if (guid == target_guid) - return (nvpp[i]); - } - - return (NULL); -} - -static void -spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) -{ - nvlist_t **newdev = NULL; - - if (count > 1) - newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); - - for (int i = 0, j = 0; i < count; i++) { - if (dev[i] == dev_to_remove) - continue; - VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); - } - - VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); - - for (int i = 0; i < count - 1; i++) - nvlist_free(newdev[i]); - - if (count > 1) - kmem_free(newdev, (count - 1) * sizeof (void *)); -} - -static spa_vdev_removal_t * -spa_vdev_removal_create(vdev_t *vd) -{ - spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); - mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = range_tree_create(NULL, NULL); - svr->svr_vdev_id = vd->vdev_id; - - for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = range_tree_create(NULL, NULL); - list_create(&svr->svr_new_segments[i], - sizeof (vdev_indirect_mapping_entry_t), - offsetof(vdev_indirect_mapping_entry_t, vime_node)); - } - - return (svr); -} - -void -spa_vdev_removal_destroy(spa_vdev_removal_t *svr) -{ - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(svr->svr_bytes_done[i]); - ASSERT0(svr->svr_max_offset_to_sync[i]); - range_tree_destroy(svr->svr_frees[i]); - list_destroy(&svr->svr_new_segments[i]); - } - - range_tree_destroy(svr->svr_allocd_segs); - mutex_destroy(&svr->svr_lock); - cv_destroy(&svr->svr_cv); - kmem_free(svr, sizeof (*svr)); -} - -/* - * This is called as a synctask in the txg in which we will mark this vdev - * as removing (in the config stored in the MOS). 
- * - * It begins the evacuation of a toplevel vdev by: - * - initializing the spa_removing_phys which tracks this removal - * - computing the amount of space to remove for accounting purposes - * - dirtying all dbufs in the spa_config_object - * - creating the spa_vdev_removal - * - starting the spa_vdev_remove_thread - */ -static void -vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) -{ - int vdev_id = (uintptr_t)arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = vdev_lookup_top(spa, vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; - spa_vdev_removal_t *svr = NULL; - uint64_t txg = dmu_tx_get_txg(tx); - - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); - svr = spa_vdev_removal_create(vd); - - ASSERT(vd->vdev_removing); - ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); - - spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); - if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - /* - * By activating the OBSOLETE_COUNTS feature, we prevent - * the pool from being downgraded and ensure that the - * refcounts are precise. - */ - spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - uint64_t one = 1; - VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, - &one, tx)); - ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0); - } - - vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); - vd->vdev_indirect_mapping = - vdev_indirect_mapping_open(mos, vic->vic_mapping_object); - vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); - vd->vdev_indirect_births = - vdev_indirect_births_open(mos, vic->vic_births_object); - spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; - spa->spa_removing_phys.sr_start_time = gethrestime_sec(); - spa->spa_removing_phys.sr_end_time = 0; - spa->spa_removing_phys.sr_state = DSS_SCANNING; - spa->spa_removing_phys.sr_to_copy = 0; - spa->spa_removing_phys.sr_copied = 0; - - /* - * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because - * there may be space in the defer tree, which is free, but still - * counted in vs_alloc. - */ - for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { - metaslab_t *ms = vd->vdev_ms[i]; - if (ms->ms_sm == NULL) - continue; - - spa->spa_removing_phys.sr_to_copy += - metaslab_allocated_space(ms); - - /* - * Space which we are freeing this txg does not need to - * be copied. - */ - spa->spa_removing_phys.sr_to_copy -= - range_tree_space(ms->ms_freeing); - - ASSERT0(range_tree_space(ms->ms_freed)); - for (int t = 0; t < TXG_SIZE; t++) - ASSERT0(range_tree_space(ms->ms_allocating[t])); - } - - /* - * Sync tasks are called before metaslab_sync(), so there should - * be no already-synced metaslabs in the TXG_CLEAN list. - */ - ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); - - spa_sync_removing_state(spa, tx); - - /* - * All blocks that we need to read the most recent mapping must be - * stored on concrete vdevs. Therefore, we must dirty anything that - * is read before spa_remove_init(). Specifically, the - * spa_config_object. (Note that although we already modified the - * spa_config_object in spa_sync_removing_state, that may not have - * modified all blocks of the object.) 
- */ - dmu_object_info_t doi; - VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); - for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { - dmu_buf_t *dbuf; - VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, - offset, FTAG, &dbuf, 0)); - dmu_buf_will_dirty(dbuf, tx); - offset += dbuf->db_size; - dmu_buf_rele(dbuf, FTAG); - } - - /* - * Now that we've allocated the im_object, dirty the vdev to ensure - * that the object gets written to the config on disk. - */ - vdev_config_dirty(vd); - - zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu " - "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), - vic->vic_mapping_object); - - spa_history_log_internal(spa, "vdev remove started", tx, - "%s vdev %llu %s", spa_name(spa), vd->vdev_id, - (vd->vdev_path != NULL) ? vd->vdev_path : "-"); - /* - * Setting spa_vdev_removal causes subsequent frees to call - * free_from_removing_vdev(). Note that we don't need any locking - * because we are the sync thread, and metaslab_free_impl() is only - * called from syncing context (potentially from a zio taskq thread, - * but in any case only when there are outstanding free i/os, which - * there are not). - */ - ASSERT3P(spa->spa_vdev_removal, ==, NULL); - spa->spa_vdev_removal = svr; - svr->svr_thread = thread_create(NULL, 0, - spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); -} - -/* - * When we are opening a pool, we must read the mapping for each - * indirect vdev in order from most recently removed to least - * recently removed. We do this because the blocks for the mapping - * of older indirect vdevs may be stored on more recently removed vdevs. - * In order to read each indirect mapping object, we must have - * initialized all more recently removed vdevs. - */ -int -spa_remove_init(spa_t *spa) -{ - int error; - - error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_REMOVING, sizeof (uint64_t), - sizeof (spa->spa_removing_phys) / sizeof (uint64_t), - &spa->spa_removing_phys); - - if (error == ENOENT) { - spa->spa_removing_phys.sr_state = DSS_NONE; - spa->spa_removing_phys.sr_removing_vdev = -1; - spa->spa_removing_phys.sr_prev_indirect_vdev = -1; - spa->spa_indirect_vdevs_loaded = B_TRUE; - return (0); - } else if (error != 0) { - return (error); - } - - if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { - /* - * We are currently removing a vdev. Create and - * initialize a spa_vdev_removal_t from the bonus - * buffer of the removing vdevs vdev_im_object, and - * initialize its partial mapping. 
- */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - vdev_t *vd = vdev_lookup_top(spa, - spa->spa_removing_phys.sr_removing_vdev); - - if (vd == NULL) { - spa_config_exit(spa, SCL_STATE, FTAG); - return (EINVAL); - } - - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - ASSERT(vdev_is_concrete(vd)); - spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); - ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); - ASSERT(vd->vdev_removing); - - vd->vdev_indirect_mapping = vdev_indirect_mapping_open( - spa->spa_meta_objset, vic->vic_mapping_object); - vd->vdev_indirect_births = vdev_indirect_births_open( - spa->spa_meta_objset, vic->vic_births_object); - spa_config_exit(spa, SCL_STATE, FTAG); - - spa->spa_vdev_removal = svr; - } - - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - uint64_t indirect_vdev_id = - spa->spa_removing_phys.sr_prev_indirect_vdev; - while (indirect_vdev_id != UINT64_MAX) { - vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - vd->vdev_indirect_mapping = vdev_indirect_mapping_open( - spa->spa_meta_objset, vic->vic_mapping_object); - vd->vdev_indirect_births = vdev_indirect_births_open( - spa->spa_meta_objset, vic->vic_births_object); - - indirect_vdev_id = vic->vic_prev_indirect_vdev; - } - spa_config_exit(spa, SCL_STATE, FTAG); - - /* - * Now that we've loaded all the indirect mappings, we can allow - * reads from other blocks (e.g. via predictive prefetch). - */ - spa->spa_indirect_vdevs_loaded = B_TRUE; - return (0); -} - -void -spa_restart_removal(spa_t *spa) -{ - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - - if (svr == NULL) - return; - - /* - * In general when this function is called there is no - * removal thread running. The only scenario where this - * is not true is during spa_import() where this function - * is called twice [once from spa_import_impl() and - * spa_async_resume()]. Thus, in the scenario where we - * import a pool that has an ongoing removal we don't - * want to spawn a second thread. - */ - if (svr->svr_thread != NULL) - return; - - if (!spa_writeable(spa)) - return; - - zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); - svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, - 0, &p0, TS_RUN, minclsyspri); -} - -/* - * Process freeing from a device which is in the middle of being removed. - * We must handle this carefully so that we attempt to copy freed data, - * and we correctly free already-copied data. - */ -void -free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) -{ - spa_t *spa = vd->vdev_spa; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - uint64_t txg = spa_syncing_txg(spa); - uint64_t max_offset_yet = 0; - - ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); - ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, - vdev_indirect_mapping_object(vim)); - ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); - - mutex_enter(&svr->svr_lock); - - /* - * Remove the segment from the removing vdev's spacemap. This - * ensures that we will not attempt to copy this space (if the - * removal thread has not yet visited it), and also ensures - * that we know what is actually allocated on the new vdevs - * (needed if we cancel the removal). 
- * - * Note: we must do the metaslab_free_concrete() with the svr_lock - * held, so that the remove_thread can not load this metaslab and then - * visit this offset between the time that we metaslab_free_concrete() - * and when we check to see if it has been visited. - * - * Note: The checkpoint flag is set to false as having/taking - * a checkpoint and removing a device can't happen at the same - * time. - */ - ASSERT(!spa_has_checkpoint(spa)); - metaslab_free_concrete(vd, offset, size, B_FALSE); - - uint64_t synced_size = 0; - uint64_t synced_offset = 0; - uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); - if (offset < max_offset_synced) { - /* - * The mapping for this offset is already on disk. - * Free from the new location. - * - * Note that we use svr_max_synced_offset because it is - * updated atomically with respect to the in-core mapping. - * By contrast, vim_max_offset is not. - * - * This block may be split between a synced entry and an - * in-flight or unvisited entry. Only process the synced - * portion of it here. - */ - synced_size = MIN(size, max_offset_synced - offset); - synced_offset = offset; - - ASSERT3U(max_offset_yet, <=, max_offset_synced); - max_offset_yet = max_offset_synced; - - DTRACE_PROBE3(remove__free__synced, - spa_t *, spa, - uint64_t, offset, - uint64_t, synced_size); - - size -= synced_size; - offset += synced_size; - } - - /* - * Look at all in-flight txgs starting from the currently syncing one - * and see if a section of this free is being copied. By starting from - * this txg and iterating forward, we might find that this region - * was copied in two different txgs and handle it appropriately. - */ - for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { - int txgoff = (txg + i) & TXG_MASK; - if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { - /* - * The mapping for this offset is in flight, and - * will be synced in txg+i. - */ - uint64_t inflight_size = MIN(size, - svr->svr_max_offset_to_sync[txgoff] - offset); - - DTRACE_PROBE4(remove__free__inflight, - spa_t *, spa, - uint64_t, offset, - uint64_t, inflight_size, - uint64_t, txg + i); - - /* - * We copy data in order of increasing offset. - * Therefore the max_offset_to_sync[] must increase - * (or be zero, indicating that nothing is being - * copied in that txg). - */ - if (svr->svr_max_offset_to_sync[txgoff] != 0) { - ASSERT3U(svr->svr_max_offset_to_sync[txgoff], - >=, max_offset_yet); - max_offset_yet = - svr->svr_max_offset_to_sync[txgoff]; - } - - /* - * We've already committed to copying this segment: - * we have allocated space elsewhere in the pool for - * it and have an IO outstanding to copy the data. We - * cannot free the space before the copy has - * completed, or else the copy IO might overwrite any - * new data. To free that space, we record the - * segment in the appropriate svr_frees tree and free - * the mapped space later, in the txg where we have - * completed the copy and synced the mapping (see - * vdev_mapping_sync). - */ - range_tree_add(svr->svr_frees[txgoff], - offset, inflight_size); - size -= inflight_size; - offset += inflight_size; - - /* - * This space is already accounted for as being - * done, because it is being copied in txg+i. - * However, if i!=0, then it is being copied in - * a future txg. If we crash after this txg - * syncs but before txg+i syncs, then the space - * will be free. Therefore we must account - * for the space being done in *this* txg - * (when it is freed) rather than the future txg - * (when it will be copied). 
- */ - ASSERT3U(svr->svr_bytes_done[txgoff], >=, - inflight_size); - svr->svr_bytes_done[txgoff] -= inflight_size; - svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; - } - } - ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); - - if (size > 0) { - /* - * The copy thread has not yet visited this offset. Ensure - * that it doesn't. - */ - - DTRACE_PROBE3(remove__free__unvisited, - spa_t *, spa, - uint64_t, offset, - uint64_t, size); - - if (svr->svr_allocd_segs != NULL) - range_tree_clear(svr->svr_allocd_segs, offset, size); - - /* - * Since we now do not need to copy this data, for - * accounting purposes we have done our job and can count - * it as completed. - */ - svr->svr_bytes_done[txg & TXG_MASK] += size; - } - mutex_exit(&svr->svr_lock); - - /* - * Now that we have dropped svr_lock, process the synced portion - * of this free. - */ - if (synced_size > 0) { - vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); - - /* - * Note: this can only be called from syncing context, - * and the vdev_indirect_mapping is only changed from the - * sync thread, so we don't need svr_lock while doing - * metaslab_free_impl_cb. - */ - boolean_t checkpoint = B_FALSE; - vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, - metaslab_free_impl_cb, &checkpoint); - } -} - -/* - * Stop an active removal and update the spa_removing phys. - */ -static void -spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) -{ - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); - - /* Ensure the removal thread has completed before we free the svr. */ - spa_vdev_remove_suspend(spa); - - ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); - - if (state == DSS_FINISHED) { - spa_removing_phys_t *srp = &spa->spa_removing_phys; - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - - if (srp->sr_prev_indirect_vdev != UINT64_MAX) { - vdev_t *pvd = vdev_lookup_top(spa, - srp->sr_prev_indirect_vdev); - ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); - } - - vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; - srp->sr_prev_indirect_vdev = vd->vdev_id; - } - spa->spa_removing_phys.sr_state = state; - spa->spa_removing_phys.sr_end_time = gethrestime_sec(); - - spa->spa_vdev_removal = NULL; - spa_vdev_removal_destroy(svr); - - spa_sync_removing_state(spa, tx); - - vdev_config_dirty(spa->spa_root_vdev); -} - -static void -free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) -{ - vdev_t *vd = arg; - vdev_indirect_mark_obsolete(vd, offset, size); - boolean_t checkpoint = B_FALSE; - vdev_indirect_ops.vdev_op_remap(vd, offset, size, - metaslab_free_impl_cb, &checkpoint); -} - -/* - * On behalf of the removal thread, syncs an incremental bit more of - * the indirect mapping to disk and updates the in-memory mapping. - * Called as a sync task in every txg that the removal thread makes progress. 
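The free/copy interaction above is, in effect, a three-way partition of the freed range: a synced prefix (already mapped on disk, freed from the new location), in-flight slices (recorded in svr_frees for the txg that will sync them), and an unvisited tail (cleared from svr_allocd_segs and counted as done). A minimal userland sketch of that partition, with two hypothetical boundaries standing in for vdev_indirect_mapping_max_offset() and svr_max_offset_to_sync[] (the real code walks all in-flight txgs under svr_lock):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

void
partition_free(uint64_t offset, uint64_t size,
    uint64_t max_offset_synced, uint64_t max_offset_inflight)
{
	if (offset < max_offset_synced) {
		uint64_t synced = MIN(size, max_offset_synced - offset);
		printf("synced:    %llu bytes at %llu\n",
		    (unsigned long long)synced, (unsigned long long)offset);
		offset += synced;
		size -= synced;
	}
	if (size > 0 && offset < max_offset_inflight) {
		uint64_t inflight = MIN(size, max_offset_inflight - offset);
		printf("in-flight: %llu bytes at %llu\n",
		    (unsigned long long)inflight, (unsigned long long)offset);
		offset += inflight;
		size -= inflight;
	}
	if (size > 0)
		printf("unvisited: %llu bytes at %llu\n",
		    (unsigned long long)size, (unsigned long long)offset);
}

int
main(void)
{
	/* Example: 60 bytes freed at 100, synced up to 120, in flight to 140. */
	partition_free(100, 60, 120, 140);
	return (0);
}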
- */ -static void -vdev_mapping_sync(void *arg, dmu_tx_t *tx) -{ - spa_vdev_removal_t *svr = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - uint64_t txg = dmu_tx_get_txg(tx); - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - ASSERT(vic->vic_mapping_object != 0); - ASSERT3U(txg, ==, spa_syncing_txg(spa)); - - vdev_indirect_mapping_add_entries(vim, - &svr->svr_new_segments[txg & TXG_MASK], tx); - vdev_indirect_births_add_entry(vd->vdev_indirect_births, - vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); - - /* - * Free the copied data for anything that was freed while the - * mapping entries were in flight. - */ - mutex_enter(&svr->svr_lock); - range_tree_vacate(svr->svr_frees[txg & TXG_MASK], - free_mapped_segment_cb, vd); - ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, - vdev_indirect_mapping_max_offset(vim)); - svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; - mutex_exit(&svr->svr_lock); - - spa_sync_removing_state(spa, tx); -} - -typedef struct vdev_copy_segment_arg { - spa_t *vcsa_spa; - dva_t *vcsa_dest_dva; - uint64_t vcsa_txg; - range_tree_t *vcsa_obsolete_segs; -} vdev_copy_segment_arg_t; - -static void -unalloc_seg(void *arg, uint64_t start, uint64_t size) -{ - vdev_copy_segment_arg_t *vcsa = arg; - spa_t *spa = vcsa->vcsa_spa; - blkptr_t bp = { 0 }; - - BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); - BP_SET_LSIZE(&bp, size); - BP_SET_PSIZE(&bp, size); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(&bp, DMU_OT_NONE); - BP_SET_LEVEL(&bp, 0); - BP_SET_DEDUP(&bp, 0); - BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); - - DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); - DVA_SET_OFFSET(&bp.blk_dva[0], - DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); - DVA_SET_ASIZE(&bp.blk_dva[0], size); - - zio_free(spa, vcsa->vcsa_txg, &bp); -} - -/* - * All reads and writes associated with a call to spa_vdev_copy_segment() - * are done. - */ -static void -spa_vdev_copy_segment_done(zio_t *zio) -{ - vdev_copy_segment_arg_t *vcsa = zio->io_private; - - range_tree_vacate(vcsa->vcsa_obsolete_segs, - unalloc_seg, vcsa); - range_tree_destroy(vcsa->vcsa_obsolete_segs); - kmem_free(vcsa, sizeof (*vcsa)); - - spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); -} - -/* - * The write of the new location is done. - */ -static void -spa_vdev_copy_segment_write_done(zio_t *zio) -{ - vdev_copy_arg_t *vca = zio->io_private; - - abd_free(zio->io_abd); - - mutex_enter(&vca->vca_lock); - vca->vca_outstanding_bytes -= zio->io_size; - cv_signal(&vca->vca_cv); - mutex_exit(&vca->vca_lock); -} - -/* - * The read of the old location is done. The parent zio is the write to - * the new location. Allow it to start. - */ -static void -spa_vdev_copy_segment_read_done(zio_t *zio) -{ - zio_nowait(zio_unique_parent(zio)); -} - -/* - * If the old and new vdevs are mirrors, we will read both sides of the old - * mirror, and write each copy to the corresponding side of the new mirror. - * If the old and new vdevs have a different number of children, we will do - * this as best as possible. Since we aren't verifying checksums, this - * ensures that as long as there's a good copy of the data, we'll have a - * good copy after the removal, even if there's silent damage to one side - * of the mirror. 
If we're removing a mirror that has some silent damage, - * we'll have exactly the same damage in the new location (assuming that - * the new location is also a mirror). - * - * We accomplish this by creating a tree of zio_t's, with as many writes as - * there are "children" of the new vdev (a non-redundant vdev counts as one - * child, a 2-way mirror has 2 children, etc). Each write has an associated - * read from a child of the old vdev. Typically there will be the same - * number of children of the old and new vdevs. However, if there are more - * children of the new vdev, some child(ren) of the old vdev will be issued - * multiple reads. If there are more children of the old vdev, some copies - * will be dropped. - * - * For example, the tree of zio_t's for a 2-way mirror is: - * - * null - * / \ - * write(new vdev, child 0) write(new vdev, child 1) - * | | - * read(old vdev, child 0) read(old vdev, child 1) - * - * Child zio's complete before their parents complete. However, zio's - * created with zio_vdev_child_io() may be issued before their children - * complete. In this case we need to make sure that the children (reads) - * complete before the parents (writes) are *issued*. We do this by not - * calling zio_nowait() on each write until its corresponding read has - * completed. - * - * The spa_config_lock must be held while zio's created by - * zio_vdev_child_io() are in progress, to ensure that the vdev tree does - * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" - * zio is needed to release the spa_config_lock after all the reads and - * writes complete. (Note that we can't grab the config lock for each read, - * because it is not reentrant - we could deadlock with a thread waiting - * for a write lock.) - */ -static void -spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, - vdev_t *source_vd, uint64_t source_offset, - vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) -{ - ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); - - mutex_enter(&vca->vca_lock); - vca->vca_outstanding_bytes += size; - mutex_exit(&vca->vca_lock); - - abd_t *abd = abd_alloc_for_io(size, B_FALSE); - - vdev_t *source_child_vd; - if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { - /* - * Source and dest are both mirrors. Copy from the same - * child id as we are copying to (wrapping around if there - * are more dest children than source children). - */ - source_child_vd = - source_vd->vdev_child[dest_id % source_vd->vdev_children]; - } else { - source_child_vd = source_vd; - } - - zio_t *write_zio = zio_vdev_child_io(nzio, NULL, - dest_child_vd, dest_offset, abd, size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, - ZIO_FLAG_CANFAIL, - spa_vdev_copy_segment_write_done, vca); - - zio_nowait(zio_vdev_child_io(write_zio, NULL, - source_child_vd, source_offset, abd, size, - ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, - ZIO_FLAG_CANFAIL, - spa_vdev_copy_segment_read_done, vca)); -} - -/* - * Allocate a new location for this segment, and create the zio_t's to - * read from the old location and write to the new location. 
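For the mirror-to-mirror copy described above, each destination child's read is taken from the source child with the same id, wrapping when the destination has more children than the source; so a 2-way source feeding a 3-way destination is read from children 0, 1, 0. A small standalone sketch of just that selection rule (the helper name is hypothetical):

#include <stdio.h>

/* Sketch: which source-mirror child feeds each destination-mirror child. */
int
source_child_for(int dest_id, int source_children)
{
	/* dest_id == -1 means the destination is not a mirror. */
	return (dest_id == -1 ? 0 : dest_id % source_children);
}

int
main(void)
{
	int source_children = 2;	/* e.g. old 2-way mirror */
	int dest_children = 3;		/* e.g. new 3-way mirror */

	for (int i = 0; i < dest_children; i++)
		printf("dest child %d reads from source child %d\n",
		    i, source_child_for(i, source_children));
	return (0);
}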
- */ -static int -spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, - uint64_t maxalloc, uint64_t txg, - vdev_copy_arg_t *vca, zio_alloc_list_t *zal) -{ - metaslab_group_t *mg = vd->vdev_mg; - spa_t *spa = vd->vdev_spa; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_indirect_mapping_entry_t *entry; - dva_t dst = { 0 }; - uint64_t start = range_tree_min(segs); - - ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); - - uint64_t size = range_tree_span(segs); - if (range_tree_span(segs) > maxalloc) { - /* - * We can't allocate all the segments. Prefer to end - * the allocation at the end of a segment, thus avoiding - * additional split blocks. - */ - range_seg_t search; - avl_index_t where; - search.rs_start = start + maxalloc; - search.rs_end = search.rs_start; - range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); - if (rs == NULL) { - rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); - } else { - rs = AVL_PREV(&segs->rt_root, rs); - } - if (rs != NULL) { - size = rs->rs_end - start; - } else { - /* - * There are no segments that end before maxalloc. - * I.e. the first segment is larger than maxalloc, - * so we must split it. - */ - size = maxalloc; - } - } - ASSERT3U(size, <=, maxalloc); - - /* - * An allocation class might not have any remaining vdevs or space - */ - metaslab_class_t *mc = mg->mg_class; - if (mc != spa_normal_class(spa) && mc->mc_groups <= 1) - mc = spa_normal_class(spa); - int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, - zal, 0); - if (error == ENOSPC && mc != spa_normal_class(spa)) { - error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, - &dst, 0, NULL, txg, 0, zal, 0); - } - if (error != 0) - return (error); - - /* - * Determine the ranges that are not actually needed. Offsets are - * relative to the start of the range to be copied (i.e. relative to the - * local variable "start"). - */ - range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); - - range_seg_t *rs = avl_first(&segs->rt_root); - ASSERT3U(rs->rs_start, ==, start); - uint64_t prev_seg_end = rs->rs_end; - while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { - if (rs->rs_start >= start + size) { - break; - } else { - range_tree_add(obsolete_segs, - prev_seg_end - start, - rs->rs_start - prev_seg_end); - } - prev_seg_end = rs->rs_end; - } - /* We don't end in the middle of an obsolete range */ - ASSERT3U(start + size, <=, prev_seg_end); - - range_tree_clear(segs, start, size); - - /* - * We can't have any padding of the allocated size, otherwise we will - * misunderstand what's allocated, and the size of the mapping. - * The caller ensures this will be true by passing in a size that is - * aligned to the worst (highest) ashift in the pool. - */ - ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); - - entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); - DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); - entry->vime_mapping.vimep_dst = dst; - if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - entry->vime_obsolete_count = range_tree_space(obsolete_segs); - } - - vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); - vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; - vcsa->vcsa_obsolete_segs = obsolete_segs; - vcsa->vcsa_spa = spa; - vcsa->vcsa_txg = txg; - - /* - * See comment before spa_vdev_copy_one_child(). 
- */ - spa_config_enter(spa, SCL_STATE, spa, RW_READER); - zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, - spa_vdev_copy_segment_done, vcsa, 0); - vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); - if (dest_vd->vdev_ops == &vdev_mirror_ops) { - for (int i = 0; i < dest_vd->vdev_children; i++) { - vdev_t *child = dest_vd->vdev_child[i]; - spa_vdev_copy_one_child(vca, nzio, vd, start, - child, DVA_GET_OFFSET(&dst), i, size); - } - } else { - spa_vdev_copy_one_child(vca, nzio, vd, start, - dest_vd, DVA_GET_OFFSET(&dst), -1, size); - } - zio_nowait(nzio); - - list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); - ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); - vdev_dirty(vd, 0, NULL, txg); - - return (0); -} - -/* - * Complete the removal of a toplevel vdev. This is called as a - * synctask in the same txg that we will sync out the new config (to the - * MOS object) which indicates that this vdev is indirect. - */ -static void -vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) -{ - spa_vdev_removal_t *svr = arg; - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(svr->svr_bytes_done[i]); - } - - ASSERT3U(spa->spa_removing_phys.sr_copied, ==, - spa->spa_removing_phys.sr_to_copy); - - vdev_destroy_spacemaps(vd, tx); - - /* destroy leaf zaps, if any */ - ASSERT3P(svr->svr_zaplist, !=, NULL); - for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); - pair != NULL; - pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { - vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); - } - fnvlist_free(svr->svr_zaplist); - - spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); - /* vd->vdev_path is not available here */ - spa_history_log_internal(spa, "vdev remove completed", tx, - "%s vdev %llu", spa_name(spa), vd->vdev_id); -} - -static void -vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) -{ - ASSERT3P(zlist, !=, NULL); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); - - if (vd->vdev_leaf_zap != 0) { - char zkey[32]; - (void) snprintf(zkey, sizeof (zkey), "%s-%ju", - VDEV_REMOVAL_ZAP_OBJS, (uintmax_t)vd->vdev_leaf_zap); - fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); - } - - for (uint64_t id = 0; id < vd->vdev_children; id++) { - vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); - } -} - -static void -vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) -{ - vdev_t *ivd; - dmu_tx_t *tx; - spa_t *spa = vd->vdev_spa; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - - /* - * First, build a list of leaf zaps to be destroyed. - * This is passed to the sync context thread, - * which does the actual unlinking. - */ - svr->svr_zaplist = fnvlist_alloc(); - vdev_remove_enlist_zaps(vd, svr->svr_zaplist); - - ivd = vdev_add_parent(vd, &vdev_indirect_ops); - ivd->vdev_removing = 0; - - vd->vdev_leaf_zap = 0; - - vdev_remove_child(ivd, vd); - vdev_compact_children(ivd); - - ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); - - tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, - 0, ZFS_SPACE_CHECK_NONE, tx); - dmu_tx_commit(tx); - - /* - * Indicate that this thread has exited. - * After this, we can not use svr. 
- */ - mutex_enter(&svr->svr_lock); - svr->svr_thread = NULL; - cv_broadcast(&svr->svr_cv); - mutex_exit(&svr->svr_lock); -} - -/* - * Complete the removal of a toplevel vdev. This is called in open - * context by the removal thread after we have copied all vdev's data. - */ -static void -vdev_remove_complete(spa_t *spa) -{ - uint64_t txg; - - /* - * Wait for any deferred frees to be synced before we call - * vdev_metaslab_fini() - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - txg = spa_vdev_enter(spa); - vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - - sysevent_t *ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_DEV); - - zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", - vd->vdev_id, txg); - - /* - * Discard allocation state. - */ - if (vd->vdev_mg != NULL) { - vdev_metaslab_fini(vd); - metaslab_group_destroy(vd->vdev_mg); - vd->vdev_mg = NULL; - } - ASSERT0(vd->vdev_stat.vs_space); - ASSERT0(vd->vdev_stat.vs_dspace); - - vdev_remove_replace_with_indirect(vd, txg); - - /* - * We now release the locks, allowing spa_sync to run and finish the - * removal via vdev_remove_complete_sync in syncing context. - * - * Note that we hold on to the vdev_t that has been replaced. Since - * it isn't part of the vdev tree any longer, it can't be concurrently - * manipulated, even while we don't have the config lock. - */ - (void) spa_vdev_exit(spa, NULL, txg, 0); - - /* - * Top ZAP should have been transferred to the indirect vdev in - * vdev_remove_replace_with_indirect. - */ - ASSERT0(vd->vdev_top_zap); - - /* - * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. - */ - ASSERT0(vd->vdev_leaf_zap); - - txg = spa_vdev_enter(spa); - (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - /* - * Request to update the config and the config cachefile. - */ - vdev_config_dirty(spa->spa_root_vdev); - (void) spa_vdev_exit(spa, vd, txg, 0); - - spa_event_post(ev); -} - -/* - * Evacuates a segment of size at most max_alloc from the vdev - * via repeated calls to spa_vdev_copy_segment. If an allocation - * fails, the pool is probably too fragmented to handle such a - * large size, so decrease max_alloc so that the caller will not try - * this size again this txg. - */ -static void -spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, - uint64_t *max_alloc, dmu_tx_t *tx) -{ - uint64_t txg = dmu_tx_get_txg(tx); - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - mutex_enter(&svr->svr_lock); - - /* - * Determine how big of a chunk to copy. We can allocate up - * to max_alloc bytes, and we can span up to vdev_removal_max_span - * bytes of unallocated space at a time. "segs" will track the - * allocated segments that we are copying. We may also be copying - * free segments (of up to vdev_removal_max_span bytes). - */ - range_tree_t *segs = range_tree_create(NULL, NULL); - for (;;) { - range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); - if (rs == NULL) - break; - - uint64_t seg_length; - - if (range_tree_is_empty(segs)) { - /* need to truncate the first seg based on max_alloc */ - seg_length = - MIN(rs->rs_end - rs->rs_start, *max_alloc); - } else { - if (rs->rs_start - range_tree_max(segs) > - vdev_removal_max_span) { - /* - * Including this segment would cause us to - * copy a larger unneeded chunk than is allowed. - */ - break; - } else if (rs->rs_end - range_tree_min(segs) > - *max_alloc) { - /* - * This additional segment would extend past - * max_alloc. 
Rather than splitting this - * segment, leave it for the next mapping. - */ - break; - } else { - seg_length = rs->rs_end - rs->rs_start; - } - } - - range_tree_add(segs, rs->rs_start, seg_length); - range_tree_remove(svr->svr_allocd_segs, - rs->rs_start, seg_length); - } - - if (range_tree_is_empty(segs)) { - mutex_exit(&svr->svr_lock); - range_tree_destroy(segs); - return; - } - - if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { - dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, - svr, 0, ZFS_SPACE_CHECK_NONE, tx); - } - - svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); - - /* - * Note: this is the amount of *allocated* space - * that we are taking care of each txg. - */ - svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); - - mutex_exit(&svr->svr_lock); - - zio_alloc_list_t zal; - metaslab_trace_init(&zal); - uint64_t thismax = SPA_MAXBLOCKSIZE; - while (!range_tree_is_empty(segs)) { - int error = spa_vdev_copy_segment(vd, - segs, thismax, txg, vca, &zal); - - if (error == ENOSPC) { - /* - * Cut our segment in half, and don't try this - * segment size again this txg. Note that the - * allocation size must be aligned to the highest - * ashift in the pool, so that the allocation will - * not be padded out to a multiple of the ashift, - * which could cause us to think that this mapping - * is larger than we intended. - */ - ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); - ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); - uint64_t attempted = - MIN(range_tree_span(segs), thismax); - thismax = P2ROUNDUP(attempted / 2, - 1 << spa->spa_max_ashift); - /* - * The minimum-size allocation can not fail. - */ - ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); - *max_alloc = attempted - (1 << spa->spa_max_ashift); - } else { - ASSERT0(error); - - /* - * We've performed an allocation, so reset the - * alloc trace list. - */ - metaslab_trace_fini(&zal); - metaslab_trace_init(&zal); - } - } - metaslab_trace_fini(&zal); - range_tree_destroy(segs); -} - -/* - * The removal thread operates in open context. It iterates over all - * allocated space in the vdev, by loading each metaslab's spacemap. - * For each contiguous segment of allocated space (capping the segment - * size at SPA_MAXBLOCKSIZE), we: - * - Allocate space for it on another vdev. - * - Create a new mapping from the old location to the new location - * (as a record in svr_new_segments). - * - Initiate a logical read zio to get the data off the removing disk. - * - In the read zio's done callback, initiate a logical write zio to - * write it to the new vdev. - * Note that all of this will take effect when a particular TXG syncs. - * The sync thread ensures that all the phys reads and writes for the syncing - * TXG have completed (see spa_txg_zio) and writes the new mappings to disk - * (see vdev_mapping_sync()). 
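When an allocation fails with ENOSPC, the copy loop above halves the attempted segment size and rounds it up to the pool's largest ashift, so the allocated size always matches the mapping entry exactly. A worked sketch of one back-off step (P2ROUNDUP as in the upstream headers; the byte values are only examples, with a 16M cap standing in for SPA_MAXBLOCKSIZE):

#include <stdint.h>
#include <stdio.h>

#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))
#define MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t span = 3 * 1024 * 1024;	/* remaining segment span */
	uint64_t thismax = 16ULL << 20;		/* SPA_MAXBLOCKSIZE-like cap */
	uint64_t ashift = 12;			/* 4K sectors */

	/* One ENOSPC back-off step, as described in the comment above. */
	uint64_t attempted = MIN(span, thismax);
	thismax = P2ROUNDUP(attempted / 2, (uint64_t)1 << ashift);
	printf("attempted %llu -> next try %llu\n",
	    (unsigned long long)attempted, (unsigned long long)thismax);
	return (0);
}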
- */ -static void -spa_vdev_remove_thread(void *arg) -{ - spa_t *spa = arg; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_copy_arg_t vca; - uint64_t max_alloc = zfs_remove_max_segment; - uint64_t last_txg = 0; - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); - - ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); - ASSERT(vdev_is_concrete(vd)); - ASSERT(vd->vdev_removing); - ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); - ASSERT(vim != NULL); - - mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); - vca.vca_outstanding_bytes = 0; - - mutex_enter(&svr->svr_lock); - - /* - * Start from vim_max_offset so we pick up where we left off - * if we are restarting the removal after opening the pool. - */ - uint64_t msi; - for (msi = start_offset >> vd->vdev_ms_shift; - msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { - metaslab_t *msp = vd->vdev_ms[msi]; - ASSERT3U(msi, <=, vd->vdev_ms_count); - - ASSERT0(range_tree_space(svr->svr_allocd_segs)); - - mutex_enter(&msp->ms_sync_lock); - mutex_enter(&msp->ms_lock); - - /* - * Assert nothing in flight -- ms_*tree is empty. - */ - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(range_tree_space(msp->ms_allocating[i])); - } - - /* - * If the metaslab has ever been allocated from (ms_sm!=NULL), - * read the allocated segments from the space map object - * into svr_allocd_segs. Since we do this while holding - * svr_lock and ms_sync_lock, concurrent frees (which - * would have modified the space map) will wait for us - * to finish loading the spacemap, and then take the - * appropriate action (see free_from_removing_vdev()). - */ - if (msp->ms_sm != NULL) { - VERIFY0(space_map_load(msp->ms_sm, - svr->svr_allocd_segs, SM_ALLOC)); - - range_tree_walk(msp->ms_freeing, - range_tree_remove, svr->svr_allocd_segs); - - /* - * When we are resuming from a paused removal (i.e. - * when importing a pool with a removal in progress), - * discard any state that we have already processed. - */ - range_tree_clear(svr->svr_allocd_segs, 0, start_offset); - } - mutex_exit(&msp->ms_lock); - mutex_exit(&msp->ms_sync_lock); - - vca.vca_msp = msp; - zfs_dbgmsg("copying %llu segments for metaslab %llu", - avl_numnodes(&svr->svr_allocd_segs->rt_root), - msp->ms_id); - - while (!svr->svr_thread_exit && - !range_tree_is_empty(svr->svr_allocd_segs)) { - - mutex_exit(&svr->svr_lock); - - /* - * We need to periodically drop the config lock so that - * writers can get in. Additionally, we can't wait - * for a txg to sync while holding a config lock - * (since a waiting writer could cause a 3-way deadlock - * with the sync thread, which also gets a config - * lock for reader). So we can't hold the config lock - * while calling dmu_tx_assign(). - */ - spa_config_exit(spa, SCL_CONFIG, FTAG); - - /* - * This delay will pause the removal around the point - * specified by zfs_remove_max_bytes_pause. We do this - * solely from the test suite or during debugging. 
- */ - uint64_t bytes_copied = - spa->spa_removing_phys.sr_copied; - for (int i = 0; i < TXG_SIZE; i++) - bytes_copied += svr->svr_bytes_done[i]; - while (zfs_remove_max_bytes_pause <= bytes_copied && - !svr->svr_thread_exit) - delay(hz); - - mutex_enter(&vca.vca_lock); - while (vca.vca_outstanding_bytes > - zfs_remove_max_copy_bytes) { - cv_wait(&vca.vca_cv, &vca.vca_lock); - } - mutex_exit(&vca.vca_lock); - - dmu_tx_t *tx = - dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - uint64_t txg = dmu_tx_get_txg(tx); - - /* - * Reacquire the vdev_config lock. The vdev_t - * that we're removing may have changed, e.g. due - * to a vdev_attach or vdev_detach. - */ - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - vd = vdev_lookup_top(spa, svr->svr_vdev_id); - - if (txg != last_txg) - max_alloc = zfs_remove_max_segment; - last_txg = txg; - - spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); - - dmu_tx_commit(tx); - mutex_enter(&svr->svr_lock); - } - } - - mutex_exit(&svr->svr_lock); - - spa_config_exit(spa, SCL_CONFIG, FTAG); - - /* - * Wait for all copies to finish before cleaning up the vca. - */ - txg_wait_synced(spa->spa_dsl_pool, 0); - ASSERT0(vca.vca_outstanding_bytes); - - mutex_destroy(&vca.vca_lock); - cv_destroy(&vca.vca_cv); - - if (svr->svr_thread_exit) { - mutex_enter(&svr->svr_lock); - range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); - svr->svr_thread = NULL; - cv_broadcast(&svr->svr_cv); - mutex_exit(&svr->svr_lock); - } else { - ASSERT0(range_tree_space(svr->svr_allocd_segs)); - vdev_remove_complete(spa); - } - thread_exit(); -} - -void -spa_vdev_remove_suspend(spa_t *spa) -{ - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - - if (svr == NULL) - return; - - mutex_enter(&svr->svr_lock); - svr->svr_thread_exit = B_TRUE; - while (svr->svr_thread != NULL) - cv_wait(&svr->svr_cv, &svr->svr_lock); - svr->svr_thread_exit = B_FALSE; - mutex_exit(&svr->svr_lock); -} - -/* ARGSUSED */ -static int -spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - if (spa->spa_vdev_removal == NULL) - return (ESRCH); - return (0); -} - -/* - * Cancel a removal by freeing all entries from the partial mapping - * and marking the vdev as no longer being removing. 
- */ -/* ARGSUSED */ -static void -spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - objset_t *mos = spa->spa_meta_objset; - - ASSERT3P(svr->svr_thread, ==, NULL); - - spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); - if (vdev_obsolete_counts_are_precise(vd)) { - spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); - } - - if (vdev_obsolete_sm_object(vd) != 0) { - ASSERT(vd->vdev_obsolete_sm != NULL); - ASSERT3U(vdev_obsolete_sm_object(vd), ==, - space_map_object(vd->vdev_obsolete_sm)); - - space_map_free(vd->vdev_obsolete_sm, tx); - VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, - VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); - space_map_close(vd->vdev_obsolete_sm); - vd->vdev_obsolete_sm = NULL; - spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); - } - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT(list_is_empty(&svr->svr_new_segments[i])); - ASSERT3U(svr->svr_max_offset_to_sync[i], <=, - vdev_indirect_mapping_max_offset(vim)); - } - - for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { - metaslab_t *msp = vd->vdev_ms[msi]; - - if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) - break; - - ASSERT0(range_tree_space(svr->svr_allocd_segs)); - - mutex_enter(&msp->ms_lock); - - /* - * Assert nothing in flight -- ms_*tree is empty. - */ - for (int i = 0; i < TXG_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_allocating[i])); - for (int i = 0; i < TXG_DEFER_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_defer[i])); - ASSERT0(range_tree_space(msp->ms_freed)); - - if (msp->ms_sm != NULL) { - mutex_enter(&svr->svr_lock); - VERIFY0(space_map_load(msp->ms_sm, - svr->svr_allocd_segs, SM_ALLOC)); - range_tree_walk(msp->ms_freeing, - range_tree_remove, svr->svr_allocd_segs); - - /* - * Clear everything past what has been synced, - * because we have not allocated mappings for it yet. - */ - uint64_t syncd = vdev_indirect_mapping_max_offset(vim); - uint64_t sm_end = msp->ms_sm->sm_start + - msp->ms_sm->sm_size; - if (sm_end > syncd) - range_tree_clear(svr->svr_allocd_segs, - syncd, sm_end - syncd); - - mutex_exit(&svr->svr_lock); - } - mutex_exit(&msp->ms_lock); - - mutex_enter(&svr->svr_lock); - range_tree_vacate(svr->svr_allocd_segs, - free_mapped_segment_cb, vd); - mutex_exit(&svr->svr_lock); - } - - /* - * Note: this must happen after we invoke free_mapped_segment_cb, - * because it adds to the obsolete_segments. 
- */ - range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); - - ASSERT3U(vic->vic_mapping_object, ==, - vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); - vdev_indirect_mapping_close(vd->vdev_indirect_mapping); - vd->vdev_indirect_mapping = NULL; - vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); - vic->vic_mapping_object = 0; - - ASSERT3U(vic->vic_births_object, ==, - vdev_indirect_births_object(vd->vdev_indirect_births)); - vdev_indirect_births_close(vd->vdev_indirect_births); - vd->vdev_indirect_births = NULL; - vdev_indirect_births_free(mos, vic->vic_births_object, tx); - vic->vic_births_object = 0; - - /* - * We may have processed some frees from the removing vdev in this - * txg, thus increasing svr_bytes_done; discard that here to - * satisfy the assertions in spa_vdev_removal_destroy(). - * Note that future txg's can not have any bytes_done, because - * future TXG's are only modified from open context, and we have - * already shut down the copying thread. - */ - svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; - spa_finish_removal(spa, DSS_CANCELED, tx); - - vd->vdev_removing = B_FALSE; - vdev_config_dirty(vd); - - zfs_dbgmsg("canceled device removal for vdev %llu in %llu", - vd->vdev_id, dmu_tx_get_txg(tx)); - spa_history_log_internal(spa, "vdev remove canceled", tx, - "%s vdev %llu %s", spa_name(spa), - vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); -} - -int -spa_vdev_remove_cancel(spa_t *spa) -{ - spa_vdev_remove_suspend(spa); - - if (spa->spa_vdev_removal == NULL) - return (ESRCH); - - uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; - - int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, - spa_vdev_remove_cancel_sync, NULL, 0, - ZFS_SPACE_CHECK_EXTRA_RESERVED); - - if (error == 0) { - spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); - vdev_t *vd = vdev_lookup_top(spa, vdid); - metaslab_group_activate(vd->vdev_mg); - spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); - } - - return (error); -} - -void -svr_sync(spa_t *spa, dmu_tx_t *tx) -{ - spa_vdev_removal_t *svr = spa->spa_vdev_removal; - int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; - - /* - * This check is necessary so that we do not dirty the - * DIRECTORY_OBJECT via spa_sync_removing_state() when there - * is nothing to do. Dirtying it every time would prevent us - * from syncing-to-convergence. - */ - if (svr->svr_bytes_done[txgoff] == 0) - return; - - /* - * Update progress accounting. - */ - spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; - svr->svr_bytes_done[txgoff] = 0; - - spa_sync_removing_state(spa, tx); -} - -static void -vdev_remove_make_hole_and_free(vdev_t *vd) -{ - uint64_t id = vd->vdev_id; - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - boolean_t last_vdev = (id == (rvd->vdev_children - 1)); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - vdev_free(vd); - - if (last_vdev) { - vdev_compact_children(rvd); - } else { - vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); - vdev_add_child(rvd, vd); - } - vdev_config_dirty(rvd); - - /* - * Reassess the health of our root vdev. - */ - vdev_reopen(rvd); -} - -/* - * Remove a log device. The config lock is held for the specified TXG. 
- */ -static int -spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) -{ - metaslab_group_t *mg = vd->vdev_mg; - spa_t *spa = vd->vdev_spa; - int error = 0; - - ASSERT(vd->vdev_islog); - ASSERT(vd == vd->vdev_top); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - /* - * Stop allocating from this vdev. - */ - metaslab_group_passivate(mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - - /* - * Evacuate the device. We don't hold the config lock as - * writer since we need to do I/O but we do keep the - * spa_namespace_lock held. Once this completes the device - * should no longer have any blocks allocated on it. - */ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (vd->vdev_stat.vs_alloc != 0) - error = spa_reset_logs(spa); - - *txg = spa_vdev_config_enter(spa); - - if (error != 0) { - metaslab_group_activate(mg); - return (error); - } - ASSERT0(vd->vdev_stat.vs_alloc); - - /* - * The evacuation succeeded. Remove any remaining MOS metadata - * associated with this vdev, and wait for these changes to sync. - */ - vd->vdev_removing = B_TRUE; - - vdev_dirty_leaves(vd, VDD_DTL, *txg); - vdev_config_dirty(vd); - - vdev_metaslab_fini(vd); - - spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id, - (vd->vdev_path != NULL) ? vd->vdev_path : "-"); - - /* Make sure these changes are sync'ed */ - spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); - - /* Stop initializing */ - (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); - - *txg = spa_vdev_config_enter(spa); - - sysevent_t *ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_DEV); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); - - /* The top ZAP should have been destroyed by vdev_remove_empty. */ - ASSERT0(vd->vdev_top_zap); - /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ - ASSERT0(vd->vdev_leaf_zap); - - (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); - - if (list_link_active(&vd->vdev_state_dirty_node)) - vdev_state_clean(vd); - if (list_link_active(&vd->vdev_config_dirty_node)) - vdev_config_clean(vd); - - ASSERT0(vd->vdev_stat.vs_alloc); - - /* - * Clean up the vdev namespace. - */ - vdev_remove_make_hole_and_free(vd); - - if (ev != NULL) - spa_event_post(ev); - - return (0); -} - -static int -spa_vdev_remove_top_check(vdev_t *vd) -{ - spa_t *spa = vd->vdev_spa; - - if (vd != vd->vdev_top) - return (SET_ERROR(ENOTSUP)); - - if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) - return (SET_ERROR(ENOTSUP)); - - /* available space in the pool's normal class */ - uint64_t available = dsl_dir_space_available( - spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); - - metaslab_class_t *mc = vd->vdev_mg->mg_class; - - /* - * When removing a vdev from an allocation class that has - * remaining vdevs, include available space from the class. - */ - if (mc != spa_normal_class(spa) && mc->mc_groups > 1) { - uint64_t class_avail = metaslab_class_get_space(mc) - - metaslab_class_get_alloc(mc); - - /* add class space, adjusted for overhead */ - available += (class_avail * 94) / 100; - } - - /* - * There has to be enough free space to remove the - * device and leave double the "slop" space (i.e. we - * must leave at least 3% of the pool free, in addition to - * the normal slop space). 
- */ - if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { - return (SET_ERROR(ENOSPC)); - } - - /* - * There can not be a removal in progress. - */ - if (spa->spa_removing_phys.sr_state == DSS_SCANNING) - return (SET_ERROR(EBUSY)); - - /* - * The device must have all its data. - */ - if (!vdev_dtl_empty(vd, DTL_MISSING) || - !vdev_dtl_empty(vd, DTL_OUTAGE)) - return (SET_ERROR(EBUSY)); - - /* - * The device must be healthy. - */ - if (!vdev_readable(vd)) - return (SET_ERROR(EIO)); - - /* - * All vdevs in normal class must have the same ashift. - */ - if (spa->spa_max_ashift != spa->spa_min_ashift) { - return (SET_ERROR(EINVAL)); - } - - /* - * All vdevs in normal class must have the same ashift - * and not be raidz. - */ - vdev_t *rvd = spa->spa_root_vdev; - int num_indirect = 0; - for (uint64_t id = 0; id < rvd->vdev_children; id++) { - vdev_t *cvd = rvd->vdev_child[id]; - if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) - ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); - if (cvd->vdev_ops == &vdev_indirect_ops) - num_indirect++; - if (!vdev_is_concrete(cvd)) - continue; - if (cvd->vdev_ops == &vdev_raidz_ops) - return (SET_ERROR(EINVAL)); - /* - * Need the mirror to be mirror of leaf vdevs only - */ - if (cvd->vdev_ops == &vdev_mirror_ops) { - for (uint64_t cid = 0; - cid < cvd->vdev_children; cid++) { - vdev_t *tmp = cvd->vdev_child[cid]; - if (!tmp->vdev_ops->vdev_op_leaf) - return (SET_ERROR(EINVAL)); - } - } - } - - return (0); -} - -/* - * Initiate removal of a top-level vdev, reducing the total space in the pool. - * The config lock is held for the specified TXG. Once initiated, - * evacuation of all allocated space (copying it to other vdevs) happens - * in the background (see spa_vdev_remove_thread()), and can be canceled - * (see spa_vdev_remove_cancel()). If successful, the vdev will - * be transformed to an indirect vdev (see spa_vdev_remove_complete()). - */ -static int -spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) -{ - spa_t *spa = vd->vdev_spa; - int error; - - /* - * Check for errors up-front, so that we don't waste time - * passivating the metaslab group and clearing the ZIL if there - * are errors. - */ - error = spa_vdev_remove_top_check(vd); - if (error != 0) - return (error); - - /* - * Stop allocating from this vdev. Note that we must check - * that this is not the only device in the pool before - * passivating, otherwise we will not be able to make - * progress because we can't allocate from any vdevs. - * The above check for sufficient free space serves this - * purpose. - */ - metaslab_group_t *mg = vd->vdev_mg; - metaslab_group_passivate(mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - - /* - * We must ensure that no "stubby" log blocks are allocated - * on the device to be removed. These blocks could be - * written at any time, including while we are in the middle - * of copying them. - */ - error = spa_reset_logs(spa); - - /* - * We stop any initializing that is currently in progress but leave - * the state as "active". This will allow the initializing to resume - * if the removal is canceled sometime later. - */ - vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); - - *txg = spa_vdev_config_enter(spa); - - /* - * Things might have changed while the config lock was dropped - * (e.g. space usage). Check for errors again. 
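The space check above refuses a top-level removal unless the pool can absorb the evacuated data and still keep an extra slop reserve free. A rough worked example of that comparison (the slop value is a stand-in for spa_get_slop_space(), not its actual formula):

#include <stdint.h>
#include <stdio.h>

/* Sketch: would removing a vdev leave enough room?  Values are examples. */
int
main(void)
{
	uint64_t available = 900ULL << 30;	/* free space seen by the pool */
	uint64_t vs_dspace = 800ULL << 30;	/* data to evacuate from the vdev */
	uint64_t slop = 64ULL << 30;		/* stand-in for spa_get_slop_space() */

	if (available < vs_dspace + slop)
		printf("ENOSPC: removal refused\n");
	else
		printf("removal allowed\n");
	return (0);
}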
- */ - if (error == 0) - error = spa_vdev_remove_top_check(vd); - - if (error != 0) { - metaslab_group_activate(mg); - spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); - return (error); - } - - vd->vdev_removing = B_TRUE; - - vdev_dirty_leaves(vd, VDD_DTL, *txg); - vdev_config_dirty(vd); - dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); - dsl_sync_task_nowait(spa->spa_dsl_pool, - vdev_remove_initiate_sync, - (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); - dmu_tx_commit(tx); - - return (0); -} - -/* - * Remove a device from the pool. - * - * Removing a device from the vdev namespace requires several steps - * and can take a significant amount of time. As a result we use - * the spa_vdev_config_[enter/exit] functions which allow us to - * grab and release the spa_config_lock while still holding the namespace - * lock. During each step the configuration is synced out. - */ -int -spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) -{ - vdev_t *vd; - nvlist_t **spares, **l2cache, *nv; - uint64_t txg = 0; - uint_t nspares, nl2cache; - int error = 0; - boolean_t locked = MUTEX_HELD(&spa_namespace_lock); - sysevent_t *ev = NULL; - - ASSERT(spa_writeable(spa)); - - if (!locked) - txg = spa_vdev_enter(spa); - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { - error = (spa_has_checkpoint(spa)) ? - ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; - - if (!locked) - return (spa_vdev_exit(spa, NULL, txg, error)); - - return (error); - } - - vd = spa_lookup_by_guid(spa, guid, B_FALSE); - - if (spa->spa_spares.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && - (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { - /* - * Only remove the hot spare if it's not currently in use - * in this pool. - */ - if (vd == NULL || unspare) { - char *nvstr = fnvlist_lookup_string(nv, - ZPOOL_CONFIG_PATH); - spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev (%s) %s", spa_name(spa), - VDEV_TYPE_SPARE, nvstr); - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - } else { - error = SET_ERROR(EBUSY); - } - } else if (spa->spa_l2cache.sav_vdevs != NULL && - nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && - (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { - char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); - spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr); - /* - * Cache devices can always be removed. - */ - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); - spa_vdev_remove_aux(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); - spa_load_l2cache(spa); - spa->spa_l2cache.sav_sync = B_TRUE; - } else if (vd != NULL && vd->vdev_islog) { - ASSERT(!locked); - error = spa_vdev_remove_log(vd, &txg); - } else if (vd != NULL) { - ASSERT(!locked); - error = spa_vdev_remove_top(vd, &txg); - } else { - /* - * There is no vdev of any kind with the specified guid. 
- */ - error = SET_ERROR(ENOENT); - } - - if (!locked) - error = spa_vdev_exit(spa, NULL, txg, error); - - if (ev != NULL) { - if (error != 0) { - spa_event_discard(ev); - } else { - spa_event_post(ev); - } - } - - return (error); -} - -int -spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) -{ - prs->prs_state = spa->spa_removing_phys.sr_state; - - if (prs->prs_state == DSS_NONE) - return (SET_ERROR(ENOENT)); - - prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; - prs->prs_start_time = spa->spa_removing_phys.sr_start_time; - prs->prs_end_time = spa->spa_removing_phys.sr_end_time; - prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; - prs->prs_copied = spa->spa_removing_phys.sr_copied; - - if (spa->spa_vdev_removal != NULL) { - for (int i = 0; i < TXG_SIZE; i++) { - prs->prs_copied += - spa->spa_vdev_removal->svr_bytes_done[i]; - } - } - - prs->prs_mapping_memory = 0; - uint64_t indirect_vdev_id = - spa->spa_removing_phys.sr_prev_indirect_vdev; - while (indirect_vdev_id != -1) { - vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; - vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); - prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); - indirect_vdev_id = vic->vic_prev_indirect_vdev; - } - - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include - -/* - * Virtual device vector for the pool's root vdev. - */ - -static uint64_t -vdev_root_core_tvds(vdev_t *vd) -{ - uint64_t tvds = 0; - - for (uint64_t c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - if (!cvd->vdev_ishole && !cvd->vdev_islog && - cvd->vdev_ops != &vdev_indirect_ops) { - tvds++; - } - } - - return (tvds); -} - -/* - * We should be able to tolerate one failure with absolutely no damage - * to our metadata. Two failures will take out space maps, a bunch of - * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy - * place to live. When we get smarter, we can liberalize this policy. - * e.g. If we haven't lost two consecutive top-level vdevs, then we are - * probably fine. Adding bean counters during alloc/free can make this - * future guesswork more accurate. 
- */ -static boolean_t -too_many_errors(vdev_t *vd, uint64_t numerrors) -{ - uint64_t tvds; - - if (numerrors == 0) - return (B_FALSE); - - tvds = vdev_root_core_tvds(vd); - ASSERT3U(numerrors, <=, tvds); - - if (numerrors == tvds) - return (B_TRUE); - - return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa)); -} - -static int -vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - spa_t *spa = vd->vdev_spa; - int lasterror = 0; - int numerrors = 0; - - if (vd->vdev_children == 0) { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - vdev_open_children(vd); - - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - if (cvd->vdev_open_error && !cvd->vdev_islog) { - lasterror = cvd->vdev_open_error; - numerrors++; - } - } - - if (spa_load_state(spa) != SPA_LOAD_NONE) - spa_set_missing_tvds(spa, numerrors); - - if (too_many_errors(vd, numerrors)) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; - return (lasterror); - } - - *asize = 0; - *max_asize = 0; - *logical_ashift = 0; - *physical_ashift = 0; - - return (0); -} - -static void -vdev_root_close(vdev_t *vd) -{ - for (int c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); -} - -static void -vdev_root_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (too_many_errors(vd, faulted)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - } else if (degraded || faulted) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - } else { - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); - } -} - -vdev_ops_t vdev_root_ops = { - vdev_root_open, - vdev_root_close, - vdev_default_asize, - NULL, /* io_start - not applicable to the root */ - NULL, /* io_done - not applicable to the root */ - vdev_root_state_change, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_ROOT, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ -}; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ /dev/null @@ -1,1378 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - */ - -/* - * This file contains the top half of the zfs directory structure - * implementation. The bottom half is in zap_leaf.c. - * - * The zdir is an extendable hash data structure. 
There is a table of - * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are - * each a constant size and hold a variable number of directory entries. - * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. - * - * The pointer table holds a power of 2 number of pointers. - * (1<zd_data->zd_phys->zd_prefix_len). The bucket pointed to - * by the pointer at index i in the table holds entries whose hash value - * has a zd_prefix_len - bit prefix - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object - * (all leaf blocks) when we start iterating over it. - * - * For zap_cursor_init(), the callers all intend to iterate through all the - * entries. There are a few cases where an error (typically i/o error) could - * cause it to bail out early. - * - * For zap_cursor_init_serialized(), there are callers that do the iteration - * outside of ZFS. Typically they would iterate over everything, but we - * don't have control of that. E.g. zfs_ioc_snapshot_list_next(), - * zcp_snapshots_iter(), and other iterators over things in the MOS - these - * are called by /sbin/zfs and channel programs. The other example is - * zfs_readdir() which iterates over directory entries for the getdents() - * syscall. /sbin/ls iterates to the end (unless it receives a signal), but - * userland doesn't have to. - * - * Given that the ZAP entries aren't returned in a specific order, the only - * legitimate use cases for partial iteration would be: - * - * 1. Pagination: e.g. you only want to display 100 entries at a time, so you - * get the first 100 and then wait for the user to hit "next page", which - * they may never do). - * - * 2. You want to know if there are more than X entries, without relying on - * the zfs-specific implementation of the directory's st_size (which is - * the number of entries). 
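The iteration interface this comment refers to is the zap_cursor_t API used by the in-tree callers (the same loop shape appears in zap_value_search() and zap_join() further down in this file); a caller simply walks every entry in hash order. A condensed sketch of that consumer-side loop, following the pattern those callers use:

/* Walk every entry of a ZAP object; mirrors the loop used by zap_join(). */
static int
walk_zap(objset_t *os, uint64_t zapobj)
{
	zap_cursor_t zc;
	zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
	int err;

	for (zap_cursor_init(&zc, os, zapobj);
	    (err = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		/* za->za_name and za->za_first_integer describe this entry */
	}
	zap_cursor_fini(&zc);
	kmem_free(za, sizeof (*za));

	/* ENOENT simply means the cursor ran off the end of the object. */
	return (err == ENOENT ? 0 : err);
}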
- */ -boolean_t zap_iterate_prefetch = B_TRUE; - -int fzap_default_block_shift = 14; /* 16k blocksize */ - -extern inline zap_phys_t *zap_f_phys(zap_t *zap); - -static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); - -void -fzap_byteswap(void *vbuf, size_t size) -{ - uint64_t block_type = *(uint64_t *)vbuf; - - if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) - zap_leaf_byteswap(vbuf, size); - else { - /* it's a ptrtbl block */ - byteswap_uint64_array(vbuf, size); - } -} - -void -fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) -{ - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - zap->zap_ismicro = FALSE; - - zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; - zap->zap_dbu.dbu_evict_func_async = NULL; - - mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); - zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; - - zap_phys_t *zp = zap_f_phys(zap); - /* - * explicitly zero it since it might be coming from an - * initialized microzap - */ - bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); - zp->zap_block_type = ZBT_HEADER; - zp->zap_magic = ZAP_MAGIC; - - zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); - - zp->zap_freeblk = 2; /* block 1 will be the first leaf */ - zp->zap_num_leafs = 1; - zp->zap_num_entries = 0; - zp->zap_salt = zap->zap_salt; - zp->zap_normflags = zap->zap_normflags; - zp->zap_flags = flags; - - /* block 1 will be the first leaf */ - for (int i = 0; i < (1<zap_ptrtbl.zt_shift); i++) - ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; - - /* - * set up block 1 - the first leaf - */ - dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - 1<l_dbuf = db; - - zap_leaf_init(l, zp->zap_normflags != 0); - - kmem_free(l, sizeof (zap_leaf_t)); - dmu_buf_rele(db, FTAG); -} - -static int -zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) -{ - if (RW_WRITE_HELD(&zap->zap_rwlock)) - return (1); - if (rw_tryupgrade(&zap->zap_rwlock)) { - dmu_buf_will_dirty(zap->zap_dbuf, tx); - return (1); - } - return (0); -} - -/* - * Generic routines for dealing with the pointer & cookie tables. - */ - -static int -zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, - void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), - dmu_tx_t *tx) -{ - uint64_t newblk; - int bs = FZAP_BLOCK_SHIFT(zap); - int hepb = 1<<(bs-4); - /* hepb = half the number of entries in a block */ - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - ASSERT(tbl->zt_numblks > 0); - - if (tbl->zt_nextblk != 0) { - newblk = tbl->zt_nextblk; - } else { - newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); - tbl->zt_nextblk = newblk; - ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, - tbl->zt_blk << bs, tbl->zt_numblks << bs, - ZIO_PRIORITY_SYNC_READ); - } - - /* - * Copy the ptrtbl from the old to new location. 
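The copy doubles the pointer table: each pointer from the old table lands twice in the new one, in adjacent slots, which is what the zap_ptrtbl_transfer() helper further down implements. A stand-alone sketch of that duplication over a plain array:

/*
 * Double a pointer table: each source pointer is duplicated into two
 * adjacent slots of the destination, which must hold 2*n entries.
 */
static void
ptrtbl_double(const uint64_t *src, uint64_t *dst, int n)
{
	for (int i = 0; i < n; i++) {
		dst[2 * i + 0] = src[i];
		dst[2 * i + 1] = src[i];
	}
}

Duplicating instead of rehashing keeps every existing prefix-to-leaf mapping valid while the effective prefix length grows by one bit; leaves are then split lazily as they fill.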
- */ - - uint64_t b = tbl->zt_blks_copied; - dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - - /* first half of entries in old[b] go to new[2*b+0] */ - dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); - dmu_buf_will_dirty(db_new, tx); - transfer_func(db_old->db_data, db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); - dmu_buf_will_dirty(db_new, tx); - transfer_func((uint64_t *)db_old->db_data + hepb, - db_new->db_data, hepb); - dmu_buf_rele(db_new, FTAG); - - dmu_buf_rele(db_old, FTAG); - - tbl->zt_blks_copied++; - - dprintf("copied block %llu of %llu\n", - tbl->zt_blks_copied, tbl->zt_numblks); - - if (tbl->zt_blks_copied == tbl->zt_numblks) { - (void) dmu_free_range(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); - - tbl->zt_blk = newblk; - tbl->zt_numblks *= 2; - tbl->zt_shift++; - tbl->zt_nextblk = 0; - tbl->zt_blks_copied = 0; - - dprintf("finished; numblocks now %llu (%lluk entries)\n", - tbl->zt_numblks, 1<<(tbl->zt_shift-10)); - } - - return (0); -} - -static int -zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, - dmu_tx_t *tx) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(tbl->zt_blk != 0); - - dprintf("storing %llx at index %llx\n", val, idx); - - uint64_t blk = idx >> (bs-3); - uint64_t off = idx & ((1<<(bs-3))-1); - - dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - dmu_buf_will_dirty(db, tx); - - if (tbl->zt_nextblk != 0) { - uint64_t idx2 = idx * 2; - uint64_t blk2 = idx2 >> (bs-3); - uint64_t off2 = idx2 & ((1<<(bs-3))-1); - dmu_buf_t *db2; - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, - DMU_READ_NO_PREFETCH); - if (err != 0) { - dmu_buf_rele(db, FTAG); - return (err); - } - dmu_buf_will_dirty(db2, tx); - ((uint64_t *)db2->db_data)[off2] = val; - ((uint64_t *)db2->db_data)[off2+1] = val; - dmu_buf_rele(db2, FTAG); - } - - ((uint64_t *)db->db_data)[off] = val; - dmu_buf_rele(db, FTAG); - - return (0); -} - -static int -zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - uint64_t blk = idx >> (bs-3); - uint64_t off = idx & ((1<<(bs-3))-1); - - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. - */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, - (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); - if (err != 0) - return (err); - *valp = ((uint64_t *)db->db_data)[off]; - dmu_buf_rele(db, FTAG); - - if (tbl->zt_nextblk != 0) { - /* - * read the nextblk for the sake of i/o error checking, - * so that zap_table_load() will catch errors for - * zap_table_store. 
- */ - blk = (idx*2) >> (bs-3); - - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, - (tbl->zt_nextblk + blk) << bs, FTAG, &db, - DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); - if (err == 0) - dmu_buf_rele(db, FTAG); - } - return (err); -} - -/* - * Routines for growing the ptrtbl. - */ - -static void -zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) -{ - for (int i = 0; i < n; i++) { - uint64_t lb = src[i]; - dst[2 * i + 0] = lb; - dst[2 * i + 1] = lb; - } -} - -static int -zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) -{ - /* - * The pointer table should never use more hash bits than we - * have (otherwise we'd be using useless zero bits to index it). - * If we are within 2 bits of running out, stop growing, since - * this is already an aberrant condition. - */ - if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) - return (SET_ERROR(ENOSPC)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - /* - * We are outgrowing the "embedded" ptrtbl (the one - * stored in the header block). Give it its own entire - * block, which will double the size of the ptrtbl. - */ - ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, - ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); - - uint64_t newblk = zap_allocate_blocks(zap, 1); - dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, - DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); - dmu_buf_will_dirty(db_new, tx); - zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - dmu_buf_rele(db_new, FTAG); - - zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; - zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; - zap_f_phys(zap)->zap_ptrtbl.zt_shift++; - - ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << - (FZAP_BLOCK_SHIFT(zap)-3)); - - return (0); - } else { - return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, - zap_ptrtbl_transfer, tx)); - } -} - -static void -zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) -{ - dmu_buf_will_dirty(zap->zap_dbuf, tx); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); - ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); - zap_f_phys(zap)->zap_num_entries += delta; - mutex_exit(&zap->zap_f.zap_num_entries_mtx); -} - -static uint64_t -zap_allocate_blocks(zap_t *zap, int nblocks) -{ - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - uint64_t newblk = zap_f_phys(zap)->zap_freeblk; - zap_f_phys(zap)->zap_freeblk += nblocks; - return (newblk); -} - -static void -zap_leaf_evict_sync(void *dbu) -{ - zap_leaf_t *l = dbu; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - -static zap_leaf_t * -zap_create_leaf(zap_t *zap, dmu_tx_t *tx) -{ - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - rw_init(&l->l_rwlock, 0, 0, 0); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = zap_allocate_blocks(zap, 1); - l->l_dbuf = NULL; - - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, - DMU_READ_NO_PREFETCH)); - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); - dmu_buf_will_dirty(l->l_dbuf, tx); - - zap_leaf_init(l, zap->zap_normflags != 0); - - zap_f_phys(zap)->zap_num_leafs++; - - return (l); -} - -int -fzap_count(zap_t 
*zap, uint64_t *count) -{ - ASSERT(!zap->zap_ismicro); - mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ - *count = zap_f_phys(zap)->zap_num_entries; - mutex_exit(&zap->zap_f.zap_num_entries_mtx); - return (0); -} - -/* - * Routines for obtaining zap_leaf_t's - */ - -void -zap_put_leaf(zap_leaf_t *l) -{ - rw_exit(&l->l_rwlock); - dmu_buf_rele(l->l_dbuf, NULL); -} - -static zap_leaf_t * -zap_open_leaf(uint64_t blkid, dmu_buf_t *db) -{ - ASSERT(blkid != 0); - - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - rw_init(&l->l_rwlock, 0, 0, 0); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = blkid; - l->l_bs = highbit64(db->db_size) - 1; - l->l_dbuf = db; - - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); - - rw_exit(&l->l_rwlock); - if (winner != NULL) { - /* someone else set it first */ - zap_leaf_evict_sync(&l->l_dbu); - l = winner; - } - - /* - * lhr_pad was previously used for the next leaf in the leaf - * chain. There should be no chained leafs (as we have removed - * support for them). - */ - ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); - - /* - * There should be more hash entries than there can be - * chunks to put in the hash table - */ - ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); - - /* The chunks should begin at the end of the hash table */ - ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, - &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); - - /* The chunks should end at the end of the block */ - ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - - (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); - - return (l); -} - -static int -zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, - zap_leaf_t **lp) -{ - dmu_buf_t *db; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, - blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); - if (err != 0) - return (err); - - ASSERT3U(db->db_object, ==, zap->zap_object); - ASSERT3U(db->db_offset, ==, blkid << bs); - ASSERT3U(db->db_size, ==, 1 << bs); - ASSERT(blkid != 0); - - zap_leaf_t *l = dmu_buf_get_user(db); - - if (l == NULL) - l = zap_open_leaf(blkid, db); - - rw_enter(&l->l_rwlock, lt); - /* - * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, - * causing ASSERT below to fail. 
- */ - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - ASSERT3U(l->l_blkid, ==, blkid); - ASSERT3P(l->l_dbuf, ==, db); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - *lp = l; - return (0); -} - -static int -zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) -{ - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - ASSERT3U(idx, <, - (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); - *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); - return (0); - } else { - return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, - idx, valp)); - } -} - -static int -zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) -{ - ASSERT(tx != NULL); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { - ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; - return (0); - } else { - return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, - idx, blk, tx)); - } -} - -static int -zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) -{ - uint64_t blk; - - ASSERT(zap->zap_dbuf == NULL || - zap_f_phys(zap) == zap->zap_dbuf->db_data); - - /* Reality check for corrupt zap objects (leaf or header). */ - if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && - zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || - zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { - return (SET_ERROR(EIO)); - } - - uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - int err = zap_idx_to_blk(zap, idx, &blk); - if (err != 0) - return (err); - err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); - - ASSERT(err || - ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == - zap_leaf_phys(*lp)->l_hdr.lh_prefix); - return (err); -} - -static int -zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, - void *tag, dmu_tx_t *tx, zap_leaf_t **lp) -{ - zap_t *zap = zn->zn_zap; - uint64_t hash = zn->zn_hash; - int err; - int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - - ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - zap_leaf_phys(l)->l_hdr.lh_prefix); - - if (zap_tryupgradedir(zap, tx) == 0 || - old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { - /* We failed to upgrade, or need to grow the pointer table */ - objset_t *os = zap->zap_objset; - uint64_t object = zap->zap_object; - - zap_put_leaf(l); - zap_unlockdir(zap, tag); - err = zap_lockdir(os, object, tx, RW_WRITER, - FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return (err); - ASSERT(!zap->zap_ismicro); - - while (old_prefix_len == - zap_f_phys(zap)->zap_ptrtbl.zt_shift) { - err = zap_grow_ptrtbl(zap, tx); - if (err != 0) - return (err); - } - - err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); - - if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { - /* it split while our locks were down */ - *lp = l; - return (0); - } - } - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - zap_leaf_phys(l)->l_hdr.lh_prefix); - - int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - - (old_prefix_len + 1); - uint64_t sibling = - (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; - - /* check for i/o errors before doing zap_leaf_split */ - for (int i = 0; i < (1ULL << 
prefix_diff); i++) { - uint64_t blk; - err = zap_idx_to_blk(zap, sibling + i, &blk); - if (err != 0) - return (err); - ASSERT3U(blk, ==, l->l_blkid); - } - - zap_leaf_t *nl = zap_create_leaf(zap, tx); - zap_leaf_split(l, nl, zap->zap_normflags != 0); - - /* set sibling pointers */ - for (int i = 0; i < (1ULL << prefix_diff); i++) { - err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); - ASSERT0(err); /* we checked for i/o errors above */ - } - - if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { - /* we want the sibling */ - zap_put_leaf(l); - *lp = nl; - } else { - zap_put_leaf(nl); - *lp = l; - } - - return (0); -} - -static void -zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, - void *tag, dmu_tx_t *tx) -{ - zap_t *zap = zn->zn_zap; - int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && - zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); - - zap_put_leaf(l); - - if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { - /* - * We are in the middle of growing the pointer table, or - * this leaf will soon make us grow it. - */ - if (zap_tryupgradedir(zap, tx) == 0) { - objset_t *os = zap->zap_objset; - uint64_t zapobj = zap->zap_object; - - zap_unlockdir(zap, tag); - int err = zap_lockdir(os, zapobj, tx, - RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); - zap = zn->zn_zap; - if (err != 0) - return; - } - - /* could have finished growing while our locks were down */ - if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) - (void) zap_grow_ptrtbl(zap, tx); - } -} - -static int -fzap_checkname(zap_name_t *zn) -{ - if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) - return (SET_ERROR(ENAMETOOLONG)); - return (0); -} - -static int -fzap_checksize(uint64_t integer_size, uint64_t num_integers) -{ - /* Only integer sizes supported by C */ - switch (integer_size) { - case 1: - case 2: - case 4: - case 8: - break; - default: - return (SET_ERROR(EINVAL)); - } - - if (integer_size * num_integers > ZAP_MAXVALUELEN) - return (E2BIG); - - return (0); -} - -static int -fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) -{ - int err = fzap_checkname(zn); - if (err != 0) - return (err); - return (fzap_checksize(integer_size, num_integers)); -} - -/* - * Routines for manipulating attributes. 
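The lookup, add, update, and remove routines below all start from the same derivation: the top zt_shift bits of the entry's 64-bit hash select a pointer-table slot, and that slot names the leaf block to search. A minimal sketch of the indexing step (the real code expresses this with the ZAP_HASH_IDX() macro):

/* Take the top 'shift' bits of a 64-bit ZAP hash as the ptrtbl index. */
static uint64_t
hash_to_ptrtbl_idx(uint64_t hash, int shift)
{
	return (shift == 0 ? 0 : hash >> (64 - shift));
}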
- */ -int -fzap_lookup(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, void *buf, - char *realname, int rn_len, boolean_t *ncp) -{ - zap_leaf_t *l; - zap_entry_handle_t zeh; - - int err = fzap_checkname(zn); - if (err != 0) - return (err); - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - if ((err = fzap_checksize(integer_size, num_integers)) != 0) { - zap_put_leaf(l); - return (err); - } - - err = zap_entry_read(&zeh, integer_size, num_integers, buf); - (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); - if (ncp) { - *ncp = zap_entry_normalization_conflict(&zeh, - zn, NULL, zn->zn_zap); - } - } - - zap_put_leaf(l); - return (err); -} - -int -fzap_add_cd(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - zap_t *zap = zn->zn_zap; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - ASSERT(!zap->zap_ismicro); - ASSERT(fzap_check(zn, integer_size, num_integers) == 0); - - err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - err = SET_ERROR(EEXIST); - goto out; - } - if (err != ENOENT) - goto out; - - err = zap_entry_create(l, zn, cd, - integer_size, num_integers, val, &zeh); - - if (err == 0) { - zap_increment_num_entries(zap, 1, tx); - } else if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) - goto retry; - } - -out: - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); - return (err); -} - -int -fzap_add(zap_name_t *zn, - uint64_t integer_size, uint64_t num_integers, - const void *val, void *tag, dmu_tx_t *tx) -{ - int err = fzap_check(zn, integer_size, num_integers); - if (err != 0) - return (err); - - return (fzap_add_cd(zn, integer_size, num_integers, - val, ZAP_NEED_CD, tag, tx)); -} - -int -fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, - void *tag, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - boolean_t create; - zap_entry_handle_t zeh; - zap_t *zap = zn->zn_zap; - - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_check(zn, integer_size, num_integers); - if (err != 0) - return (err); - - err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); -retry: - err = zap_leaf_lookup(l, zn, &zeh); - create = (err == ENOENT); - ASSERT(err == 0 || err == ENOENT); - - if (create) { - err = zap_entry_create(l, zn, ZAP_NEED_CD, - integer_size, num_integers, val, &zeh); - if (err == 0) - zap_increment_num_entries(zap, 1, tx); - } else { - err = zap_entry_update(&zeh, integer_size, num_integers, val); - } - - if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tag, tx, &l); - zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) - goto retry; - } - - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); - return (err); -} - -int -fzap_length(zap_name_t *zn, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err != 0) - goto out; - - if (integer_size != 0) - *integer_size = zeh.zeh_integer_size; - if (num_integers != 0) - 
*num_integers = zeh.zeh_num_integers; -out: - zap_put_leaf(l); - return (err); -} - -int -fzap_remove(zap_name_t *zn, dmu_tx_t *tx) -{ - zap_leaf_t *l; - int err; - zap_entry_handle_t zeh; - - err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); - if (err != 0) - return (err); - err = zap_leaf_lookup(l, zn, &zeh); - if (err == 0) { - zap_entry_remove(&zeh); - zap_increment_num_entries(zn->zn_zap, -1, tx); - } - zap_put_leaf(l); - return (err); -} - -void -fzap_prefetch(zap_name_t *zn) -{ - uint64_t blk; - zap_t *zap = zn->zn_zap; - - uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, - zap_f_phys(zap)->zap_ptrtbl.zt_shift); - if (zap_idx_to_blk(zap, idx, &blk) != 0) - return; - int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, - ZIO_PRIORITY_SYNC_READ); -} - -/* - * Helper functions for consumers. - */ - -uint64_t -zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, - const char *name, dmu_tx_t *tx) -{ - return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx)); -} - -uint64_t -zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, - const char *name, int dnodesize, dmu_tx_t *tx) -{ - uint64_t new_obj; - - VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, - dnodesize, tx)) > 0); - VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, - tx)); - - return (new_obj); -} - -int -zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, - char *name) -{ - zap_cursor_t zc; - int err; - - if (mask == 0) - mask = -1ULL; - - zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); - for (zap_cursor_init(&zc, os, zapobj); - (err = zap_cursor_retrieve(&zc, za)) == 0; - zap_cursor_advance(&zc)) { - if ((za->za_first_integer & mask) == (value & mask)) { - (void) strcpy(name, za->za_name); - break; - } - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (*za)); - return (err); -} - -int -zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) -{ - zap_cursor_t zc; - int err = 0; - - zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); - for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, za) == 0; - (void) zap_cursor_advance(&zc)) { - if (za->za_integer_length != 8 || za->za_num_integers != 1) { - err = SET_ERROR(EINVAL); - break; - } - err = zap_add(os, intoobj, za->za_name, - 8, 1, &za->za_first_integer, tx); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (*za)); - return (err); -} - -int -zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - uint64_t value, dmu_tx_t *tx) -{ - zap_cursor_t zc; - int err = 0; - - zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); - for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, za) == 0; - (void) zap_cursor_advance(&zc)) { - if (za->za_integer_length != 8 || za->za_num_integers != 1) { - err = SET_ERROR(EINVAL); - break; - } - err = zap_add(os, intoobj, za->za_name, - 8, 1, &value, tx); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (*za)); - return (err); -} - -int -zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - dmu_tx_t *tx) -{ - zap_cursor_t zc; - int err = 0; - - zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); - for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, za) == 0; - (void) zap_cursor_advance(&zc)) { - uint64_t delta = 0; - - if (za->za_integer_length != 8 || za->za_num_integers != 1) { - err = SET_ERROR(EINVAL); - break; - } 
- - err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); - if (err != 0 && err != ENOENT) - break; - delta += za->za_first_integer; - err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); - if (err != 0) - break; - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (*za)); - return (err); -} - -int -zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); - return (zap_add(os, obj, name, 8, 1, &value, tx)); -} - -int -zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); - return (zap_remove(os, obj, name, tx)); -} - -int -zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); - return (zap_lookup(os, obj, name, 8, 1, &value)); -} - -int -zap_add_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_add(os, obj, name, 8, 1, &value, tx)); -} - -int -zap_update_int_key(objset_t *os, uint64_t obj, - uint64_t key, uint64_t value, dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_update(os, obj, name, 8, 1, &value, tx)); -} - -int -zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_lookup(os, obj, name, 8, 1, valuep)); -} - -int -zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, - dmu_tx_t *tx) -{ - uint64_t value = 0; - - if (delta == 0) - return (0); - - int err = zap_lookup(os, obj, name, 8, 1, &value); - if (err != 0 && err != ENOENT) - return (err); - value += delta; - if (value == 0) - err = zap_remove(os, obj, name, tx); - else - err = zap_update(os, obj, name, 8, 1, &value, tx); - return (err); -} - -int -zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx) -{ - char name[20]; - - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); - return (zap_increment(os, obj, name, delta, tx)); -} - -/* - * Routines for iterating over the attributes. - */ - -int -fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) -{ - int err = ENOENT; - zap_entry_handle_t zeh; - zap_leaf_t *l; - - /* retrieve the next entry at or after zc_hash/zc_cd */ - /* if no entry, return ENOENT */ - - /* - * If we are reading from the beginning, we're almost - * certain to iterate over the entire ZAP object. If there are - * multiple leaf blocks (freeblk > 2), prefetch the whole - * object, so that we read the leaf blocks concurrently. - * (Unless noprefetch was requested via zap_cursor_init_noprefetch()). 
- */ - if (zc->zc_hash == 0 && zap_iterate_prefetch && - zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, - zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), - ZIO_PRIORITY_ASYNC_READ); - } - - if (zc->zc_leaf && - (ZAP_HASH_IDX(zc->zc_hash, - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - -again: - if (zc->zc_leaf == NULL) { - err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, - &zc->zc_leaf); - if (err != 0) - return (err); - } else { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - } - l = zc->zc_leaf; - - err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); - - if (err == ENOENT) { - uint64_t nocare = - (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; - zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; - zc->zc_cd = 0; - if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 || - zc->zc_hash == 0) { - zc->zc_hash = -1ULL; - } else { - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - goto again; - } - } - - if (err == 0) { - zc->zc_hash = zeh.zeh_hash; - zc->zc_cd = zeh.zeh_cd; - za->za_integer_length = zeh.zeh_integer_size; - za->za_num_integers = zeh.zeh_num_integers; - if (zeh.zeh_num_integers == 0) { - za->za_first_integer = 0; - } else { - err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); - ASSERT(err == 0 || err == EOVERFLOW); - } - err = zap_entry_read_name(zap, &zeh, - sizeof (za->za_name), za->za_name); - ASSERT(err == 0); - - za->za_normalization_conflict = - zap_entry_normalization_conflict(&zeh, - NULL, za->za_name, zap); - } - rw_exit(&zc->zc_leaf->l_rwlock); - return (err); -} - -static void -zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) -{ - uint64_t lastblk = 0; - - /* - * NB: if a leaf has more pointers than an entire ptrtbl block - * can hold, then it'll be accounted for more than once, since - * we won't have lastblk. 
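The lastblk check only suppresses consecutive duplicates: because a leaf's pointers occupy a run of adjacent pointer-table slots, remembering the previous value is enough to visit each leaf once per table block. A tiny sketch of the same dedup-by-adjacency idea, assuming block id 0 never appears (in a ZAP it cannot, since block 0 is the header):

/* Count distinct values in a table where equal values appear in runs. */
static int
count_runs(const uint64_t *tbl, int len)
{
	uint64_t last = 0;
	int runs = 0;

	for (int i = 0; i < len; i++) {
		if (tbl[i] == last)
			continue;
		last = tbl[i];
		runs++;
	}
	return (runs);
}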
- */ - for (int i = 0; i < len; i++) { - zap_leaf_t *l; - - if (tbl[i] == lastblk) - continue; - lastblk = tbl[i]; - - int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); - if (err == 0) { - zap_leaf_stats(zap, l, zs); - zap_put_leaf(l); - } - } -} - -int -fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) -{ - int err; - zap_leaf_t *l; - zap_entry_handle_t zeh; - - if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) - return (SET_ERROR(ENAMETOOLONG)); - - err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) - return (err); - - err = zap_leaf_lookup(l, zn, &zeh); - if (err != 0) - return (err); - - zc->zc_leaf = l; - zc->zc_hash = zeh.zeh_hash; - zc->zc_cd = zeh.zeh_cd; - - return (err); -} - -void -fzap_get_stats(zap_t *zap, zap_stats_t *zs) -{ - int bs = FZAP_BLOCK_SHIFT(zap); - zs->zs_blocksize = 1ULL << bs; - - /* - * Set zap_phys_t fields - */ - zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; - zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; - zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; - zs->zs_block_type = zap_f_phys(zap)->zap_block_type; - zs->zs_magic = zap_f_phys(zap)->zap_magic; - zs->zs_salt = zap_f_phys(zap)->zap_salt; - - /* - * Set zap_ptrtbl fields - */ - zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; - zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; - zs->zs_ptrtbl_blks_copied = - zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; - zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; - zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; - zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - - if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { - /* the ptrtbl is entirely in the header block. */ - zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); - } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, - zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, - ZIO_PRIORITY_SYNC_READ); - - for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; - b++) { - dmu_buf_t *db; - int err; - - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, - FTAG, &db, DMU_READ_NO_PREFETCH); - if (err == 0) { - zap_stats_ptrtbl(zap, db->db_data, - 1<<(bs-3), zs); - dmu_buf_rele(db, FTAG); - } - } - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c +++ /dev/null @@ -1,849 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - */ - -/* - * The 512-byte leaf is broken into 32 16-byte chunks. - * chunk number n means l_chunk[n], even though the header precedes it. - * the names are stored null-terminated. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); - -#define CHAIN_END 0xffff /* end of the chunk chain */ - -/* half the (current) minimum block size */ -#define MAX_ARRAY_BYTES (8<<10) - -#define LEAF_HASH(l, h) \ - ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ - ((h) >> \ - (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) - -#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) - -extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l); - -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - -static void -stv(int len, void *addr, uint64_t value) -{ - switch (len) { - case 1: - *(uint8_t *)addr = value; - return; - case 2: - *(uint16_t *)addr = value; - return; - case 4: - *(uint32_t *)addr = value; - return; - case 8: - *(uint64_t *)addr = value; - return; - } - ASSERT(!"bad int len"); -} - -static uint64_t -ldv(int len, const void *addr) -{ - switch (len) { - case 1: - return (*(uint8_t *)addr); - case 2: - return (*(uint16_t *)addr); - case 4: - return (*(uint32_t *)addr); - case 8: - return (*(uint64_t *)addr); - } - ASSERT(!"bad int len"); - return (0xFEEDFACEDEADBEEFULL); -} - -void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) -{ - zap_leaf_t l; - dmu_buf_t l_dbuf; - - l_dbuf.db_data = buf; - l.l_bs = highbit64(size) - 1; - l.l_dbuf = &l_dbuf; - - buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); - buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); - buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); - buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); - buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); - buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); - buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) - buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { - zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); - struct zap_leaf_entry *le; - - switch (lc->l_free.lf_type) { - case ZAP_CHUNK_ENTRY: - le = &lc->l_entry; - - le->le_type = BSWAP_8(le->le_type); - le->le_value_intlen = BSWAP_8(le->le_value_intlen); - le->le_next = BSWAP_16(le->le_next); - le->le_name_chunk = BSWAP_16(le->le_name_chunk); - le->le_name_numints = BSWAP_16(le->le_name_numints); - le->le_value_chunk = BSWAP_16(le->le_value_chunk); - le->le_value_numints = BSWAP_16(le->le_value_numints); - le->le_cd = BSWAP_32(le->le_cd); - le->le_hash = BSWAP_64(le->le_hash); - break; - case ZAP_CHUNK_FREE: - lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); - lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); - break; - case ZAP_CHUNK_ARRAY: - lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); - lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); - /* la_array doesn't need swapping */ - break; - default: - ASSERT(!"bad leaf type"); - } - } -} - -void -zap_leaf_init(zap_leaf_t *l, boolean_t sort) -{ - l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&zap_leaf_phys(l)->l_hdr, 0, - sizeof (struct zap_leaf_header)); - 
zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, - 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { - ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; - ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; - } - ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; - zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF; - zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC; - zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); - if (sort) - zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; -} - -/* - * Routines which manipulate leaf chunks (l_chunk[]). - */ - -static uint16_t -zap_leaf_chunk_alloc(zap_leaf_t *l) -{ - ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - - int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); - - zap_leaf_phys(l)->l_hdr.lh_freelist = - ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; - - zap_leaf_phys(l)->l_hdr.lh_nfree--; - - return (chunk); -} - -static void -zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) -{ - struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); - - zlf->lf_type = ZAP_CHUNK_FREE; - zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; - bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ - zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; - - zap_leaf_phys(l)->l_hdr.lh_nfree++; -} - -/* - * Routines which manipulate leaf arrays (zap_leaf_array type chunks). - */ - -static uint16_t -zap_leaf_array_create(zap_leaf_t *l, const char *buf, - int integer_size, int num_integers) -{ - uint16_t chunk_head; - uint16_t *chunkp = &chunk_head; - int byten = 0; - uint64_t value = 0; - int shift = (integer_size - 1) * 8; - int len = num_integers; - - ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); - - while (len > 0) { - uint16_t chunk = zap_leaf_chunk_alloc(l); - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - - la->la_type = ZAP_CHUNK_ARRAY; - for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); - la->la_array[i] = value >> shift; - value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; - if (--len == 0) - break; - } - } - - *chunkp = chunk; - chunkp = &la->la_next; - } - *chunkp = CHAIN_END; - - return (chunk_head); -} - -static void -zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) -{ - uint16_t chunk = *chunkp; - - *chunkp = CHAIN_END; - - while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; - ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, - ZAP_CHUNK_ARRAY); - zap_leaf_chunk_free(l, chunk); - chunk = nextchunk; - } -} - -/* array_len and buf_len are in integers, not bytes */ -static void -zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, - int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, - void *buf) -{ - int len = MIN(array_len, buf_len); - int byten = 0; - uint64_t value = 0; - char *p = buf; - - ASSERT3U(array_int_len, <=, buf_int_len); - - /* Fast path for one 8-byte integer */ - if (array_int_len == 8 && buf_int_len == 8 && len == 1) { - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - uint8_t *ip = la->la_array; - uint64_t *buf64 = buf; - - *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | - (uint64_t)ip[2] << 40 | 
(uint64_t)ip[3] << 32 | - (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | - (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; - return; - } - - /* Fast path for an array of 1-byte integers (eg. the entry name) */ - if (array_int_len == 1 && buf_int_len == 1 && - buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { - while (chunk != CHAIN_END) { - struct zap_leaf_array *la = - &ZAP_LEAF_CHUNK(l, chunk).l_array; - bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); - p += ZAP_LEAF_ARRAY_BYTES; - chunk = la->la_next; - } - return; - } - - while (len > 0) { - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { - value = (value << 8) | la->la_array[i]; - byten++; - if (byten == array_int_len) { - stv(buf_int_len, p, value); - byten = 0; - len--; - if (len == 0) - return; - p += buf_int_len; - } - } - chunk = la->la_next; - } -} - -static boolean_t -zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, - int chunk, int array_numints) -{ - int bseen = 0; - - if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { - uint64_t *thiskey = - kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); - ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); - - zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, - sizeof (*thiskey), array_numints, thiskey); - boolean_t match = bcmp(thiskey, zn->zn_key_orig, - array_numints * sizeof (*thiskey)) == 0; - kmem_free(thiskey, array_numints * sizeof (*thiskey)); - return (match); - } - - ASSERT(zn->zn_key_intlen == 1); - if (zn->zn_matchtype & MT_NORMALIZE) { - char *thisname = kmem_alloc(array_numints, KM_SLEEP); - - zap_leaf_array_read(l, chunk, sizeof (char), array_numints, - sizeof (char), array_numints, thisname); - boolean_t match = zap_match(zn, thisname); - kmem_free(thisname, array_numints); - return (match); - } - - /* - * Fast path for exact matching. - * First check that the lengths match, so that we don't read - * past the end of the zn_key_orig array. - */ - if (array_numints != zn->zn_key_orig_numints) - return (B_FALSE); - while (bseen < array_numints) { - struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) - break; - chunk = la->la_next; - bseen += toread; - } - return (bseen == array_numints); -} - -/* - * Routines which manipulate leaf entries. - */ - -int -zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) -{ - struct zap_leaf_entry *le; - - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); - *chunkp != CHAIN_END; chunkp = &le->le_next) { - uint16_t chunk = *chunkp; - le = ZAP_LEAF_ENTRY(l, chunk); - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (le->le_hash != zn->zn_hash) - continue; - - /* - * NB: the entry chain is always sorted by cd on - * normalized zap objects, so this will find the - * lowest-cd match for MT_NORMALIZE. 
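The collision differentiator (cd) exists because distinct names can share the same 64-bit hash; each such entry is assigned the lowest cd not already taken for that hash, and chains are kept cd-sorted so a normalized lookup returns the lowest-cd match first. A small sketch of choosing the next cd, using a hypothetical in-memory chain rather than the real leaf-chunk layout (the equivalent scan appears in zap_entry_create() below):

struct ent { uint64_t hash; uint32_t cd; struct ent *next; };

/*
 * Pick the lowest cd not yet used by entries with the same hash; assumes
 * the chain is sorted by cd, as fzap keeps it for normalized objects.
 */
static uint32_t
next_cd(const struct ent *chain, uint64_t hash)
{
	uint32_t cd = 0;

	for (const struct ent *e = chain; e != NULL; e = e->next) {
		if (e->cd > cd)
			break;		/* found an unused slot */
		if (e->hash == hash)
			cd++;
	}
	return (cd);
}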
- */ - ASSERT((zn->zn_matchtype == 0) || - (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); - if (zap_leaf_array_match(l, zn, le->le_name_chunk, - le->le_name_numints)) { - zeh->zeh_num_integers = le->le_value_numints; - zeh->zeh_integer_size = le->le_value_intlen; - zeh->zeh_cd = le->le_cd; - zeh->zeh_hash = le->le_hash; - zeh->zeh_chunkp = chunkp; - zeh->zeh_leaf = l; - return (0); - } - } - - return (SET_ERROR(ENOENT)); -} - -/* Return (h1,cd1 >= h2,cd2) */ -#define HCD_GTEQ(h1, cd1, h2, cd2) \ - ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE)) - -int -zap_leaf_lookup_closest(zap_leaf_t *l, - uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) -{ - uint64_t besth = -1ULL; - uint32_t bestcd = -1U; - uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; - struct zap_leaf_entry *le; - - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - - for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { - for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh]; - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(l, chunk); - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && - HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { - ASSERT3U(bestlh, >=, lh); - bestlh = lh; - besth = le->le_hash; - bestcd = le->le_cd; - - zeh->zeh_num_integers = le->le_value_numints; - zeh->zeh_integer_size = le->le_value_intlen; - zeh->zeh_cd = le->le_cd; - zeh->zeh_hash = le->le_hash; - zeh->zeh_fakechunk = chunk; - zeh->zeh_chunkp = &zeh->zeh_fakechunk; - zeh->zeh_leaf = l; - } - } - } - - return (bestcd == -1U ? ENOENT : 0); -} - -int -zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf) -{ - struct zap_leaf_entry *le = - ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (le->le_value_intlen > integer_size) - return (SET_ERROR(EINVAL)); - - zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, - le->le_value_intlen, le->le_value_numints, - integer_size, num_integers, buf); - - if (zeh->zeh_num_integers > num_integers) - return (SET_ERROR(EOVERFLOW)); - return (0); - -} - -int -zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, - char *buf) -{ - struct zap_leaf_entry *le = - ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { - zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, - le->le_name_numints, 8, buflen / 8, buf); - } else { - zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, - le->le_name_numints, 1, buflen, buf); - } - if (le->le_name_numints > buflen) - return (SET_ERROR(EOVERFLOW)); - return (0); -} - -int -zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf) -{ - zap_leaf_t *l = zeh->zeh_leaf; - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); - - int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); - - if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) - return (SET_ERROR(EAGAIN)); - - zap_leaf_array_free(l, &le->le_value_chunk); - le->le_value_chunk = - zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_numints = num_integers; - le->le_value_intlen = integer_size; - return (0); -} - -void -zap_entry_remove(zap_entry_handle_t *zeh) -{ - zap_leaf_t *l 
= zeh->zeh_leaf; - - ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); - - uint16_t entry_chunk = *zeh->zeh_chunkp; - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - zap_leaf_array_free(l, &le->le_name_chunk); - zap_leaf_array_free(l, &le->le_value_chunk); - - *zeh->zeh_chunkp = le->le_next; - zap_leaf_chunk_free(l, entry_chunk); - - zap_leaf_phys(l)->l_hdr.lh_nentries--; -} - -int -zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh) -{ - uint16_t chunk; - struct zap_leaf_entry *le; - uint64_t h = zn->zn_hash; - - uint64_t valuelen = integer_size * num_integers; - - int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * - zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); - if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) - return (E2BIG); - - if (cd == ZAP_NEED_CD) { - /* find the lowest unused cd */ - if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { - cd = 0; - - for (chunk = *LEAF_HASH_ENTPTR(l, h); - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(l, chunk); - if (le->le_cd > cd) - break; - if (le->le_hash == h) { - ASSERT3U(cd, ==, le->le_cd); - cd++; - } - } - } else { - /* old unsorted format; do it the O(n^2) way */ - for (cd = 0; ; cd++) { - for (chunk = *LEAF_HASH_ENTPTR(l, h); - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(l, chunk); - if (le->le_hash == h && - le->le_cd == cd) { - break; - } - } - /* If this cd is not in use, we are good. */ - if (chunk == CHAIN_END) - break; - } - } - /* - * We would run out of space in a block before we could - * store enough entries to run out of CD values. - */ - ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); - } - - if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks) - return (SET_ERROR(EAGAIN)); - - /* make the entry */ - chunk = zap_leaf_chunk_alloc(l); - le = ZAP_LEAF_ENTRY(l, chunk); - le->le_type = ZAP_CHUNK_ENTRY; - le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, - zn->zn_key_intlen, zn->zn_key_orig_numints); - le->le_name_numints = zn->zn_key_orig_numints; - le->le_value_chunk = - zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_numints = num_integers; - le->le_value_intlen = integer_size; - le->le_hash = h; - le->le_cd = cd; - - /* link it into the hash chain */ - /* XXX if we did the search above, we could just use that */ - uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); - - zap_leaf_phys(l)->l_hdr.lh_nentries++; - - zeh->zeh_leaf = l; - zeh->zeh_num_integers = num_integers; - zeh->zeh_integer_size = le->le_value_intlen; - zeh->zeh_cd = le->le_cd; - zeh->zeh_hash = le->le_hash; - zeh->zeh_chunkp = chunkp; - - return (0); -} - -/* - * Determine if there is another entry with the same normalized form. - * For performance purposes, either zn or name must be provided (the - * other can be NULL). Note, there usually won't be any hash - * conflicts, in which case we don't need the concatenated/normalized - * form of the name. But all callers have one of these on hand anyway, - * so might as well take advantage. A cleaner but slower interface - * would accept neither argument, and compute the normalized name as - * needed (using zap_name_alloc(zap_entry_read_name(zeh))). 
- */ -boolean_t -zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, - const char *name, zap_t *zap) -{ - struct zap_leaf_entry *le; - boolean_t allocdzn = B_FALSE; - - if (zap->zap_normflags == 0) - return (B_FALSE); - - for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); - chunk != CHAIN_END; chunk = le->le_next) { - le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); - if (le->le_hash != zeh->zeh_hash) - continue; - if (le->le_cd == zeh->zeh_cd) - continue; - - if (zn == NULL) { - zn = zap_name_alloc(zap, name, MT_NORMALIZE); - allocdzn = B_TRUE; - } - if (zap_leaf_array_match(zeh->zeh_leaf, zn, - le->le_name_chunk, le->le_name_numints)) { - if (allocdzn) - zap_name_free(zn); - return (B_TRUE); - } - } - if (allocdzn) - zap_name_free(zn); - return (B_FALSE); -} - -/* - * Routines for transferring entries between leafs. - */ - -static uint16_t * -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) -{ - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); - struct zap_leaf_entry *le2; - uint16_t *chunkp; - - /* - * keep the entry chain sorted by cd - * NB: this will not cause problems for unsorted leafs, though - * it is unnecessary there. - */ - for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); - *chunkp != CHAIN_END; chunkp = &le2->le_next) { - le2 = ZAP_LEAF_ENTRY(l, *chunkp); - if (le2->le_cd > le->le_cd) - break; - } - - le->le_next = *chunkp; - *chunkp = entry; - return (chunkp); -} - -static uint16_t -zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) -{ - uint16_t new_chunk; - uint16_t *nchunkp = &new_chunk; - - while (chunk != CHAIN_END) { - uint16_t nchunk = zap_leaf_chunk_alloc(nl); - struct zap_leaf_array *nla = - &ZAP_LEAF_CHUNK(nl, nchunk).l_array; - struct zap_leaf_array *la = - &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; - - ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); - - *nla = *la; /* structure assignment */ - - zap_leaf_chunk_free(l, chunk); - chunk = nextchunk; - *nchunkp = nchunk; - nchunkp = &nla->la_next; - } - *nchunkp = CHAIN_END; - return (new_chunk); -} - -static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) -{ - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); - ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - - uint16_t chunk = zap_leaf_chunk_alloc(nl); - struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); - *nle = *le; /* structure assignment */ - - (void) zap_leaf_rehash_entry(nl, chunk); - - nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); - nle->le_value_chunk = - zap_leaf_transfer_array(l, le->le_value_chunk, nl); - - zap_leaf_chunk_free(l, entry); - - zap_leaf_phys(l)->l_hdr.lh_nentries--; - zap_leaf_phys(nl)->l_hdr.lh_nentries++; -} - -/* - * Transfer the entries whose hash prefix ends in 1 to the new leaf. 
- */ -void -zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) -{ - int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; - - /* set new prefix and prefix_len */ - zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; - zap_leaf_phys(l)->l_hdr.lh_prefix_len++; - zap_leaf_phys(nl)->l_hdr.lh_prefix = - zap_leaf_phys(l)->l_hdr.lh_prefix | 1; - zap_leaf_phys(nl)->l_hdr.lh_prefix_len = - zap_leaf_phys(l)->l_hdr.lh_prefix_len; - - /* break existing hash chains */ - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, - 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - - if (sort) - zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; - - /* - * Transfer entries whose hash bit 'bit' is set to nl; rehash - * the remaining entries - * - * NB: We could find entries via the hashtable instead. That - * would be O(hashents+numents) rather than O(numblks+numents), - * but this accesses memory more sequentially, and when we're - * called, the block is usually pretty full. - */ - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); - if (le->le_type != ZAP_CHUNK_ENTRY) - continue; - - if (le->le_hash & (1ULL << bit)) - zap_leaf_transfer_entry(l, i, nl); - else - (void) zap_leaf_rehash_entry(l, i); - } -} - -void -zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) -{ - int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - - zap_leaf_phys(l)->l_hdr.lh_prefix_len; - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_leafs_with_2n_pointers[n]++; - - - n = zap_leaf_phys(l)->l_hdr.lh_nentries/5; - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_blocks_with_n5_entries[n]++; - - n = ((1<<FZAP_BLOCK_SHIFT(zap)) - - zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / - (1<<FZAP_BLOCK_SHIFT(zap)); - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_blocks_n_tenths_full[n]++; - - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = zap_leaf_phys(l)->l_hash[i]; - - while (chunk != CHAIN_END) { - struct zap_leaf_entry *le = - ZAP_LEAF_ENTRY(l, chunk); - - n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * - le->le_value_intlen); - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_entries_using_n_chunks[n]++; - - chunk = le->le_next; - nentries++; - } - - n = nentries; - n = MIN(n, ZAP_HISTOGRAM_SIZE-1); - zs->zs_buckets_with_n_entries[n]++; - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ /dev/null @@ -1,1609 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#endif - -extern inline mzap_phys_t *zap_m_phys(zap_t *zap); - -static int mzap_upgrade(zap_t **zapp, - void *tag, dmu_tx_t *tx, zap_flags_t flags); - -uint64_t -zap_getflags(zap_t *zap) -{ - if (zap->zap_ismicro) - return (0); - return (zap_f_phys(zap)->zap_flags); -} - -int -zap_hashbits(zap_t *zap) -{ - if (zap_getflags(zap) & ZAP_FLAG_HASH64) - return (48); - else - return (28); -} - -uint32_t -zap_maxcd(zap_t *zap) -{ - if (zap_getflags(zap) & ZAP_FLAG_HASH64) - return ((1<<16)-1); - else - return (-1U); -} - -static uint64_t -zap_hash(zap_name_t *zn) -{ - zap_t *zap = zn->zn_zap; - uint64_t h = 0; - - if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { - ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); - h = *(uint64_t *)zn->zn_key_orig; - } else { - h = zap->zap_salt; - ASSERT(h != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - - if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { - const uint64_t *wp = zn->zn_key_norm; - - ASSERT(zn->zn_key_intlen == 8); - for (int i = 0; i < zn->zn_key_norm_numints; - wp++, i++) { - uint64_t word = *wp; - - for (int j = 0; j < zn->zn_key_intlen; j++) { - h = (h >> 8) ^ - zfs_crc64_table[(h ^ word) & 0xFF]; - word >>= NBBY; - } - } - } else { - const uint8_t *cp = zn->zn_key_norm; - - /* - * We previously stored the terminating null on - * disk, but didn't hash it, so we need to - * continue to not hash it. (The - * zn_key_*_numints includes the terminating - * null for non-binary keys.) - */ - int len = zn->zn_key_norm_numints - 1; - - ASSERT(zn->zn_key_intlen == 1); - for (int i = 0; i < len; cp++, i++) { - h = (h >> 8) ^ - zfs_crc64_table[(h ^ *cp) & 0xFF]; - } - } - } - /* - * Don't use all 64 bits, since we need some in the cookie for - * the collision differentiator. We MUST use the high bits, - * since those are the ones that we first pay attention to when - * chosing the bucket. 
- */ - h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); - - return (h); -} - -static int -zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) -{ - ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); - - size_t inlen = strlen(name) + 1; - size_t outlen = ZAP_MAXNAMELEN; - - int err = 0; - (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, - normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, - U8_UNICODE_LATEST, &err); - - return (err); -} - -boolean_t -zap_match(zap_name_t *zn, const char *matchname) -{ - ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); - - if (zn->zn_matchtype & MT_NORMALIZE) { - char norm[ZAP_MAXNAMELEN]; - - if (zap_normalize(zn->zn_zap, matchname, norm, - zn->zn_normflags) != 0) - return (B_FALSE); - - return (strcmp(zn->zn_key_norm, norm) == 0); - } else { - return (strcmp(zn->zn_key_orig, matchname) == 0); - } -} - -void -zap_name_free(zap_name_t *zn) -{ - kmem_free(zn, sizeof (zap_name_t)); -} - -zap_name_t * -zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) -{ - zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); - - zn->zn_zap = zap; - zn->zn_key_intlen = sizeof (*key); - zn->zn_key_orig = key; - zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; - zn->zn_matchtype = mt; - zn->zn_normflags = zap->zap_normflags; - - /* - * If we're dealing with a case sensitive lookup on a mixed or - * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup - * will fold case to all caps overriding the lookup request. - */ - if (mt & MT_MATCH_CASE) - zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; - - if (zap->zap_normflags) { - /* - * We *must* use zap_normflags because this normalization is - * what the hash is computed from. - */ - if (zap_normalize(zap, key, zn->zn_normbuf, - zap->zap_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } - zn->zn_key_norm = zn->zn_normbuf; - zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; - } else { - if (mt != 0) { - zap_name_free(zn); - return (NULL); - } - zn->zn_key_norm = zn->zn_key_orig; - zn->zn_key_norm_numints = zn->zn_key_orig_numints; - } - - zn->zn_hash = zap_hash(zn); - - if (zap->zap_normflags != zn->zn_normflags) { - /* - * We *must* use zn_normflags because this normalization is - * what the matching is based on. (Not the hash!) 
- */ - if (zap_normalize(zap, key, zn->zn_normbuf, - zn->zn_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } - zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; - } - - return (zn); -} - -zap_name_t * -zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) -{ - zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); - - ASSERT(zap->zap_normflags == 0); - zn->zn_zap = zap; - zn->zn_key_intlen = sizeof (*key); - zn->zn_key_orig = zn->zn_key_norm = key; - zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; - zn->zn_matchtype = 0; - - zn->zn_hash = zap_hash(zn); - return (zn); -} - -static void -mzap_byteswap(mzap_phys_t *buf, size_t size) -{ - buf->mz_block_type = BSWAP_64(buf->mz_block_type); - buf->mz_salt = BSWAP_64(buf->mz_salt); - buf->mz_normflags = BSWAP_64(buf->mz_normflags); - int max = (size / MZAP_ENT_LEN) - 1; - for (int i = 0; i < max; i++) { - buf->mz_chunk[i].mze_value = - BSWAP_64(buf->mz_chunk[i].mze_value); - buf->mz_chunk[i].mze_cd = - BSWAP_32(buf->mz_chunk[i].mze_cd); - } -} - -void -zap_byteswap(void *buf, size_t size) -{ - uint64_t block_type = *(uint64_t *)buf; - - if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { - /* ASSERT(magic == ZAP_LEAF_MAGIC); */ - mzap_byteswap(buf, size); - } else { - fzap_byteswap(buf, size); - } -} - -static int -mze_compare(const void *arg1, const void *arg2) -{ - const mzap_ent_t *mze1 = arg1; - const mzap_ent_t *mze2 = arg2; - - int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); - if (likely(cmp)) - return (cmp); - - return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); -} - -static int -mze_insert(zap_t *zap, int chunkid, uint64_t hash) -{ - avl_index_t idx; - - ASSERT(zap->zap_ismicro); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); - mze->mze_chunkid = chunkid; - mze->mze_hash = hash; - mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; - ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); - if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) { - kmem_free(mze, sizeof (mzap_ent_t)); - return (EEXIST); - } - avl_insert(&zap->zap_m.zap_avl, mze, idx); - return (0); -} - -static mzap_ent_t * -mze_find(zap_name_t *zn) -{ - mzap_ent_t mze_tofind; - mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; - - ASSERT(zn->zn_zap->zap_ismicro); - ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - - mze_tofind.mze_hash = zn->zn_hash; - mze_tofind.mze_cd = 0; - - mze = avl_find(avl, &mze_tofind, &idx); - if (mze == NULL) - mze = avl_nearest(avl, idx, AVL_AFTER); - for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { - ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); - if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) - return (mze); - } - - return (NULL); -} - -static uint32_t -mze_find_unused_cd(zap_t *zap, uint64_t hash) -{ - mzap_ent_t mze_tofind; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; - - ASSERT(zap->zap_ismicro); - ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - - mze_tofind.mze_hash = hash; - mze_tofind.mze_cd = 0; - - uint32_t cd = 0; - for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (mze->mze_cd != cd) - break; - cd++; - } - - return (cd); -} - -static void -mze_remove(zap_t *zap, mzap_ent_t *mze) -{ - ASSERT(zap->zap_ismicro); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - avl_remove(&zap->zap_m.zap_avl, mze); - kmem_free(mze, sizeof (mzap_ent_t)); -} - -static void -mze_destroy(zap_t 
*zap) -{ - mzap_ent_t *mze; - void *avlcookie = NULL; - - while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) - kmem_free(mze, sizeof (mzap_ent_t)); - avl_destroy(&zap->zap_m.zap_avl); -} - -static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) -{ - zap_t *winner; - uint64_t *zap_hdr = (uint64_t *)db->db_data; - uint64_t zap_block_type = zap_hdr[0]; - uint64_t zap_magic = zap_hdr[1]; - - ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); - - zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); - rw_init(&zap->zap_rwlock, 0, 0, 0); - rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; - zap->zap_dbuf = db; - - if (zap_block_type != ZBT_MICRO) { - mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); - zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; - if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { - winner = NULL; /* No actual winner here... */ - goto handle_winner; - } - } else { - zap->zap_ismicro = TRUE; - } - - /* - * Make sure that zap_ismicro is set before we let others see - * it, because zap_lockdir() checks zap_ismicro without the lock - * held. - */ - dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); - winner = dmu_buf_set_user(db, &zap->zap_dbu); - - if (winner != NULL) - goto handle_winner; - - if (zap->zap_ismicro) { - zap->zap_salt = zap_m_phys(zap)->mz_salt; - zap->zap_normflags = zap_m_phys(zap)->mz_normflags; - zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; - avl_create(&zap->zap_m.zap_avl, mze_compare, - sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); - - for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = - &zap_m_phys(zap)->mz_chunk[i]; - if (mze->mze_name[0]) { - zap_name_t *zn; - - zn = zap_name_alloc(zap, mze->mze_name, 0); - if (mze_insert(zap, i, zn->zn_hash) == 0) - zap->zap_m.zap_num_entries++; - else { - printf("ZFS WARNING: Duplicated ZAP " - "entry detected (%s).\n", - mze->mze_name); - } - zap_name_free(zn); - } - } - } else { - zap->zap_salt = zap_f_phys(zap)->zap_salt; - zap->zap_normflags = zap_f_phys(zap)->zap_normflags; - - ASSERT3U(sizeof (struct zap_leaf_header), ==, - 2*ZAP_LEAF_CHUNKSIZE); - - /* - * The embedded pointer table should not overlap the - * other members. - */ - ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, - &zap_f_phys(zap)->zap_salt); - - /* - * The embedded pointer table should end at the end of - * the block - */ - ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, - 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - - (uintptr_t)zap_f_phys(zap), ==, zap->zap_dbuf->db_size); - } - rw_exit(&zap->zap_rwlock); - return (zap); - -handle_winner: - rw_exit(&zap->zap_rwlock); - rw_destroy(&zap->zap_rwlock); - if (!zap->zap_ismicro) - mutex_destroy(&zap->zap_f.zap_num_entries_mtx); - kmem_free(zap, sizeof (zap_t)); - return (winner); -} - -/* - * This routine "consumes" the caller's hold on the dbuf, which must - * have the specified tag. - */ -static int -zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) -{ - ASSERT0(db->db_offset); - objset_t *os = dmu_buf_get_objset(db); - uint64_t obj = db->db_object; - - *zapp = NULL; - - zap_t *zap = dmu_buf_get_user(db); - if (zap == NULL) { - zap = mzap_open(os, obj, db); - if (zap == NULL) { - /* - * mzap_open() didn't like what it saw on-disk. - * Check for corruption! - */ - return (SET_ERROR(EIO)); - } - } - - /* - * We're checking zap_ismicro without the lock held, in order to - * tell what type of lock we want.
Once we have some sort of - * lock, see if it really is the right type. In practice this - * can only be different if it was upgraded from micro to fat, - * and micro wanted WRITER but fat only needs READER. - */ - krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; - rw_enter(&zap->zap_rwlock, lt); - if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { - /* it was upgraded, now we only need reader */ - ASSERT(lt == RW_WRITER); - ASSERT(RW_READER == - (!zap->zap_ismicro && fatreader) ? RW_READER : lti); - rw_downgrade(&zap->zap_rwlock); - lt = RW_READER; - } - - zap->zap_objset = os; - - if (lt == RW_WRITER) - dmu_buf_will_dirty(db, tx); - - ASSERT3P(zap->zap_dbuf, ==, db); - - ASSERT(!zap->zap_ismicro || - zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); - if (zap->zap_ismicro && tx && adding && - zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { - uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > MZAP_MAX_BLKSZ) { - dprintf("upgrading obj %llu: num_entries=%u\n", - obj, zap->zap_m.zap_num_entries); - *zapp = zap; - int err = mzap_upgrade(zapp, tag, tx, 0); - if (err != 0) - rw_exit(&zap->zap_rwlock); - return (err); - } - VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); - zap->zap_m.zap_num_chunks = - db->db_size / MZAP_ENT_LEN - 1; - } - - *zapp = zap; - return (0); -} - -static int -zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) -{ - dmu_buf_t *db; - - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { - return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { - dmu_buf_rele(db, tag); - } - return (err); -} - -int -zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) -{ - dmu_buf_t *db; - - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) - return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) - dmu_buf_rele(db, tag); - return (err); -} - -void -zap_unlockdir(zap_t *zap, void *tag) -{ - rw_exit(&zap->zap_rwlock); - dmu_buf_rele(zap->zap_dbuf, tag); -} - -static int -mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) -{ - int err = 0; - zap_t *zap = *zapp; - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - int sz = zap->zap_dbuf->db_size; - mzap_phys_t *mzp = zio_buf_alloc(sz); - bcopy(zap->zap_dbuf->db_data, mzp, sz); - int nchunks = zap->zap_m.zap_num_chunks; - - if (!flags) { - err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, - 1ULL << fzap_default_block_shift, 0, tx); - if (err != 0) { - zio_buf_free(mzp, sz); - return (err); - } - } - - dprintf("upgrading obj=%llu with %u chunks\n", - zap->zap_object, nchunks); - /* XXX destroy the avl later, so we can use the stored hash value */ - mze_destroy(zap); - - fzap_upgrade(zap, tx, flags); - - for (int i = 0; i < nchunks; i++) { - mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; - if (mze->mze_name[0] == 0) - continue; - dprintf("adding %s=%llu\n", - mze->mze_name, mze->mze_value); - zap_name_t *zn = zap_name_alloc(zap, 
mze->mze_name, 0); - err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, - tag, tx); - zap = zn->zn_zap; /* fzap_add_cd() may change zap */ - zap_name_free(zn); - if (err != 0) - break; - } - zio_buf_free(mzp, sz); - *zapp = zap; - return (err); -} - -/* - * The "normflags" determine the behavior of the matchtype_t which is - * passed to zap_lookup_norm(). Names which have the same normalized - * version will be stored with the same hash value, and therefore we can - * perform normalization-insensitive lookups. We can be Unicode form- - * insensitive and/or case-insensitive. The following flags are valid for - * "normflags": - * - * U8_TEXTPREP_NFC - * U8_TEXTPREP_NFD - * U8_TEXTPREP_NFKC - * U8_TEXTPREP_NFKD - * U8_TEXTPREP_TOUPPER - * - * The *_NF* (Normalization Form) flags are mutually exclusive; at most one - * of them may be supplied. - */ -void -mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, - dmu_tx_t *tx) -{ - dmu_buf_t *db; - - VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); - - dmu_buf_will_dirty(db, tx); - mzap_phys_t *zp = db->db_data; - zp->mz_block_type = ZBT_MICRO; - zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; - zp->mz_normflags = normflags; - - if (flags != 0) { - zap_t *zap; - /* Only fat zap supports flags; upgrade immediately. */ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, - B_FALSE, B_FALSE, &zap)); - VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); - zap_unlockdir(zap, FTAG); - } else { - dmu_buf_rele(db, FTAG); - } -} - -int -zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, - 0, tx)); -} - -int -zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_claim_norm_dnsize(os, obj, - 0, ot, bonustype, bonuslen, dnodesize, tx)); -} - -int -zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, - dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, - bonuslen, 0, tx)); -} - -int -zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, - dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dmu_tx_t *tx) -{ - int err; - - err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, - dnodesize, tx); - if (err != 0) - return (err); - mzap_create_impl(os, obj, normflags, 0, tx); - return (0); -} - -uint64_t -zap_create(objset_t *os, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); -} - -uint64_t -zap_create_dnsize(objset_t *os, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, - dnodesize, tx)); -} - -uint64_t -zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, - 0, tx)); -} - -uint64_t -zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - uint64_t obj = 
dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, - dnodesize, tx); - - mzap_create_impl(os, obj, normflags, 0, tx); - return (obj); -} - -uint64_t -zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) -{ - ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); - return (zap_create_flags_dnsize(os, normflags, flags, ot, - leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); -} - -uint64_t -zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, - dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, - dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) -{ - uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, - dnodesize, tx); - - ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && - leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && - indirect_blockshift >= SPA_MINBLOCKSHIFT && - indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT); - - VERIFY(dmu_object_set_blocksize(os, obj, - 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); - - mzap_create_impl(os, obj, normflags, flags, tx); - return (obj); -} - -int -zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) -{ - /* - * dmu_object_free will free the object number and free the - * data. Freeing the data will cause our pageout function to be - * called, which will destroy our data (zap_leaf_t's and zap_t). - */ - - return (dmu_object_free(os, zapobj, tx)); -} - -void -zap_evict_sync(void *dbu) -{ - zap_t *zap = dbu; - - rw_destroy(&zap->zap_rwlock); - - if (zap->zap_ismicro) - mze_destroy(zap); - else - mutex_destroy(&zap->zap_f.zap_num_entries_mtx); - - kmem_free(zap, sizeof (zap_t)); -} - -int -zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - if (!zap->zap_ismicro) { - err = fzap_count(zap, count); - } else { - *count = zap->zap_m.zap_num_entries; - } - zap_unlockdir(zap, FTAG); - return (err); -} - -/* - * zn may be NULL; if not specified, it will be computed if needed. - * See also the comment above zap_entry_normalization_conflict(). - */ -static boolean_t -mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) -{ - int direction = AVL_BEFORE; - boolean_t allocdzn = B_FALSE; - - if (zap->zap_normflags == 0) - return (B_FALSE); - -again: - for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); - other && other->mze_hash == mze->mze_hash; - other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { - - if (zn == NULL) { - zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, - MT_NORMALIZE); - allocdzn = B_TRUE; - } - if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { - if (allocdzn) - zap_name_free(zn); - return (B_TRUE); - } - } - - if (direction == AVL_BEFORE) { - direction = AVL_AFTER; - goto again; - } - - if (allocdzn) - zap_name_free(zn); - return (B_FALSE); -} - -/* - * Routines for manipulating attributes. 
- */ - -int -zap_lookup(objset_t *os, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - return (zap_lookup_norm(os, zapobj, name, integer_size, - num_integers, buf, 0, NULL, 0, NULL)); -} - -static int -zap_lookup_impl(zap_t *zap, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - int err = 0; - - zap_name_t *zn = zap_name_alloc(zap, name, mt); - if (zn == NULL) - return (SET_ERROR(ENOTSUP)); - - if (!zap->zap_ismicro) { - err = fzap_lookup(zn, integer_size, num_integers, buf, - realname, rn_len, ncp); - } else { - mzap_ent_t *mze = mze_find(zn); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - if (num_integers < 1) { - err = SET_ERROR(EOVERFLOW); - } else if (integer_size != 8) { - err = SET_ERROR(EINVAL); - } else { - *(uint64_t *)buf = - MZE_PHYS(zap, mze)->mze_value; - (void) strlcpy(realname, - MZE_PHYS(zap, mze)->mze_name, rn_len); - if (ncp) { - *ncp = mzap_normalization_conflict(zap, - zn, mze); - } - } - } - } - zap_name_free(zn); - return (err); -} - -int -zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_impl(zap, name, integer_size, - num_integers, buf, mt, realname, rn_len, ncp); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_lookup_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf) -{ - return (zap_lookup_norm_by_dnode(dn, name, integer_size, - num_integers, buf, 0, NULL, 0, NULL)); -} - -int -zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf, - matchtype_t mt, char *realname, int rn_len, - boolean_t *ncp) -{ - zap_t *zap; - - int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); - if (err != 0) - return (err); - err = zap_lookup_impl(zap, name, integer_size, - num_integers, buf, mt, realname, rn_len, ncp); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - fzap_prefetch(zn); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - err = fzap_lookup(zn, integer_size, num_integers, buf, - NULL, 0, NULL); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_contains(objset_t *os, uint64_t zapobj, const char *name) -{ - int err = zap_lookup_norm(os, zapobj, name, 0, - 0, NULL, 0, NULL, 0, NULL); - if (err == EOVERFLOW || err == EINVAL) - err = 0; /* found, 
but skipped reading the value */ - return (err); -} - -int -zap_length(objset_t *os, uint64_t zapobj, const char *name, - uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_length(zn, integer_size, num_integers); - } else { - mzap_ent_t *mze = mze_find(zn); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - if (integer_size) - *integer_size = 8; - if (num_integers) - *num_integers = 1; - } - } - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, uint64_t *integer_size, uint64_t *num_integers) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_length(zn, integer_size, num_integers); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -static void -mzap_addent(zap_name_t *zn, uint64_t value) -{ - zap_t *zap = zn->zn_zap; - int start = zap->zap_m.zap_alloc_next; - - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - -#ifdef ZFS_DEBUG - for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; - ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); - } -#endif - - uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); - /* given the limited size of the microzap, this can't happen */ - ASSERT(cd < zap_maxcd(zap)); - -again: - for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; - if (mze->mze_name[0] == 0) { - mze->mze_value = value; - mze->mze_cd = cd; - (void) strcpy(mze->mze_name, zn->zn_key_orig); - zap->zap_m.zap_num_entries++; - zap->zap_m.zap_alloc_next = i+1; - if (zap->zap_m.zap_alloc_next == - zap->zap_m.zap_num_chunks) - zap->zap_m.zap_alloc_next = 0; - VERIFY(0 == mze_insert(zap, i, zn->zn_hash)); - return; - } - } - if (start != 0) { - start = 0; - goto again; - } - ASSERT(!"out of entries!"); -} - -static int -zap_add_impl(zap_t *zap, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx, void *tag) -{ - const uint64_t *intval = val; - int err = 0; - - zap_name_t *zn = zap_name_alloc(zap, key, 0); - if (zn == NULL) { - zap_unlockdir(zap, tag); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_add(zn, integer_size, num_integers, val, tag, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - } else if (integer_size != 8 || num_integers != 1 || - strlen(key) >= MZAP_NAME_LEN) { - err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); - if (err == 0) { - err = fzap_add(zn, integer_size, num_integers, val, - tag, tx); - } - zap = zn->zn_zap; /* fzap_add() may change zap */ - } else { - if (mze_find(zn) != NULL) { - err = SET_ERROR(EEXIST); - } else { - mzap_addent(zn, *intval); - } - } - ASSERT(zap == zn->zn_zap); - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, tag); - return (err); -} - -int -zap_add(objset_t *os, uint64_t zapobj, const char *key, - int integer_size, uint64_t num_integers, - const void 
*val, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); - /* zap_add_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_add_by_dnode(dnode_t *dn, const char *key, - int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); - /* zap_add_impl() calls zap_unlockdir() */ - return (err); -} - -int -zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_update(objset_t *os, uint64_t zapobj, const char *name, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - uint64_t oldval; - const uint64_t *intval = val; - -#ifdef ZFS_DEBUG - /* - * If there is an old value, it shouldn't change across the - * lockdir (eg, due to bprewrite's xlation). - */ - if (integer_size == 8 && num_integers == 1) - (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); -#endif - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - if (!zap->zap_ismicro) { - err = fzap_update(zn, integer_size, num_integers, val, - FTAG, tx); - zap = zn->zn_zap; /* fzap_update() may change zap */ - } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); - if (err == 0) { - err = fzap_update(zn, integer_size, num_integers, - val, FTAG, tx); - } - zap = zn->zn_zap; /* fzap_update() may change zap */ - } else { - mzap_ent_t *mze = mze_find(zn); - if (mze != NULL) { - ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); - MZE_PHYS(zap, mze)->mze_value = *intval; - } else { - mzap_addent(zn, *intval); - } - } - ASSERT(zap == zn->zn_zap); - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_update() 
may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) -{ - return (zap_remove_norm(os, zapobj, name, 0, tx)); -} - -static int -zap_remove_impl(zap_t *zap, const char *name, - matchtype_t mt, dmu_tx_t *tx) -{ - int err = 0; - - zap_name_t *zn = zap_name_alloc(zap, name, mt); - if (zn == NULL) - return (SET_ERROR(ENOTSUP)); - if (!zap->zap_ismicro) { - err = fzap_remove(zn, tx); - } else { - mzap_ent_t *mze = mze_find(zn); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - } else { - zap->zap_m.zap_num_entries--; - bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], - sizeof (mzap_ent_phys_t)); - mze_remove(zap, mze); - } - } - zap_name_free(zn); - return (err); -} - -int -zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, - matchtype_t mt, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - err = zap_remove_impl(zap, name, mt, tx); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) -{ - zap_t *zap; - int err; - - err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) - return (err); - err = zap_remove_impl(zap, name, 0, tx); - zap_unlockdir(zap, FTAG); - return (err); -} - -int -zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, dmu_tx_t *tx) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); - return (err); -} - -/* - * Routines for iterating over the attributes. - */ - -static void -zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized, boolean_t prefetch) -{ - zc->zc_objset = os; - zc->zc_zap = NULL; - zc->zc_leaf = NULL; - zc->zc_zapobj = zapobj; - zc->zc_serialized = serialized; - zc->zc_hash = 0; - zc->zc_cd = 0; - zc->zc_prefetch = prefetch; -} -void -zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized) -{ - zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); -} - -/* - * Initialize a cursor at the beginning of the ZAP object. The entire - * ZAP object will be prefetched. - */ -void -zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) -{ - zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); -} - -/* - * Initialize a cursor at the beginning, but request that we not prefetch - * the entire ZAP object. 
- */ -void -zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) -{ - zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); -} - -void -zap_cursor_fini(zap_cursor_t *zc) -{ - if (zc->zc_zap) { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - zap_unlockdir(zc->zc_zap, NULL); - zc->zc_zap = NULL; - } - if (zc->zc_leaf) { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; - } - zc->zc_objset = NULL; -} - -uint64_t -zap_cursor_serialize(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return (-1ULL); - if (zc->zc_zap == NULL) - return (zc->zc_serialized); - ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); - ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); - - /* - * We want to keep the high 32 bits of the cursor zero if we can, so - * that 32-bit programs can access this. So usually use a small - * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits - * of the cursor. - * - * [ collision differentiator | zap_hashbits()-bit hash value ] - */ - return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | - ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); -} - -int -zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) -{ - int err; - - if (zc->zc_hash == -1ULL) - return (SET_ERROR(ENOENT)); - - if (zc->zc_zap == NULL) { - int hb; - err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); - if (err != 0) - return (err); - - /* - * To support zap_cursor_init_serialized, advance, retrieve, - * we must add to the existing zc_cd, which may already - * be 1 due to the zap_cursor_advance. - */ - ASSERT(zc->zc_hash == 0); - hb = zap_hashbits(zc->zc_zap); - zc->zc_hash = zc->zc_serialized << (64 - hb); - zc->zc_cd += zc->zc_serialized >> hb; - if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ - zc->zc_cd = 0; - } else { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - } - if (!zc->zc_zap->zap_ismicro) { - err = fzap_cursor_retrieve(zc->zc_zap, zc, za); - } else { - avl_index_t idx; - mzap_ent_t mze_tofind; - - mze_tofind.mze_hash = zc->zc_hash; - mze_tofind.mze_cd = zc->zc_cd; - - mzap_ent_t *mze = - avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); - if (mze == NULL) { - mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, - idx, AVL_AFTER); - } - if (mze) { - mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); - ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); - za->za_normalization_conflict = - mzap_normalization_conflict(zc->zc_zap, NULL, mze); - za->za_integer_length = 8; - za->za_num_integers = 1; - za->za_first_integer = mzep->mze_value; - (void) strcpy(za->za_name, mzep->mze_name); - zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_cd; - err = 0; - } else { - zc->zc_hash = -1ULL; - err = SET_ERROR(ENOENT); - } - } - rw_exit(&zc->zc_zap->zap_rwlock); - return (err); -} - -void -zap_cursor_advance(zap_cursor_t *zc) -{ - if (zc->zc_hash == -1ULL) - return; - zc->zc_cd++; -} - -int -zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) -{ - int err = 0; - mzap_ent_t *mze; - zap_name_t *zn; - - if (zc->zc_zap == NULL) { - err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap); - if (err) - return (err); - } else { - rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - } - - zn = zap_name_alloc(zc->zc_zap, name, mt); - if (zn == NULL) { - rw_exit(&zc->zc_zap->zap_rwlock); - return (SET_ERROR(ENOTSUP)); - } - - if (!zc->zc_zap->zap_ismicro) { - err = fzap_cursor_move_to_key(zc, zn); - } else { - mze = 
mze_find(zn); - if (mze == NULL) { - err = SET_ERROR(ENOENT); - goto out; - } - zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_cd; - } - -out: - zap_name_free(zn); - rw_exit(&zc->zc_zap->zap_rwlock); - return (err); -} - -int -zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) -{ - zap_t *zap; - - int err = - zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err != 0) - return (err); - - bzero(zs, sizeof (zap_stats_t)); - - if (zap->zap_ismicro) { - zs->zs_blocksize = zap->zap_dbuf->db_size; - zs->zs_num_entries = zap->zap_m.zap_num_entries; - zs->zs_num_blocks = 1; - } else { - fzap_get_stats(zap, zs); - } - zap_unlockdir(zap, FTAG); - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c +++ /dev/null @@ -1,1432 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. - */ - -/* - * ZFS Channel Programs (ZCP) - * - * The ZCP interface allows various ZFS commands and operations ZFS - * administrative operations (e.g. creating and destroying snapshots, typically - * performed via an ioctl to /dev/zfs by the zfs(1M) command and - * libzfs/libzfs_core) to be run * programmatically as a Lua script. A ZCP - * script is run as a dsl_sync_task and fully executed during one transaction - * group sync. This ensures that no other changes can be written concurrently - * with a running Lua script. Combining multiple calls to the exposed ZFS - * functions into one script gives a number of benefits: - * - * 1. Atomicity. For some compound or iterative operations, it's useful to be - * able to guarantee that the state of a pool has not changed between calls to - * ZFS. - * - * 2. Performance. If a large number of changes need to be made (e.g. deleting - * many filesystems), there can be a significant performance penalty as a - * result of the need to wait for a transaction group sync to pass for every - * single operation. When expressed as a single ZCP script, all these changes - * can be performed at once in one txg sync. - * - * A modified version of the Lua 5.2 interpreter is used to run channel program - * scripts. The Lua 5.2 manual can be found at: - * - * http://www.lua.org/manual/5.2/ - * - * If being run by a user (via an ioctl syscall), executing a ZCP script - * requires root privileges in the global zone. - * - * Scripts are passed to zcp_eval() as a string, then run in a synctask by - * zcp_eval_sync(). Arguments can be passed into the Lua script as an nvlist, - * which will be converted to a Lua table. Similarly, values returned from - * a ZCP script will be converted to an nvlist. See zcp_lua_to_nvlist_impl() - * for details on exact allowed types and conversion. - * - * ZFS functionality is exposed to a ZCP script as a library of function calls. - * These calls are sorted into submodules, such as zfs.list and zfs.sync, for - * iterators and synctasks, respectively. 
Each of these submodules resides in - * its own source file, with a zcp_*_info structure describing each library - * call in the submodule. - * - * Error handling in ZCP scripts is handled by a number of different methods - * based on severity: - * - * 1. Memory and time limits are in place to prevent a channel program from - * consuming excessive system or running forever. If one of these limits is - * hit, the channel program will be stopped immediately and return from - * zcp_eval() with an error code. No attempt will be made to roll back or undo - * any changes made by the channel program before the error occured. - * Consumers invoking zcp_eval() from elsewhere in the kernel may pass a time - * limit of 0, disabling the time limit. - * - * 2. Internal Lua errors can occur as a result of a syntax error, calling a - * library function with incorrect arguments, invoking the error() function, - * failing an assert(), or other runtime errors. In these cases the channel - * program will stop executing and return from zcp_eval() with an error code. - * In place of a return value, an error message will also be returned in the - * 'result' nvlist containing information about the error. No attempt will be - * made to roll back or undo any changes made by the channel program before the - * error occured. - * - * 3. If an error occurs inside a ZFS library call which returns an error code, - * the error is returned to the Lua script to be handled as desired. - * - * In the first two cases, Lua's error-throwing mechanism is used, which - * longjumps out of the script execution with luaL_error() and returns with the - * error. - * - * See zfs-program(1M) for more information on high level usage. - */ - -#include "lua.h" -#include "lualib.h" -#include "lauxlib.h" - -#include -#include -#include -#include -#include -#include -#include -#ifdef illumos -#include -#endif - -#ifdef __FreeBSD__ -#define ECHRNG EDOM -#define ETIME ETIMEDOUT -#endif - -#define ZCP_NVLIST_MAX_DEPTH 20 - -uint64_t zfs_lua_check_instrlimit_interval = 100; -uint64_t zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT; -uint64_t zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT; - -/* - * Forward declarations for mutually recursive functions - */ -static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int); -static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *, - int); - -/* - * The outer-most error callback handler for use with lua_pcall(). On - * error Lua will call this callback with a single argument that - * represents the error value. In most cases this will be a string - * containing an error message, but channel programs can use Lua's - * error() function to return arbitrary objects as errors. This callback - * returns (on the Lua stack) the original error object along with a traceback. - * - * Fatal Lua errors can occur while resources are held, so we also call any - * registered cleanup function here. - */ -static int -zcp_error_handler(lua_State *state) -{ - const char *msg; - - zcp_cleanup(state); - - VERIFY3U(1, ==, lua_gettop(state)); - msg = lua_tostring(state, 1); - luaL_traceback(state, state, msg, 1); - return (1); -} - -int -zcp_argerror(lua_State *state, int narg, const char *msg, ...) 
-{ - va_list alist; - - va_start(alist, msg); - const char *buf = lua_pushvfstring(state, msg, alist); - va_end(alist); - - return (luaL_argerror(state, narg, buf)); -} - -/* - * Install a new cleanup function, which will be invoked with the given - * opaque argument if a fatal error causes the Lua interpreter to longjump out - * of a function call. - * - * If an error occurs, the cleanup function will be invoked exactly once and - * then unreigstered. - * - * Returns the registered cleanup handler so the caller can deregister it - * if no error occurs. - */ -zcp_cleanup_handler_t * -zcp_register_cleanup(lua_State *state, zcp_cleanup_t cleanfunc, void *cleanarg) -{ - zcp_run_info_t *ri = zcp_run_info(state); - - zcp_cleanup_handler_t *zch = kmem_alloc(sizeof (*zch), KM_SLEEP); - zch->zch_cleanup_func = cleanfunc; - zch->zch_cleanup_arg = cleanarg; - list_insert_head(&ri->zri_cleanup_handlers, zch); - - return (zch); -} - -void -zcp_deregister_cleanup(lua_State *state, zcp_cleanup_handler_t *zch) -{ - zcp_run_info_t *ri = zcp_run_info(state); - list_remove(&ri->zri_cleanup_handlers, zch); - kmem_free(zch, sizeof (*zch)); -} - -/* - * Execute the currently registered cleanup handlers then free them and - * destroy the handler list. - */ -void -zcp_cleanup(lua_State *state) -{ - zcp_run_info_t *ri = zcp_run_info(state); - - for (zcp_cleanup_handler_t *zch = - list_remove_head(&ri->zri_cleanup_handlers); zch != NULL; - zch = list_remove_head(&ri->zri_cleanup_handlers)) { - zch->zch_cleanup_func(zch->zch_cleanup_arg); - kmem_free(zch, sizeof (*zch)); - } -} - -/* - * Convert the lua table at the given index on the Lua stack to an nvlist - * and return it. - * - * If the table can not be converted for any reason, NULL is returned and - * an error message is pushed onto the Lua stack. - */ -static nvlist_t * -zcp_table_to_nvlist(lua_State *state, int index, int depth) -{ - nvlist_t *nvl; - /* - * Converting a Lua table to an nvlist with key uniqueness checking is - * O(n^2) in the number of keys in the nvlist, which can take a long - * time when we return a large table from a channel program. - * Furthermore, Lua's table interface *almost* guarantees unique keys - * on its own (details below). Therefore, we don't use fnvlist_alloc() - * here to avoid the built-in uniqueness checking. - * - * The *almost* is because it's possible to have key collisions between - * e.g. the string "1" and the number 1, or the string "true" and the - * boolean true, so we explicitly check that when we're looking at a - * key which is an integer / boolean or a string that can be parsed as - * one of those types. In the worst case this could still devolve into - * O(n^2), so we only start doing these checks on boolean/integer keys - * once we've seen a string key which fits this weird usage pattern. - * - * Ultimately, we still want callers to know that the keys in this - * nvlist are unique, so before we return this we set the nvlist's - * flags to reflect that. - */ - VERIFY0(nvlist_alloc(&nvl, 0, KM_SLEEP)); - - /* - * Push an empty stack slot where lua_next() will store each - * table key. - */ - lua_pushnil(state); - boolean_t saw_str_could_collide = B_FALSE; - while (lua_next(state, index) != 0) { - /* - * The next key-value pair from the table at index is - * now on the stack, with the key at stack slot -2 and - * the value at slot -1. 
- */ - int err = 0; - char buf[32]; - const char *key = NULL; - boolean_t key_could_collide = B_FALSE; - - switch (lua_type(state, -2)) { - case LUA_TSTRING: - key = lua_tostring(state, -2); - - /* check if this could collide with a number or bool */ - long long tmp; - int parselen; - if ((sscanf(key, "%lld%n", &tmp, &parselen) > 0 && - parselen == strlen(key)) || - strcmp(key, "true") == 0 || - strcmp(key, "false") == 0) { - key_could_collide = B_TRUE; - saw_str_could_collide = B_TRUE; - } - break; - case LUA_TBOOLEAN: - key = (lua_toboolean(state, -2) == B_TRUE ? - "true" : "false"); - if (saw_str_could_collide) { - key_could_collide = B_TRUE; - } - break; - case LUA_TNUMBER: - VERIFY3U(sizeof (buf), >, - snprintf(buf, sizeof (buf), "%lld", - (longlong_t)lua_tonumber(state, -2))); - key = buf; - if (saw_str_could_collide) { - key_could_collide = B_TRUE; - } - break; - default: - fnvlist_free(nvl); - (void) lua_pushfstring(state, "Invalid key " - "type '%s' in table", - lua_typename(state, lua_type(state, -2))); - return (NULL); - } - /* - * Check for type-mismatched key collisions, and throw an error. - */ - if (key_could_collide && nvlist_exists(nvl, key)) { - fnvlist_free(nvl); - (void) lua_pushfstring(state, "Collision of " - "key '%s' in table", key); - return (NULL); - } - /* - * Recursively convert the table value and insert into - * the new nvlist with the parsed key. To prevent - * stack overflow on circular or heavily nested tables, - * we track the current nvlist depth. - */ - if (depth >= ZCP_NVLIST_MAX_DEPTH) { - fnvlist_free(nvl); - (void) lua_pushfstring(state, "Maximum table " - "depth (%d) exceeded for table", - ZCP_NVLIST_MAX_DEPTH); - return (NULL); - } - err = zcp_lua_to_nvlist_impl(state, -1, nvl, key, - depth + 1); - if (err != 0) { - fnvlist_free(nvl); - /* - * Error message has been pushed to the lua - * stack by the recursive call. - */ - return (NULL); - } - /* - * Pop the value pushed by lua_next(). - */ - lua_pop(state, 1); - } - - /* - * Mark the nvlist as having unique keys. This is a little ugly, but we - * ensured above that there are no duplicate keys in the nvlist. - */ - nvl->nvl_nvflag |= NV_UNIQUE_NAME; - - return (nvl); -} - -/* - * Convert a value from the given index into the lua stack to an nvpair, adding - * it to an nvlist with the given key. - * - * Values are converted as follows: - * - * string -> string - * number -> int64 - * boolean -> boolean - * nil -> boolean (no value) - * - * Lua tables are converted to nvlists and then inserted. The table's keys - * are converted to strings then used as keys in the nvlist to store each table - * element. Keys are converted as follows: - * - * string -> no change - * number -> "%lld" - * boolean -> "true" | "false" - * nil -> error - * - * In the case of a key collision, an error is thrown. - * - * If an error is encountered, a nonzero error code is returned, and an error - * string will be pushed onto the Lua stack. - */ -static int -zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl, - const char *key, int depth) -{ - /* - * Verify that we have enough remaining space in the lua stack to parse - * a key-value pair and push an error. 
- */ - if (!lua_checkstack(state, 3)) { - (void) lua_pushstring(state, "Lua stack overflow"); - return (1); - } - - index = lua_absindex(state, index); - - switch (lua_type(state, index)) { - case LUA_TNIL: - fnvlist_add_boolean(nvl, key); - break; - case LUA_TBOOLEAN: - fnvlist_add_boolean_value(nvl, key, - lua_toboolean(state, index)); - break; - case LUA_TNUMBER: - fnvlist_add_int64(nvl, key, lua_tonumber(state, index)); - break; - case LUA_TSTRING: - fnvlist_add_string(nvl, key, lua_tostring(state, index)); - break; - case LUA_TTABLE: { - nvlist_t *value_nvl = zcp_table_to_nvlist(state, index, depth); - if (value_nvl == NULL) - return (EINVAL); - - fnvlist_add_nvlist(nvl, key, value_nvl); - fnvlist_free(value_nvl); - break; - } - default: - (void) lua_pushfstring(state, - "Invalid value type '%s' for key '%s'", - lua_typename(state, lua_type(state, index)), key); - return (EINVAL); - } - - return (0); -} - -/* - * Convert a lua value to an nvpair, adding it to an nvlist with the given key. - */ -static void -zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key) -{ - /* - * On error, zcp_lua_to_nvlist_impl pushes an error string onto the Lua - * stack before returning with a nonzero error code. If an error is - * returned, throw a fatal lua error with the given string. - */ - if (zcp_lua_to_nvlist_impl(state, index, nvl, key, 0) != 0) - (void) lua_error(state); -} - -static int -zcp_lua_to_nvlist_helper(lua_State *state) -{ - nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2); - const char *key = (const char *)lua_touserdata(state, 1); - zcp_lua_to_nvlist(state, 3, nv, key); - return (0); -} - -static void -zcp_convert_return_values(lua_State *state, nvlist_t *nvl, - const char *key, int *result) -{ - int err; - VERIFY3U(1, ==, lua_gettop(state)); - lua_pushcfunction(state, zcp_lua_to_nvlist_helper); - lua_pushlightuserdata(state, (char *)key); - lua_pushlightuserdata(state, nvl); - lua_pushvalue(state, 1); - lua_remove(state, 1); - err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */ - if (err != 0) { - zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR); - *result = SET_ERROR(ECHRNG); - } -} - -/* - * Push a Lua table representing nvl onto the stack. If it can't be - * converted, return EINVAL, fill in errbuf, and push nothing. errbuf may - * be specified as NULL, in which case no error string will be output. - * - * Most nvlists are converted as simple key->value Lua tables, but we make - * an exception for the case where all nvlist entries are BOOLEANs (a string - * key without a value). In Lua, a table key pointing to a value of Nil - * (no value) is equivalent to the key not existing, so a BOOLEAN nvlist - * entry can't be directly converted to a Lua table entry. Nvlists of entirely - * BOOLEAN entries are frequently used to pass around lists of datasets, so for - * convenience we check for this case, and convert it to a simple Lua array of - * strings. - */ -int -zcp_nvlist_to_lua(lua_State *state, nvlist_t *nvl, - char *errbuf, int errbuf_len) -{ - nvpair_t *pair; - lua_newtable(state); - boolean_t has_values = B_FALSE; - /* - * If the list doesn't have any values, just convert it to a string - * array. 
- */ - for (pair = nvlist_next_nvpair(nvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { - if (nvpair_type(pair) != DATA_TYPE_BOOLEAN) { - has_values = B_TRUE; - break; - } - } - if (!has_values) { - int i = 1; - for (pair = nvlist_next_nvpair(nvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { - (void) lua_pushinteger(state, i); - (void) lua_pushstring(state, nvpair_name(pair)); - (void) lua_settable(state, -3); - i++; - } - } else { - for (pair = nvlist_next_nvpair(nvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) { - int err = zcp_nvpair_value_to_lua(state, pair, - errbuf, errbuf_len); - if (err != 0) { - lua_pop(state, 1); - return (err); - } - (void) lua_setfield(state, -2, nvpair_name(pair)); - } - } - return (0); -} - -/* - * Push a Lua object representing the value of "pair" onto the stack. - * - * Only understands boolean_value, string, int64, nvlist, - * string_array, and int64_array type values. For other - * types, returns EINVAL, fills in errbuf, and pushes nothing. - */ -static int -zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair, - char *errbuf, int errbuf_len) -{ - int err = 0; - - if (pair == NULL) { - lua_pushnil(state); - return (0); - } - - switch (nvpair_type(pair)) { - case DATA_TYPE_BOOLEAN_VALUE: - (void) lua_pushboolean(state, - fnvpair_value_boolean_value(pair)); - break; - case DATA_TYPE_STRING: - (void) lua_pushstring(state, fnvpair_value_string(pair)); - break; - case DATA_TYPE_INT64: - (void) lua_pushinteger(state, fnvpair_value_int64(pair)); - break; - case DATA_TYPE_NVLIST: - err = zcp_nvlist_to_lua(state, - fnvpair_value_nvlist(pair), errbuf, errbuf_len); - break; - case DATA_TYPE_STRING_ARRAY: { - char **strarr; - uint_t nelem; - (void) nvpair_value_string_array(pair, &strarr, &nelem); - lua_newtable(state); - for (int i = 0; i < nelem; i++) { - (void) lua_pushinteger(state, i + 1); - (void) lua_pushstring(state, strarr[i]); - (void) lua_settable(state, -3); - } - break; - } - case DATA_TYPE_UINT64_ARRAY: { - uint64_t *intarr; - uint_t nelem; - (void) nvpair_value_uint64_array(pair, &intarr, &nelem); - lua_newtable(state); - for (int i = 0; i < nelem; i++) { - (void) lua_pushinteger(state, i + 1); - (void) lua_pushinteger(state, intarr[i]); - (void) lua_settable(state, -3); - } - break; - } - case DATA_TYPE_INT64_ARRAY: { - int64_t *intarr; - uint_t nelem; - (void) nvpair_value_int64_array(pair, &intarr, &nelem); - lua_newtable(state); - for (int i = 0; i < nelem; i++) { - (void) lua_pushinteger(state, i + 1); - (void) lua_pushinteger(state, intarr[i]); - (void) lua_settable(state, -3); - } - break; - } - default: { - if (errbuf != NULL) { - (void) snprintf(errbuf, errbuf_len, - "Unhandled nvpair type %d for key '%s'", - nvpair_type(pair), nvpair_name(pair)); - } - return (EINVAL); - } - } - return (err); -} - -int -zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname, - int error) -{ - if (error == ENOENT) { - (void) zcp_argerror(state, 1, "no such dataset '%s'", dsname); - return (0); /* not reached; zcp_argerror will longjmp */ - } else if (error == EXDEV) { - (void) zcp_argerror(state, 1, - "dataset '%s' is not in the target pool '%s'", - dsname, spa_name(dp->dp_spa)); - return (0); /* not reached; zcp_argerror will longjmp */ - } else if (error == EIO) { - (void) luaL_error(state, - "I/O error while accessing dataset '%s'", dsname); - return (0); /* not reached; luaL_error will longjmp */ - } else if (error != 0) { - (void) luaL_error(state, - "unexpected error %d while 
accessing dataset '%s'", - error, dsname); - return (0); /* not reached; luaL_error will longjmp */ - } - return (0); -} - -/* - * Note: will longjmp (via lua_error()) on error. - * Assumes that the dsname is argument #1 (for error reporting purposes). - */ -dsl_dataset_t * -zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname, - void *tag) -{ - dsl_dataset_t *ds; - int error = dsl_dataset_hold(dp, dsname, tag, &ds); - (void) zcp_dataset_hold_error(state, dp, dsname, error); - return (ds); -} - -static int zcp_debug(lua_State *); -static zcp_lib_info_t zcp_debug_info = { - .name = "debug", - .func = zcp_debug, - .pargs = { - { .za_name = "debug string", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_debug(lua_State *state) -{ - const char *dbgstring; - zcp_run_info_t *ri = zcp_run_info(state); - zcp_lib_info_t *libinfo = &zcp_debug_info; - - zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); - - dbgstring = lua_tostring(state, 1); - - zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring); - - return (0); -} - -static int zcp_exists(lua_State *); -static zcp_lib_info_t zcp_exists_info = { - .name = "exists", - .func = zcp_exists, - .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_exists(lua_State *state) -{ - zcp_run_info_t *ri = zcp_run_info(state); - dsl_pool_t *dp = ri->zri_pool; - zcp_lib_info_t *libinfo = &zcp_exists_info; - - zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); - - const char *dsname = lua_tostring(state, 1); - - dsl_dataset_t *ds; - int error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (error == 0) { - dsl_dataset_rele(ds, FTAG); - lua_pushboolean(state, B_TRUE); - } else if (error == ENOENT) { - lua_pushboolean(state, B_FALSE); - } else if (error == EXDEV) { - return (luaL_error(state, "dataset '%s' is not in the " - "target pool", dsname)); - } else if (error == EIO) { - return (luaL_error(state, "I/O error opening dataset '%s'", - dsname)); - } else if (error != 0) { - return (luaL_error(state, "unexpected error %d", error)); - } - - return (1); -} - -/* - * Allocate/realloc/free a buffer for the lua interpreter. - * - * When nsize is 0, behaves as free() and returns NULL. - * - * If ptr is NULL, behaves as malloc() and returns an allocated buffer of size - * at least nsize. - * - * Otherwise, behaves as realloc(), changing the allocation from osize to nsize. - * Shrinking the buffer size never fails. - * - * The original allocated buffer size is stored as a uint64 at the beginning of - * the buffer to avoid actually reallocating when shrinking a buffer, since lua - * requires that this operation never fail. - */ -static void * -zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) -{ - zcp_alloc_arg_t *allocargs = ud; - int flags = (allocargs->aa_must_succeed) ? 
- KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI); - - if (nsize == 0) { - if (ptr != NULL) { - int64_t *allocbuf = (int64_t *)ptr - 1; - int64_t allocsize = *allocbuf; - ASSERT3S(allocsize, >, 0); - ASSERT3S(allocargs->aa_alloc_remaining + allocsize, <=, - allocargs->aa_alloc_limit); - allocargs->aa_alloc_remaining += allocsize; - kmem_free(allocbuf, allocsize); - } - return (NULL); - } else if (ptr == NULL) { - int64_t *allocbuf; - int64_t allocsize = nsize + sizeof (int64_t); - - if (!allocargs->aa_must_succeed && - (allocsize <= 0 || - allocsize > allocargs->aa_alloc_remaining)) { - return (NULL); - } - - allocbuf = kmem_alloc(allocsize, flags); - if (allocbuf == NULL) { - return (NULL); - } - allocargs->aa_alloc_remaining -= allocsize; - - *allocbuf = allocsize; - return (allocbuf + 1); - } else if (nsize <= osize) { - /* - * If shrinking the buffer, lua requires that the reallocation - * never fail. - */ - return (ptr); - } else { - ASSERT3U(nsize, >, osize); - - uint64_t *luabuf = zcp_lua_alloc(ud, NULL, 0, nsize); - if (luabuf == NULL) { - return (NULL); - } - (void) memcpy(luabuf, ptr, osize); - VERIFY3P(zcp_lua_alloc(ud, ptr, osize, 0), ==, NULL); - return (luabuf); - } -} - -/* ARGSUSED */ -static void -zcp_lua_counthook(lua_State *state, lua_Debug *ar) -{ - lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); - zcp_run_info_t *ri = lua_touserdata(state, -1); - - /* - * Check if we were canceled while waiting for the - * txg to sync or from our open context thread - */ - if (ri->zri_canceled || - (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) { - ri->zri_canceled = B_TRUE; - (void) lua_pushstring(state, "Channel program was canceled."); - (void) lua_error(state); - } - - /* - * Check how many instructions the channel program has - * executed so far, and compare against the limit. - */ - ri->zri_curinstrs += zfs_lua_check_instrlimit_interval; - if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) { - ri->zri_timed_out = B_TRUE; - (void) lua_pushstring(state, - "Channel program timed out."); - (void) lua_error(state); - } -} - -static int -zcp_panic_cb(lua_State *state) -{ - panic("unprotected error in call to Lua API (%s)\n", - lua_tostring(state, -1)); - return (0); -} - -static void -zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri) -{ - int err; - lua_State *state = ri->zri_state; - - VERIFY3U(3, ==, lua_gettop(state)); - - /* finish initializing our runtime state */ - ri->zri_pool = dmu_tx_pool(tx); - ri->zri_tx = tx; - list_create(&ri->zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t), - offsetof(zcp_cleanup_handler_t, zch_node)); - - /* - * Store the zcp_run_info_t struct for this run in the Lua registry. - * Registry entries are not directly accessible by the Lua scripts but - * can be accessed by our callbacks. - */ - lua_pushlightuserdata(state, ri); - lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); - VERIFY3U(3, ==, lua_gettop(state)); - - /* - * Tell the Lua interpreter to call our handler every count - * instructions. Channel programs that execute too many instructions - * should die with ETIMEDOUT. - */ - (void) lua_sethook(state, zcp_lua_counthook, LUA_MASKCOUNT, - zfs_lua_check_instrlimit_interval); - - /* - * Tell the Lua memory allocator to stop using KM_SLEEP before handing - * off control to the channel program. Channel programs that use too - * much memory should die with ENOSPC. - */ - ri->zri_allocargs->aa_must_succeed = B_FALSE; - - /* - * Call the Lua function that open-context passed us. 
This pops the - * function and its input from the stack and pushes any return - * or error values. - */ - err = lua_pcall(state, 1, LUA_MULTRET, 1); - - /* - * Let Lua use KM_SLEEP while we interpret the return values. - */ - ri->zri_allocargs->aa_must_succeed = B_TRUE; - - /* - * Remove the error handler callback from the stack. At this point, - * there shouldn't be any cleanup handler registered in the handler - * list (zri_cleanup_handlers), regardless of whether it ran or not. - */ - list_destroy(&ri->zri_cleanup_handlers); - lua_remove(state, 1); - - switch (err) { - case LUA_OK: { - /* - * Lua supports returning multiple values in a single return - * statement. Return values will have been pushed onto the - * stack: - * 1: Return value 1 - * 2: Return value 2 - * 3: etc... - * To simplify the process of retrieving a return value from a - * channel program, we disallow returning more than one value - * to ZFS from the Lua script, yielding a singleton return - * nvlist of the form { "return": Return value 1 }. - */ - int return_count = lua_gettop(state); - - if (return_count == 1) { - ri->zri_result = 0; - zcp_convert_return_values(state, ri->zri_outnvl, - ZCP_RET_RETURN, &ri->zri_result); - } else if (return_count > 1) { - ri->zri_result = SET_ERROR(ECHRNG); - lua_settop(state, 0); - (void) lua_pushfstring(state, "Multiple return " - "values not supported"); - zcp_convert_return_values(state, ri->zri_outnvl, - ZCP_RET_ERROR, &ri->zri_result); - } - break; - } - case LUA_ERRRUN: - case LUA_ERRGCMM: { - /* - * The channel program encountered a fatal error within the - * script, such as failing an assertion, or calling a function - * with incompatible arguments. The error value and the - * traceback generated by zcp_error_handler() should be on the - * stack. - */ - VERIFY3U(1, ==, lua_gettop(state)); - if (ri->zri_timed_out) { - ri->zri_result = SET_ERROR(ETIME); - } else if (ri->zri_canceled) { - ri->zri_result = SET_ERROR(EINTR); - } else { - ri->zri_result = SET_ERROR(ECHRNG); - } - - zcp_convert_return_values(state, ri->zri_outnvl, - ZCP_RET_ERROR, &ri->zri_result); - break; - } - case LUA_ERRERR: { - /* - * The channel program encountered a fatal error within the - * script, and we encountered another error while trying to - * compute the traceback in zcp_error_handler(). We can only - * return the error message. - */ - VERIFY3U(1, ==, lua_gettop(state)); - if (ri->zri_timed_out) { - ri->zri_result = SET_ERROR(ETIME); - } else if (ri->zri_canceled) { - ri->zri_result = SET_ERROR(EINTR); - } else { - ri->zri_result = SET_ERROR(ECHRNG); - } - - zcp_convert_return_values(state, ri->zri_outnvl, - ZCP_RET_ERROR, &ri->zri_result); - break; - } - case LUA_ERRMEM: - /* - * Lua ran out of memory while running the channel program. - * There's not much we can do. - */ - ri->zri_result = SET_ERROR(ENOSPC); - break; - default: - VERIFY0(err); - } -} - -static void -zcp_pool_error(zcp_run_info_t *ri, const char *poolname) -{ - ri->zri_result = SET_ERROR(ECHRNG); - lua_settop(ri->zri_state, 0); - (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s", - poolname); - zcp_convert_return_values(ri->zri_state, ri->zri_outnvl, - ZCP_RET_ERROR, &ri->zri_result); - -} - -/* - * This callback is called when txg_wait_synced_sig encountered a signal. - * The txg_wait_synced_sig will continue to wait for the txg to complete - * after calling this callback. 
- */ -/* ARGSUSED */ -static void -zcp_eval_sig(void *arg, dmu_tx_t *tx) -{ - zcp_run_info_t *ri = arg; - - ri->zri_canceled = B_TRUE; -} - -static void -zcp_eval_sync(void *arg, dmu_tx_t *tx) -{ - zcp_run_info_t *ri = arg; - - /* - * Open context should have setup the stack to contain: - * 1: Error handler callback - * 2: Script to run (converted to a Lua function) - * 3: nvlist input to function (converted to Lua table or nil) - */ - VERIFY3U(3, ==, lua_gettop(ri->zri_state)); - - zcp_eval_impl(tx, ri); -} - -static void -zcp_eval_open(zcp_run_info_t *ri, const char *poolname) -{ - int error; - dsl_pool_t *dp; - dmu_tx_t *tx; - - /* - * See comment from the same assertion in zcp_eval_sync(). - */ - VERIFY3U(3, ==, lua_gettop(ri->zri_state)); - - error = dsl_pool_hold(poolname, FTAG, &dp); - if (error != 0) { - zcp_pool_error(ri, poolname); - return; - } - - /* - * As we are running in open-context, we have no transaction associated - * with the channel program. At the same time, functions from the - * zfs.check submodule need to be associated with a transaction as - * they are basically dry-runs of their counterparts in the zfs.sync - * submodule. These functions should be able to run in open-context. - * Therefore we create a new transaction that we later abort once - * the channel program has been evaluated. - */ - tx = dmu_tx_create_dd(dp->dp_mos_dir); - - zcp_eval_impl(tx, ri); - - dmu_tx_abort(tx); - - dsl_pool_rele(dp, FTAG); -} - -int -zcp_eval(const char *poolname, const char *program, boolean_t sync, - uint64_t instrlimit, uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl) -{ - int err; - lua_State *state; - zcp_run_info_t runinfo; - - if (instrlimit > zfs_lua_max_instrlimit) - return (SET_ERROR(EINVAL)); - if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) - return (SET_ERROR(EINVAL)); - - zcp_alloc_arg_t allocargs = { - .aa_must_succeed = B_TRUE, - .aa_alloc_remaining = (int64_t)memlimit, - .aa_alloc_limit = (int64_t)memlimit, - }; - - /* - * Creates a Lua state with a memory allocator that uses KM_SLEEP. - * This should never fail. - */ - state = lua_newstate(zcp_lua_alloc, &allocargs); - VERIFY(state != NULL); - (void) lua_atpanic(state, zcp_panic_cb); - - /* - * Load core Lua libraries we want access to. - */ - VERIFY3U(1, ==, luaopen_base(state)); - lua_pop(state, 1); - VERIFY3U(1, ==, luaopen_coroutine(state)); - lua_setglobal(state, LUA_COLIBNAME); - VERIFY0(lua_gettop(state)); - VERIFY3U(1, ==, luaopen_string(state)); - lua_setglobal(state, LUA_STRLIBNAME); - VERIFY0(lua_gettop(state)); - VERIFY3U(1, ==, luaopen_table(state)); - lua_setglobal(state, LUA_TABLIBNAME); - VERIFY0(lua_gettop(state)); - - /* - * Load globally visible variables such as errno aliases. - */ - zcp_load_globals(state); - VERIFY0(lua_gettop(state)); - - /* - * Load ZFS-specific modules. - */ - lua_newtable(state); - VERIFY3U(1, ==, zcp_load_list_lib(state)); - lua_setfield(state, -2, "list"); - VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_FALSE)); - lua_setfield(state, -2, "check"); - VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_TRUE)); - lua_setfield(state, -2, "sync"); - VERIFY3U(1, ==, zcp_load_get_lib(state)); - lua_pushcclosure(state, zcp_debug_info.func, 0); - lua_setfield(state, -2, zcp_debug_info.name); - lua_pushcclosure(state, zcp_exists_info.func, 0); - lua_setfield(state, -2, zcp_exists_info.name); - lua_setglobal(state, "zfs"); - VERIFY0(lua_gettop(state)); - - /* - * Push the error-callback that calculates Lua stack traces on - * unexpected failures. 
- */ - lua_pushcfunction(state, zcp_error_handler); - VERIFY3U(1, ==, lua_gettop(state)); - - /* - * Load the actual script as a function onto the stack as text ("t"). - * The only valid error condition is a syntax error in the script. - * ERRMEM should not be possible because our allocator is using - * KM_SLEEP. ERRGCMM should not be possible because we have not added - * any objects with __gc metamethods to the interpreter that could - * fail. - */ - err = luaL_loadbufferx(state, program, strlen(program), - "channel program", "t"); - if (err == LUA_ERRSYNTAX) { - fnvlist_add_string(outnvl, ZCP_RET_ERROR, - lua_tostring(state, -1)); - lua_close(state); - return (SET_ERROR(EINVAL)); - } - VERIFY0(err); - VERIFY3U(2, ==, lua_gettop(state)); - - /* - * Convert the input nvlist to a Lua object and put it on top of the - * stack. - */ - char errmsg[128]; - err = zcp_nvpair_value_to_lua(state, nvarg, - errmsg, sizeof (errmsg)); - if (err != 0) { - fnvlist_add_string(outnvl, ZCP_RET_ERROR, errmsg); - lua_close(state); - return (SET_ERROR(EINVAL)); - } - VERIFY3U(3, ==, lua_gettop(state)); - - runinfo.zri_state = state; - runinfo.zri_allocargs = &allocargs; - runinfo.zri_outnvl = outnvl; - runinfo.zri_result = 0; - runinfo.zri_cred = CRED(); - runinfo.zri_timed_out = B_FALSE; - runinfo.zri_canceled = B_FALSE; - runinfo.zri_sync = sync; - runinfo.zri_space_used = 0; - runinfo.zri_curinstrs = 0; - runinfo.zri_maxinstrs = instrlimit; - - if (sync) { - err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync, - zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL); - if (err != 0) - zcp_pool_error(&runinfo, poolname); - } else { - zcp_eval_open(&runinfo, poolname); - } - lua_close(state); - - return (runinfo.zri_result); -} - -/* - * Retrieve metadata about the currently running channel program. - */ -zcp_run_info_t * -zcp_run_info(lua_State *state) -{ - zcp_run_info_t *ri; - - lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); - ri = lua_touserdata(state, -1); - lua_pop(state, 1); - return (ri); -} - -/* - * Argument Parsing - * ================ - * - * The Lua language allows methods to be called with any number - * of arguments of any type. When calling back into ZFS we need to sanitize - * arguments from channel programs to make sure unexpected arguments or - * arguments of the wrong type result in clear error messages. To do this - * in a uniform way all callbacks from channel programs should use the - * zcp_parse_args() function to interpret inputs. - * - * Positional vs Keyword Arguments - * =============================== - * - * Every callback function takes a fixed set of required positional arguments - * and optional keyword arguments. For example, the destroy function takes - * a single positional string argument (the name of the dataset to destroy) - * and an optional "defer" keyword boolean argument. When calling lua functions - * with parentheses, only positional arguments can be used: - * - * zfs.sync.snapshot("rpool@snap") - * - * To use keyword arguments functions should be called with a single argument - * that is a lua table containing mappings of integer -> positional arguments - * and string -> keyword arguments: - * - * zfs.sync.snapshot({1="rpool@snap", defer=true}) - * - * The lua language allows curly braces to be used in place of parenthesis as - * syntactic sugar for this calling convention: - * - * zfs.sync.snapshot{"rpool@snap", defer=true} - */ - -/* - * Throw an error and print the given arguments. 
If there are too many - * arguments to fit in the output buffer, only the error format string is - * output. - */ -static void -zcp_args_error(lua_State *state, const char *fname, const zcp_arg_t *pargs, - const zcp_arg_t *kwargs, const char *fmt, ...) -{ - int i; - char errmsg[512]; - size_t len = sizeof (errmsg); - size_t msglen = 0; - va_list argp; - - va_start(argp, fmt); - VERIFY3U(len, >, vsnprintf(errmsg, len, fmt, argp)); - va_end(argp); - - /* - * Calculate the total length of the final string, including extra - * formatting characters. If the argument dump would be too large, - * only print the error string. - */ - msglen = strlen(errmsg); - msglen += strlen(fname) + 4; /* : + {} + null terminator */ - for (i = 0; pargs[i].za_name != NULL; i++) { - msglen += strlen(pargs[i].za_name); - msglen += strlen(lua_typename(state, pargs[i].za_lua_type)); - if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) - msglen += 5; /* < + ( + )> + , */ - else - msglen += 4; /* < + ( + )> */ - } - for (i = 0; kwargs[i].za_name != NULL; i++) { - msglen += strlen(kwargs[i].za_name); - msglen += strlen(lua_typename(state, kwargs[i].za_lua_type)); - if (kwargs[i + 1].za_name != NULL) - msglen += 4; /* =( + ) + , */ - else - msglen += 3; /* =( + ) */ - } - - if (msglen >= len) - (void) luaL_error(state, errmsg); - - VERIFY3U(len, >, strlcat(errmsg, ": ", len)); - VERIFY3U(len, >, strlcat(errmsg, fname, len)); - VERIFY3U(len, >, strlcat(errmsg, "{", len)); - for (i = 0; pargs[i].za_name != NULL; i++) { - VERIFY3U(len, >, strlcat(errmsg, "<", len)); - VERIFY3U(len, >, strlcat(errmsg, pargs[i].za_name, len)); - VERIFY3U(len, >, strlcat(errmsg, "(", len)); - VERIFY3U(len, >, strlcat(errmsg, - lua_typename(state, pargs[i].za_lua_type), len)); - VERIFY3U(len, >, strlcat(errmsg, ")>", len)); - if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) { - VERIFY3U(len, >, strlcat(errmsg, ", ", len)); - } - } - for (i = 0; kwargs[i].za_name != NULL; i++) { - VERIFY3U(len, >, strlcat(errmsg, kwargs[i].za_name, len)); - VERIFY3U(len, >, strlcat(errmsg, "=(", len)); - VERIFY3U(len, >, strlcat(errmsg, - lua_typename(state, kwargs[i].za_lua_type), len)); - VERIFY3U(len, >, strlcat(errmsg, ")", len)); - if (kwargs[i + 1].za_name != NULL) { - VERIFY3U(len, >, strlcat(errmsg, ", ", len)); - } - } - VERIFY3U(len, >, strlcat(errmsg, "}", len)); - - (void) luaL_error(state, errmsg); - panic("unreachable code"); -} - -static void -zcp_parse_table_args(lua_State *state, const char *fname, - const zcp_arg_t *pargs, const zcp_arg_t *kwargs) -{ - int i; - int type; - - for (i = 0; pargs[i].za_name != NULL; i++) { - /* - * Check the table for this positional argument, leaving it - * on the top of the stack once we finish validating it. - */ - lua_pushinteger(state, i + 1); - lua_gettable(state, 1); - - type = lua_type(state, -1); - if (type == LUA_TNIL) { - zcp_args_error(state, fname, pargs, kwargs, - "too few arguments"); - panic("unreachable code"); - } else if (type != pargs[i].za_lua_type) { - zcp_args_error(state, fname, pargs, kwargs, - "arg %d wrong type (is '%s', expected '%s')", - i + 1, lua_typename(state, type), - lua_typename(state, pargs[i].za_lua_type)); - panic("unreachable code"); - } - - /* - * Remove the positional argument from the table. - */ - lua_pushinteger(state, i + 1); - lua_pushnil(state); - lua_settable(state, 1); - } - - for (i = 0; kwargs[i].za_name != NULL; i++) { - /* - * Check the table for this keyword argument, which may be - * nil if it was omitted. 
Leave the value on the top of - * the stack after validating it. - */ - lua_getfield(state, 1, kwargs[i].za_name); - - type = lua_type(state, -1); - if (type != LUA_TNIL && type != kwargs[i].za_lua_type) { - zcp_args_error(state, fname, pargs, kwargs, - "kwarg '%s' wrong type (is '%s', expected '%s')", - kwargs[i].za_name, lua_typename(state, type), - lua_typename(state, kwargs[i].za_lua_type)); - panic("unreachable code"); - } - - /* - * Remove the keyword argument from the table. - */ - lua_pushnil(state); - lua_setfield(state, 1, kwargs[i].za_name); - } - - /* - * Any entries remaining in the table are invalid inputs, print - * an error message based on what the entry is. - */ - lua_pushnil(state); - if (lua_next(state, 1)) { - if (lua_isnumber(state, -2) && lua_tointeger(state, -2) > 0) { - zcp_args_error(state, fname, pargs, kwargs, - "too many positional arguments"); - } else if (lua_isstring(state, -2)) { - zcp_args_error(state, fname, pargs, kwargs, - "invalid kwarg '%s'", lua_tostring(state, -2)); - } else { - zcp_args_error(state, fname, pargs, kwargs, - "kwarg keys must be strings"); - } - panic("unreachable code"); - } - - lua_remove(state, 1); -} - -static void -zcp_parse_pos_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, - const zcp_arg_t *kwargs) -{ - int i; - int type; - - for (i = 0; pargs[i].za_name != NULL; i++) { - type = lua_type(state, i + 1); - if (type == LUA_TNONE) { - zcp_args_error(state, fname, pargs, kwargs, - "too few arguments"); - panic("unreachable code"); - } else if (type != pargs[i].za_lua_type) { - zcp_args_error(state, fname, pargs, kwargs, - "arg %d wrong type (is '%s', expected '%s')", - i + 1, lua_typename(state, type), - lua_typename(state, pargs[i].za_lua_type)); - panic("unreachable code"); - } - } - if (lua_gettop(state) != i) { - zcp_args_error(state, fname, pargs, kwargs, - "too many positional arguments"); - panic("unreachable code"); - } - - for (i = 0; kwargs[i].za_name != NULL; i++) { - lua_pushnil(state); - } -} - -/* - * Checks the current Lua stack against an expected set of positional and - * keyword arguments. If the stack does not match the expected arguments - * aborts the current channel program with a useful error message, otherwise - * it re-arranges the stack so that it contains the positional arguments - * followed by the keyword argument values in declaration order. Any missing - * keyword argument will be represented by a nil value on the stack. - * - * If the stack contains exactly one argument of type LUA_TTABLE the curly - * braces calling convention is assumed, otherwise the stack is parsed for - * positional arguments only. - * - * This function should be used by every function callback. It should be called - * before the callback manipulates the Lua stack as it assumes the stack - * represents the function arguments. - */ -void -zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, - const zcp_arg_t *kwargs) -{ - if (lua_gettop(state) == 1 && lua_istable(state, 1)) { - zcp_parse_table_args(state, fname, pargs, kwargs); - } else { - zcp_parse_pos_args(state, fname, pargs, kwargs); - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c +++ /dev/null @@ -1,865 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. - */ - -#include "lua.h" -#include "lualib.h" -#include "lauxlib.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _KERNEL -#include -#endif - -static int -get_objset_type(dsl_dataset_t *ds, zfs_type_t *type) -{ - int error; - objset_t *os; - error = dmu_objset_from_ds(ds, &os); - if (error != 0) - return (error); - if (ds->ds_is_snapshot) { - *type = ZFS_TYPE_SNAPSHOT; - } else { - switch (os->os_phys->os_type) { - case DMU_OST_ZFS: - *type = ZFS_TYPE_FILESYSTEM; - break; - case DMU_OST_ZVOL: - *type = ZFS_TYPE_VOLUME; - break; - default: - return (EINVAL); - } - } - return (0); -} - -/* - * Returns the string name of ds's type in str (a buffer which should be - * at least 12 bytes long). - */ -static int -get_objset_type_name(dsl_dataset_t *ds, char *str) -{ - int error; - zfs_type_t type; - error = get_objset_type(ds, &type); - if (error != 0) - return (error); - switch (type) { - case ZFS_TYPE_SNAPSHOT: - (void) strcpy(str, "snapshot"); - break; - case ZFS_TYPE_FILESYSTEM: - (void) strcpy(str, "filesystem"); - break; - case ZFS_TYPE_VOLUME: - (void) strcpy(str, "volume"); - break; - default: - return (EINVAL); - } - return (0); -} - -/* - * Determines the source of a property given its setpoint and - * property type. It pushes the source to the lua stack. - */ -static void -get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop) -{ - if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) { - lua_pushnil(state); - } else { - const char *src; - if (strcmp("", setpoint) == 0) { - src = "default"; - } else { - src = setpoint; - } - (void) lua_pushstring(state, src); - } -} - -/* - * Given an error encountered while getting properties, either longjmp's for - * a fatal error or pushes nothing to the stack for a non fatal one. - */ -static int -zcp_handle_error(lua_State *state, const char *dataset_name, - const char *property_name, int error) -{ - ASSERT3S(error, !=, 0); - if (error == ENOENT) { - return (0); - } else if (error == EINVAL) { - return (luaL_error(state, - "property '%s' is not a valid property on dataset '%s'", - property_name, dataset_name)); - } else if (error == EIO) { - return (luaL_error(state, - "I/O error while retrieving property '%s' on dataset '%s'", - property_name, dataset_name)); - } else { - return (luaL_error(state, "unexpected error %d while " - "retrieving property '%s' on dataset '%s'", - error, property_name, dataset_name)); - } -} - -/* - * Look up a user defined property in the zap object. If it exists, push it - * and the setpoint onto the stack, otherwise don't push anything. - */ -static int -zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, - const char *property_name) -{ - int error; - char *buf; - char setpoint[ZFS_MAX_DATASET_NAME_LEN]; - /* - * zcp_dataset_hold will either successfully return the requested - * dataset or throw a lua error and longjmp out of the zfs.get_prop call - * without returning. 
- */ - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN, - buf, setpoint); - dsl_dataset_rele(ds, FTAG); - - if (error != 0) { - kmem_free(buf, ZAP_MAXVALUELEN); - return (zcp_handle_error(state, dataset_name, property_name, - error)); - } - (void) lua_pushstring(state, buf); - (void) lua_pushstring(state, setpoint); - kmem_free(buf, ZAP_MAXVALUELEN); - return (2); -} - -/* - * Check if the property we're looking for is stored in the ds_dir. If so, - * return it in the 'val' argument. Return 0 on success and ENOENT and if - * the property is not present. - */ -static int -get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, - uint64_t *val) -{ - dsl_dir_t *dd = ds->ds_dir; - mutex_enter(&dd->dd_lock); - switch (zfs_prop) { - case ZFS_PROP_USEDSNAP: - *val = dsl_dir_get_usedsnap(dd); - break; - case ZFS_PROP_USEDCHILD: - *val = dsl_dir_get_usedchild(dd); - break; - case ZFS_PROP_USEDDS: - *val = dsl_dir_get_usedds(dd); - break; - case ZFS_PROP_USEDREFRESERV: - *val = dsl_dir_get_usedrefreserv(dd); - break; - case ZFS_PROP_LOGICALUSED: - *val = dsl_dir_get_logicalused(dd); - break; - default: - mutex_exit(&dd->dd_lock); - return (ENOENT); - } - mutex_exit(&dd->dd_lock); - return (0); -} - -/* - * Takes a dataset, a property, a value and that value's setpoint as - * found in the ZAP. Checks if the property has been changed in the vfs. - * If so, val and setpoint will be overwritten with updated content. - * Otherwise, they are left unchanged. - */ -static int -get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, - char *setpoint) -{ -#ifndef _KERNEL - return (0); -#else - int error; -#ifdef illumos - zfsvfs_t *zfvp; -#endif - vfs_t *vfsp; - objset_t *os; - uint64_t tmp = *val; - - error = dmu_objset_from_ds(ds, &os); - if (error != 0) - return (error); - - error = getzfsvfs_impl(os, &vfsp); - if (error != 0) - return (error); -#ifdef illumos - vfsp = zfvp->z_vfs; -#endif - switch (zfs_prop) { - case ZFS_PROP_ATIME: - if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) - tmp = 1; - break; - case ZFS_PROP_DEVICES: - if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) - tmp = 1; - break; - case ZFS_PROP_EXEC: - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) - tmp = 1; - break; - case ZFS_PROP_SETUID: - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) - tmp = 1; - break; - case ZFS_PROP_READONLY: - if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) - tmp = 1; - break; - case ZFS_PROP_XATTR: - if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) - tmp = 1; - break; - case ZFS_PROP_NBMAND: - if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) - tmp = 0; - if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) - tmp = 1; - break; - default: -#ifdef illumos - VFS_RELE(vfsp); -#else - vfs_rel(vfsp); -#endif - return (ENOENT); - } - -#ifdef illumos - VFS_RELE(vfsp); -#else - vfs_rel(vfsp); -#endif - if (tmp != *val) { - (void) strcpy(setpoint, "temporary"); - *val = tmp; - } - return (0); -#endif -} - -/* - * Check if the property we're 
looking for is stored at the dsl_dataset or - * dsl_dir level. If so, push the property value and source onto the lua stack - * and return 0. If it is not present or a failure occurs in lookup, return a - * non-zero error value. - */ -static int -get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, - zfs_prop_t zfs_prop) -{ - int error = 0; - objset_t *os; - uint64_t numval; - char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - char setpoint[ZFS_MAX_DATASET_NAME_LEN] = - "Internal error - setpoint not determined"; - zfs_type_t ds_type; - zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); - (void) get_objset_type(ds, &ds_type); - - switch (zfs_prop) { - case ZFS_PROP_REFRATIO: - numval = dsl_get_refratio(ds); - break; - case ZFS_PROP_USED: - numval = dsl_get_used(ds); - break; - case ZFS_PROP_CLONES: { - nvlist_t *clones = fnvlist_alloc(); - error = get_clones_stat_impl(ds, clones); - if (error == 0) { - /* push list to lua stack */ - VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0)); - /* source */ - (void) lua_pushnil(state); - } - nvlist_free(clones); - kmem_free(strval, ZAP_MAXVALUELEN); - return (error); - } - case ZFS_PROP_COMPRESSRATIO: - numval = dsl_get_compressratio(ds); - break; - case ZFS_PROP_CREATION: - numval = dsl_get_creation(ds); - break; - case ZFS_PROP_REFERENCED: - numval = dsl_get_referenced(ds); - break; - case ZFS_PROP_AVAILABLE: - numval = dsl_get_available(ds); - break; - case ZFS_PROP_LOGICALREFERENCED: - numval = dsl_get_logicalreferenced(ds); - break; - case ZFS_PROP_CREATETXG: - numval = dsl_get_creationtxg(ds); - break; - case ZFS_PROP_GUID: - numval = dsl_get_guid(ds); - break; - case ZFS_PROP_UNIQUE: - numval = dsl_get_unique(ds); - break; - case ZFS_PROP_OBJSETID: - numval = dsl_get_objsetid(ds); - break; - case ZFS_PROP_ORIGIN: - dsl_dir_get_origin(ds->ds_dir, strval); - break; - case ZFS_PROP_USERACCOUNTING: - error = dmu_objset_from_ds(ds, &os); - if (error == 0) - numval = dmu_objset_userspace_present(os); - break; - case ZFS_PROP_WRITTEN: - error = dsl_get_written(ds, &numval); - break; - case ZFS_PROP_TYPE: - error = get_objset_type_name(ds, strval); - break; - case ZFS_PROP_PREV_SNAP: - error = dsl_get_prev_snap(ds, strval); - break; - case ZFS_PROP_NAME: - dsl_dataset_name(ds, strval); - break; - case ZFS_PROP_MOUNTPOINT: - error = dsl_get_mountpoint(ds, dsname, strval, setpoint); - break; - case ZFS_PROP_VERSION: - /* should be a snapshot or filesystem */ - ASSERT(ds_type != ZFS_TYPE_VOLUME); - error = dmu_objset_from_ds(ds, &os); - /* look in the master node for the version */ - if (error == 0) { - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - sizeof (numval), 1, &numval); - } - break; - case ZFS_PROP_DEFER_DESTROY: - numval = dsl_get_defer_destroy(ds); - break; - case ZFS_PROP_USERREFS: - numval = dsl_get_userrefs(ds); - break; - case ZFS_PROP_FILESYSTEM_COUNT: - error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval); - (void) strcpy(setpoint, ""); - break; - case ZFS_PROP_SNAPSHOT_COUNT: - error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval); - (void) strcpy(setpoint, ""); - break; - case ZFS_PROP_REMAPTXG: - error = dsl_dir_get_remaptxg(ds->ds_dir, &numval); - break; - case ZFS_PROP_NUMCLONES: - numval = dsl_get_numclones(ds); - break; - case ZFS_PROP_INCONSISTENT: - numval = dsl_get_inconsistent(ds); - break; - case ZFS_PROP_RECEIVE_RESUME_TOKEN: { - char *token = get_receive_resume_stats_impl(ds); - VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), <, - ZAP_MAXVALUELEN); - strfree(token); - if 
(strcmp(strval, "") == 0) { - token = get_child_receive_stats(ds); - VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), <, - ZAP_MAXVALUELEN); - strfree(token); - if (strcmp(strval, "") == 0) - error = ENOENT; - } - break; - } - case ZFS_PROP_VOLSIZE: - ASSERT(ds_type == ZFS_TYPE_VOLUME); - error = dmu_objset_from_ds(ds, &os); - if (error == 0) { - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", - sizeof (numval), 1, &numval); - } - if (error == 0) - (void) strcpy(setpoint, dsname); - - break; - case ZFS_PROP_VOLBLOCKSIZE: { - ASSERT(ds_type == ZFS_TYPE_VOLUME); - dmu_object_info_t doi; - error = dmu_objset_from_ds(ds, &os); - if (error == 0) { - error = dmu_object_info(os, ZVOL_OBJ, &doi); - if (error == 0) - numval = doi.doi_data_block_size; - } - break; - } - default: - /* Did not match these props, check in the dsl_dir */ - error = get_dsl_dir_prop(ds, zfs_prop, &numval); - } - if (error != 0) { - kmem_free(strval, ZAP_MAXVALUELEN); - return (error); - } - - switch (prop_type) { - case PROP_TYPE_NUMBER: { - (void) lua_pushnumber(state, numval); - break; - } - case PROP_TYPE_STRING: { - (void) lua_pushstring(state, strval); - break; - } - case PROP_TYPE_INDEX: { - const char *propval; - error = zfs_prop_index_to_string(zfs_prop, numval, &propval); - if (error != 0) { - kmem_free(strval, ZAP_MAXVALUELEN); - return (error); - } - (void) lua_pushstring(state, propval); - break; - } - } - kmem_free(strval, ZAP_MAXVALUELEN); - - /* Push the source to the stack */ - get_prop_src(state, setpoint, zfs_prop); - return (0); -} - -/* - * Look up a property and its source in the zap object. If the value is - * present and successfully retrieved, push the value and source on the - * lua stack and return 0. On failure, return a non-zero error value. - */ -static int -get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) -{ - int error = 0; - char setpoint[ZFS_MAX_DATASET_NAME_LEN]; - char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); - uint64_t numval; - const char *prop_name = zfs_prop_to_name(zfs_prop); - zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); - - if (prop_type == PROP_TYPE_STRING) { - /* Push value to lua stack */ - error = dsl_prop_get_ds(ds, prop_name, 1, - ZAP_MAXVALUELEN, strval, setpoint); - if (error == 0) - (void) lua_pushstring(state, strval); - } else { - error = dsl_prop_get_ds(ds, prop_name, sizeof (numval), - 1, &numval, setpoint); - - /* Fill in temorary value for prop, if applicable */ - (void) get_temporary_prop(ds, zfs_prop, &numval, setpoint); - - /* Push value to lua stack */ - if (prop_type == PROP_TYPE_INDEX) { - const char *propval; - error = zfs_prop_index_to_string(zfs_prop, numval, - &propval); - if (error == 0) - (void) lua_pushstring(state, propval); - } else { - if (error == 0) - (void) lua_pushnumber(state, numval); - } - } - kmem_free(strval, ZAP_MAXVALUELEN); - if (error == 0) - get_prop_src(state, setpoint, zfs_prop); - return (error); -} - -/* - * Determine whether property is valid for a given dataset - */ -boolean_t -prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop) -{ - int error; - zfs_type_t zfs_type; - - /* properties not supported */ - if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) || - (zfs_prop == ZFS_PROP_MOUNTED)) - return (B_FALSE); - - /* if we want the origin prop, ds must be a clone */ - if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir))) - return (B_FALSE); - - error = get_objset_type(ds, &zfs_type); - if (error != 0) - return (B_FALSE); - return (zfs_prop_valid_for_type(zfs_prop, zfs_type)); -} - -/* 
- * Look up a given dataset property. On success return 2, the number of - * values pushed to the lua stack (property value and source). On a fatal - * error, longjmp. On a non fatal error push nothing. - */ -static int -zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, - zfs_prop_t zfs_prop) -{ - int error; - /* - * zcp_dataset_hold will either successfully return the requested - * dataset or throw a lua error and longjmp out of the zfs.get_prop call - * without returning. - */ - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - /* Check that the property is valid for the given dataset */ - const char *prop_name = zfs_prop_to_name(zfs_prop); - if (!prop_valid_for_ds(ds, zfs_prop)) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - /* Check if the property can be accessed directly */ - error = get_special_prop(state, ds, dataset_name, zfs_prop); - if (error == 0) { - dsl_dataset_rele(ds, FTAG); - /* The value and source have been pushed by get_special_prop */ - return (2); - } - if (error != ENOENT) { - dsl_dataset_rele(ds, FTAG); - return (zcp_handle_error(state, dataset_name, - prop_name, error)); - } - - /* If we were unable to find it, look in the zap object */ - error = get_zap_prop(state, ds, zfs_prop); - dsl_dataset_rele(ds, FTAG); - if (error != 0) { - return (zcp_handle_error(state, dataset_name, - prop_name, error)); - } - /* The value and source have been pushed by get_zap_prop */ - return (2); -} - -static zfs_userquota_prop_t -get_userquota_prop(const char *prop_name) -{ - zfs_userquota_prop_t type; - /* Figure out the property type ({user|group}{quota|used}) */ - for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { - if (strncmp(prop_name, zfs_userquota_prop_prefixes[type], - strlen(zfs_userquota_prop_prefixes[type])) == 0) - break; - } - return (type); -} - -#ifdef _KERNEL -/* - * Given the name of a zfs_userquota_prop, this function determines the - * prop type as well as the numeric group/user ids based on the string - * following the '@' in the property name. On success, returns 0. On failure, - * returns a non-zero error. - * 'domain' must be free'd by caller using strfree() - */ -static int -parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, - char **domain, uint64_t *rid) -{ - char *cp, *end, *domain_val; - - *type = get_userquota_prop(prop_name); - if (*type >= ZFS_NUM_USERQUOTA_PROPS) - return (EINVAL); - - *rid = 0; - cp = strchr(prop_name, '@') + 1; - if (strncmp(cp, "S-1-", 4) == 0) { - /* - * It's a numeric SID (eg "S-1-234-567-89") and we want to - * seperate the domain id and the rid - */ - int domain_len = strrchr(cp, '-') - cp; - domain_val = kmem_alloc(domain_len + 1, KM_SLEEP); - (void) strncpy(domain_val, cp, domain_len); - domain_val[domain_len] = '\0'; - cp += domain_len + 1; - - (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); - if (*end != '\0') { - strfree(domain_val); - return (EINVAL); - } - } else { - /* It's only a user/group ID (eg "12345"), just get the rid */ - domain_val = NULL; - (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); - if (*end != '\0') - return (EINVAL); - } - *domain = domain_val; - return (0); -} - -/* - * Look up {user|group}{quota|used} property for given dataset. On success - * push the value (quota or used amount) and the setpoint. On failure, push - * a lua error. 
- */ -static int -zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp, - const char *dataset_name, const char *prop_name) -{ - zfsvfs_t *zfvp; - zfsvfs_t *zfsvfs; - int error; - zfs_userquota_prop_t type; - char *domain; - uint64_t rid, value; - objset_t *os; - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - error = parse_userquota_prop(prop_name, &type, &domain, &rid); - if (error == 0) { - error = dmu_objset_from_ds(ds, &os); - if (error == 0) { - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - error = zfsvfs_create_impl(&zfvp, zfsvfs, os); - if (error == 0) { - error = zfs_userspace_one(zfvp, type, domain, - rid, &value); - zfsvfs_free(zfvp); - } - } - if (domain != NULL) - strfree(domain); - } - dsl_dataset_rele(ds, FTAG); - - if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) || - (type == ZFS_PROP_GROUPQUOTA))) - error = ENOENT; - if (error != 0) { - return (zcp_handle_error(state, dataset_name, - prop_name, error)); - } - - (void) lua_pushnumber(state, value); - (void) lua_pushstring(state, dataset_name); - return (2); -} -#endif - -/* - * Determines the name of the snapshot referenced in the written property - * name. Returns snapshot name in snap_name, a buffer that must be at least - * as large as ZFS_MAX_DATASET_NAME_LEN - */ -static void -parse_written_prop(const char *dataset_name, const char *prop_name, - char *snap_name) -{ - ASSERT(zfs_prop_written(prop_name)); - const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN; - if (strchr(name, '@') == NULL) { - (void) sprintf(snap_name, "%s@%s", dataset_name, name); - } else { - (void) strcpy(snap_name, name); - } -} - -/* - * Look up written@ property for given dataset. On success - * push the value and the setpoint. If error is fatal, we will - * longjmp, otherwise push nothing. 
- */ -static int -zcp_get_written_prop(lua_State *state, dsl_pool_t *dp, - const char *dataset_name, const char *prop_name) -{ - char snap_name[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t used, comp, uncomp; - dsl_dataset_t *old; - int error = 0; - - parse_written_prop(dataset_name, prop_name, snap_name); - dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (new == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - error = dsl_dataset_hold(dp, snap_name, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); - return (zcp_dataset_hold_error(state, dp, snap_name, - error)); - } - error = dsl_dataset_space_written(old, new, - &used, &comp, &uncomp); - - dsl_dataset_rele(old, FTAG); - dsl_dataset_rele(new, FTAG); - - if (error != 0) { - return (zcp_handle_error(state, dataset_name, - snap_name, error)); - } - (void) lua_pushnumber(state, used); - (void) lua_pushstring(state, dataset_name); - return (2); -} - -static int zcp_get_prop(lua_State *state); -static zcp_lib_info_t zcp_get_prop_info = { - .name = "get_prop", - .func = zcp_get_prop, - .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - { .za_name = "property", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_get_prop(lua_State *state) -{ - const char *dataset_name; - const char *property_name; - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - zcp_lib_info_t *libinfo = &zcp_get_prop_info; - - zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); - - dataset_name = lua_tostring(state, 1); - property_name = lua_tostring(state, 2); - - /* User defined property */ - if (zfs_prop_user(property_name)) { - return (zcp_get_user_prop(state, dp, - dataset_name, property_name)); - } - /* userspace property */ - if (zfs_prop_userquota(property_name)) { -#ifdef _KERNEL - return (zcp_get_userquota_prop(state, dp, - dataset_name, property_name)); -#else - return (luaL_error(state, - "user quota properties only supported in kernel mode", - property_name)); -#endif - } - /* written@ property */ - if (zfs_prop_written(property_name)) { - return (zcp_get_written_prop(state, dp, - dataset_name, property_name)); - } - - zfs_prop_t zfs_prop = zfs_name_to_prop(property_name); - /* Valid system property */ - if (zfs_prop != ZPROP_INVAL) { - return (zcp_get_system_prop(state, dp, dataset_name, - zfs_prop)); - } - - /* Invalid property name */ - return (luaL_error(state, - "'%s' is not a valid property", property_name)); -} - -int -zcp_load_get_lib(lua_State *state) -{ - lua_pushcclosure(state, zcp_get_prop_info.func, 0); - lua_setfield(state, -2, zcp_get_prop_info.name); - - return (1); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
- */ - -#include - -#include "lua.h" -#include "lauxlib.h" - -typedef struct zcp_errno_global { - const char *zeg_name; - int zeg_errno; -} zcp_errno_global_t; - -static const zcp_errno_global_t errno_globals[] = { - {"EPERM", EPERM}, - {"ENOENT", ENOENT}, - {"ESRCH", ESRCH}, - {"EINTR", EINTR}, - {"EIO", EIO}, - {"ENXIO", ENXIO}, - {"E2BIG", E2BIG}, - {"ENOEXEC", ENOEXEC}, - {"EBADF", EBADF}, - {"ECHILD", ECHILD}, - {"EAGAIN", EAGAIN}, - {"ENOMEM", ENOMEM}, - {"EACCES", EACCES}, - {"EFAULT", EFAULT}, - {"ENOTBLK", ENOTBLK}, - {"EBUSY", EBUSY}, - {"EEXIST", EEXIST}, - {"EXDEV", EXDEV}, - {"ENODEV", ENODEV}, - {"ENOTDIR", ENOTDIR}, - {"EISDIR", EISDIR}, - {"EINVAL", EINVAL}, - {"ENFILE", ENFILE}, - {"EMFILE", EMFILE}, - {"ENOTTY", ENOTTY}, - {"ETXTBSY", ETXTBSY}, - {"EFBIG", EFBIG}, - {"ENOSPC", ENOSPC}, - {"ESPIPE", ESPIPE}, - {"EROFS", EROFS}, - {"EMLINK", EMLINK}, - {"EPIPE", EPIPE}, - {"EDOM", EDOM}, - {"ERANGE", ERANGE}, - {"EDEADLK", EDEADLK}, - {"ENOLCK", ENOLCK}, - {"ECANCELED", ECANCELED}, - {"ENOTSUP", ENOTSUP}, - {"EDQUOT", EDQUOT}, - {"ENAMETOOLONG", ENAMETOOLONG}, - {NULL, 0} -}; - -static void -zcp_load_errno_globals(lua_State *state) -{ - const zcp_errno_global_t *global = errno_globals; - while (global->zeg_name != NULL) { - lua_pushnumber(state, (lua_Number)global->zeg_errno); - lua_setglobal(state, global->zeg_name); - global++; - } -} - -void -zcp_load_globals(lua_State *state) -{ - zcp_load_errno_globals(state); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c +++ /dev/null @@ -1,531 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016 by Delphix. All rights reserved. 
- */ - -#include "lua.h" -#include "lauxlib.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -typedef int (zcp_list_func_t)(lua_State *); -typedef struct zcp_list_info { - const char *name; - zcp_list_func_t *func; - zcp_list_func_t *gc; - const zcp_arg_t pargs[4]; - const zcp_arg_t kwargs[2]; -} zcp_list_info_t; - -static int -zcp_clones_iter(lua_State *state) -{ - int err; - char clonename[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); - uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - dsl_dataset_t *ds, *clone; - zap_attribute_t za; - zap_cursor_t zc; - - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - if (err == ENOENT) { - return (0); - } else if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dsl_dataset_hold_obj(dsobj)", - err)); - } - - if (dsl_dataset_phys(ds)->ds_next_clones_obj == 0) { - dsl_dataset_rele(ds, FTAG); - return (0); - } - - zap_cursor_init_serialized(&zc, dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj, cursor); - dsl_dataset_rele(ds, FTAG); - - err = zap_cursor_retrieve(&zc, &za); - if (err != 0) { - zap_cursor_fini(&zc); - if (err != ENOENT) { - return (luaL_error(state, - "unexpected error %d from zap_cursor_retrieve()", - err)); - } - return (0); - } - zap_cursor_advance(&zc); - cursor = zap_cursor_serialize(&zc); - zap_cursor_fini(&zc); - - err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &clone); - if (err != 0) { - return (luaL_error(state, - "unexpected error %d from " - "dsl_dataset_hold_obj(za_first_integer)", err)); - } - - dsl_dir_name(clone->ds_dir, clonename); - dsl_dataset_rele(clone, FTAG); - - lua_pushnumber(state, cursor); - lua_replace(state, lua_upvalueindex(2)); - - (void) lua_pushstring(state, clonename); - return (1); -} - -static int zcp_clones_list(lua_State *); -static zcp_list_info_t zcp_clones_list_info = { - .name = "clones", - .func = zcp_clones_list, - .gc = NULL, - .pargs = { - { .za_name = "snapshot", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_clones_list(lua_State *state) -{ - const char *snapname = lua_tostring(state, 1); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - boolean_t issnap; - uint64_t dsobj, cursor; - - /* - * zcp_dataset_hold will either successfully return the requested - * dataset or throw a lua error and longjmp out of the zfs.list.clones - * call without returning. 
- */ - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - cursor = 0; - issnap = ds->ds_is_snapshot; - dsobj = ds->ds_object; - dsl_dataset_rele(ds, FTAG); - - if (!issnap) { - return (zcp_argerror(state, 1, "%s is not a snapshot", - snapname)); - } - - lua_pushnumber(state, dsobj); - lua_pushnumber(state, cursor); - lua_pushcclosure(state, &zcp_clones_iter, 2); - return (1); -} - -static int -zcp_snapshots_iter(lua_State *state) -{ - int err; - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); - uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - dsl_dataset_t *ds; - objset_t *os; - char *p; - - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dsl_dataset_hold_obj(dsobj)", - err)); - } - - dsl_dataset_name(ds, snapname); - VERIFY3U(sizeof (snapname), >, - strlcat(snapname, "@", sizeof (snapname))); - - p = strchr(snapname, '\0'); - VERIFY0(dmu_objset_from_ds(ds, &os)); - err = dmu_snapshot_list_next(os, - sizeof (snapname) - (p - snapname), p, NULL, &cursor, NULL); - dsl_dataset_rele(ds, FTAG); - - if (err == ENOENT) { - return (0); - } else if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dmu_snapshot_list_next()", err)); - } - - lua_pushnumber(state, cursor); - lua_replace(state, lua_upvalueindex(2)); - - (void) lua_pushstring(state, snapname); - return (1); -} - -static int zcp_snapshots_list(lua_State *); -static zcp_list_info_t zcp_snapshots_list_info = { - .name = "snapshots", - .func = zcp_snapshots_list, - .gc = NULL, - .pargs = { - { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_snapshots_list(lua_State *state) -{ - const char *fsname = lua_tostring(state, 1); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - boolean_t issnap; - uint64_t dsobj; - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - issnap = ds->ds_is_snapshot; - dsobj = ds->ds_object; - dsl_dataset_rele(ds, FTAG); - - if (issnap) { - return (zcp_argerror(state, 1, - "argument %s cannot be a snapshot", fsname)); - } - - lua_pushnumber(state, dsobj); - lua_pushnumber(state, 0); - lua_pushcclosure(state, &zcp_snapshots_iter, 2); - return (1); -} - -/* - * Note: channel programs only run in the global zone, so all datasets - * are visible to this zone. 
- */ -static boolean_t -dataset_name_hidden(const char *name) -{ - if (strchr(name, '$') != NULL) - return (B_TRUE); - if (strchr(name, '%') != NULL) - return (B_TRUE); - return (B_FALSE); -} - -static int -zcp_children_iter(lua_State *state) -{ - int err; - char childname[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); - uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); - zcp_run_info_t *ri = zcp_run_info(state); - dsl_pool_t *dp = ri->zri_pool; - dsl_dataset_t *ds; - objset_t *os; - char *p; - - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dsl_dataset_hold_obj(dsobj)", - err)); - } - - dsl_dataset_name(ds, childname); - VERIFY3U(sizeof (childname), >, - strlcat(childname, "/", sizeof (childname))); - p = strchr(childname, '\0'); - - VERIFY0(dmu_objset_from_ds(ds, &os)); - do { - err = dmu_dir_list_next(os, - sizeof (childname) - (p - childname), p, NULL, &cursor); - } while (err == 0 && dataset_name_hidden(childname)); - dsl_dataset_rele(ds, FTAG); - - if (err == ENOENT) { - return (0); - } else if (err != 0) { - return (luaL_error(state, - "unexpected error %d from dmu_dir_list_next()", - err)); - } - - lua_pushnumber(state, cursor); - lua_replace(state, lua_upvalueindex(2)); - - (void) lua_pushstring(state, childname); - return (1); -} - -static int zcp_children_list(lua_State *); -static zcp_list_info_t zcp_children_list_info = { - .name = "children", - .func = zcp_children_list, - .gc = NULL, - .pargs = { - { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_children_list(lua_State *state) -{ - const char *fsname = lua_tostring(state, 1); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - boolean_t issnap; - uint64_t dsobj; - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - issnap = ds->ds_is_snapshot; - dsobj = ds->ds_object; - dsl_dataset_rele(ds, FTAG); - - if (issnap) { - return (zcp_argerror(state, 1, - "argument %s cannot be a snapshot", fsname)); - } - - lua_pushnumber(state, dsobj); - lua_pushnumber(state, 0); - lua_pushcclosure(state, &zcp_children_iter, 2); - return (1); -} - -static int -zcp_props_list_gc(lua_State *state) -{ - nvlist_t **props = lua_touserdata(state, 1); - if (*props != NULL) - fnvlist_free(*props); - return (0); -} - -static int -zcp_props_iter(lua_State *state) -{ - char *source, *val; - nvlist_t *nvprop; - nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1)); - nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2)); - - do { - pair = nvlist_next_nvpair(*props, pair); - if (pair == NULL) { - fnvlist_free(*props); - *props = NULL; - return (0); - } - } while (!zfs_prop_user(nvpair_name(pair))); - - lua_pushlightuserdata(state, pair); - lua_replace(state, lua_upvalueindex(2)); - - nvprop = fnvpair_value_nvlist(pair); - val = fnvlist_lookup_string(nvprop, ZPROP_VALUE); - source = fnvlist_lookup_string(nvprop, ZPROP_SOURCE); - - (void) lua_pushstring(state, nvpair_name(pair)); - (void) lua_pushstring(state, val); - (void) lua_pushstring(state, source); - return (3); -} - -static int zcp_props_list(lua_State *); -static zcp_list_info_t zcp_props_list_info = { - .name = "properties", - .func = zcp_props_list, - .gc = zcp_props_list_gc, - .pargs = { - { .za_name = "filesystem | snapshot | volume", - .za_lua_type = LUA_TSTRING}, - {NULL, 0} 
- }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_props_list(lua_State *state) -{ - const char *dsname = lua_tostring(state, 1); - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - objset_t *os; - nvlist_t **props = lua_newuserdata(state, sizeof (nvlist_t *)); - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - VERIFY0(dmu_objset_from_ds(ds, &os)); - VERIFY0(dsl_prop_get_all(os, props)); - dsl_dataset_rele(ds, FTAG); - - /* - * Set the metatable for the properties list to free it on completion. - */ - luaL_getmetatable(state, zcp_props_list_info.name); - (void) lua_setmetatable(state, -2); - - lua_pushlightuserdata(state, NULL); - lua_pushcclosure(state, &zcp_props_iter, 2); - return (1); -} - - -/* - * Populate nv with all valid properties and their values for the given - * dataset. - */ -static void -zcp_dataset_props(dsl_dataset_t *ds, nvlist_t *nv) -{ - for (int prop = ZFS_PROP_TYPE; prop < ZFS_NUM_PROPS; prop++) { - /* Do not display hidden props */ - if (!zfs_prop_visible(prop)) - continue; - /* Do not display props not valid for this dataset */ - if (!prop_valid_for_ds(ds, prop)) - continue; - fnvlist_add_boolean(nv, zfs_prop_to_name(prop)); - } -} - -static int zcp_system_props_list(lua_State *); -static zcp_list_info_t zcp_system_props_list_info = { - .name = "system_properties", - .func = zcp_system_props_list, - .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -/* - * Get a list of all visble properties and their values for a given dataset. - * Returned on the stack as a Lua table. - */ -static int -zcp_system_props_list(lua_State *state) -{ - int error; - char errbuf[128]; - const char *dataset_name; - dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - zcp_list_info_t *libinfo = &zcp_system_props_list_info; - zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); - dataset_name = lua_tostring(state, 1); - nvlist_t *nv = fnvlist_alloc(); - - dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); - if (ds == NULL) - return (1); /* not reached; zcp_dataset_hold() longjmp'd */ - - /* Get the names of all valid properties for this dataset */ - zcp_dataset_props(ds, nv); - dsl_dataset_rele(ds, FTAG); - - /* push list as lua table */ - error = zcp_nvlist_to_lua(state, nv, errbuf, sizeof (errbuf)); - nvlist_free(nv); - if (error != 0) { - return (luaL_error(state, - "Error returning nvlist: %s", errbuf)); - } - return (1); -} - -static int -zcp_list_func(lua_State *state) -{ - zcp_list_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); - - zcp_parse_args(state, info->name, info->pargs, info->kwargs); - - return (info->func(state)); -} - -int -zcp_load_list_lib(lua_State *state) -{ - int i; - zcp_list_info_t *zcp_list_funcs[] = { - &zcp_children_list_info, - &zcp_snapshots_list_info, - &zcp_props_list_info, - &zcp_clones_list_info, - &zcp_system_props_list_info, - NULL - }; - - lua_newtable(state); - - for (i = 0; zcp_list_funcs[i] != NULL; i++) { - zcp_list_info_t *info = zcp_list_funcs[i]; - - if (info->gc != NULL) { - /* - * If the function requires garbage collection, create - * a metatable with its name and register the __gc - * function. 
- */ - (void) luaL_newmetatable(state, info->name); - (void) lua_pushstring(state, "__gc"); - lua_pushcfunction(state, info->gc); - lua_settable(state, -3); - lua_pop(state, 1); - } - - lua_pushlightuserdata(state, info); - lua_pushcclosure(state, &zcp_list_func, 1); - lua_setfield(state, -2, info->name); - info++; - } - - return (1); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c +++ /dev/null @@ -1,360 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2016, 2017 by Delphix. All rights reserved. - */ - -#include "lua.h" -#include "lauxlib.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DST_AVG_BLKSHIFT 14 - -typedef int (zcp_synctask_func_t)(lua_State *, boolean_t, nvlist_t *); -typedef struct zcp_synctask_info { - const char *name; - zcp_synctask_func_t *func; - const zcp_arg_t pargs[4]; - const zcp_arg_t kwargs[2]; - zfs_space_check_t space_check; - int blocks_modified; -} zcp_synctask_info_t; - -/* - * Generic synctask interface for channel program syncfuncs. - * - * To perform some action in syncing context, we'd generally call - * dsl_sync_task(), but since the Lua script is already running inside a - * synctask we need to leave out some actions (such as acquiring the config - * rwlock and performing space checks). - * - * If 'sync' is false, executes a dry run and returns the error code. - * - * If we are not running in syncing context and we are not doing a dry run - * (meaning we are running a zfs.sync function in open-context) then we - * return a Lua error. - * - * This function also handles common fatal error cases for channel program - * library functions. If a fatal error occurs, err_dsname will be the dataset - * name reported in error messages, if supplied. - */ -static int -zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc, - dsl_syncfunc_t *syncfunc, void *arg, boolean_t sync, const char *err_dsname) -{ - int err; - zcp_run_info_t *ri = zcp_run_info(state); - - err = checkfunc(arg, ri->zri_tx); - if (!sync) - return (err); - - if (!ri->zri_sync) { - return (luaL_error(state, "running functions from the zfs.sync " - "submodule requires passing sync=TRUE to " - "lzc_channel_program() (i.e. 
do not specify the \"-n\" " - "command line argument)")); - } - - if (err == 0) { - syncfunc(arg, ri->zri_tx); - } else if (err == EIO) { - if (err_dsname != NULL) { - return (luaL_error(state, - "I/O error while accessing dataset '%s'", - err_dsname)); - } else { - return (luaL_error(state, - "I/O error while accessing dataset.")); - } - } - - return (err); -} - - -static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_destroy_info = { - .name = "destroy", - .func = zcp_synctask_destroy, - .pargs = { - {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN}, - {NULL, 0} - }, - .space_check = ZFS_SPACE_CHECK_DESTROY, - .blocks_modified = 0 -}; - -/* ARGSUSED */ -static int -zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details) -{ - int err; - const char *dsname = lua_tostring(state, 1); - - boolean_t issnap = (strchr(dsname, '@') != NULL); - - if (!issnap && !lua_isnil(state, 2)) { - return (luaL_error(state, - "'deferred' kwarg only supported for snapshots: %s", - dsname)); - } - - if (issnap) { - dsl_destroy_snapshot_arg_t ddsa = { 0 }; - ddsa.ddsa_name = dsname; - if (!lua_isnil(state, 2)) { - ddsa.ddsa_defer = lua_toboolean(state, 2); - } else { - ddsa.ddsa_defer = B_FALSE; - } - - err = zcp_sync_task(state, dsl_destroy_snapshot_check, - dsl_destroy_snapshot_sync, &ddsa, sync, dsname); - } else { - dsl_destroy_head_arg_t ddha = { 0 }; - ddha.ddha_name = dsname; - - err = zcp_sync_task(state, dsl_destroy_head_check, - dsl_destroy_head_sync, &ddha, sync, dsname); - } - - return (err); -} - -static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_promote_info = { - .name = "promote", - .func = zcp_synctask_promote, - .pargs = { - {.za_name = "clone", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - }, - .space_check = ZFS_SPACE_CHECK_RESERVED, - .blocks_modified = 3 -}; - -static int -zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details) -{ - int err; - dsl_dataset_promote_arg_t ddpa = { 0 }; - const char *dsname = lua_tostring(state, 1); - zcp_run_info_t *ri = zcp_run_info(state); - - ddpa.ddpa_clonename = dsname; - ddpa.err_ds = err_details; - ddpa.cr = ri->zri_cred; - - /* - * If there was a snapshot name conflict, then err_ds will be filled - * with a list of conflicting snapshot names. 
- */ - err = zcp_sync_task(state, dsl_dataset_promote_check, - dsl_dataset_promote_sync, &ddpa, sync, dsname); - - return (err); -} - -static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details); -static zcp_synctask_info_t zcp_synctask_rollback_info = { - .name = "rollback", - .func = zcp_synctask_rollback, - .space_check = ZFS_SPACE_CHECK_RESERVED, - .blocks_modified = 1, - .pargs = { - {.za_name = "filesystem", .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - } -}; - -static int -zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details) -{ - int err; - const char *dsname = lua_tostring(state, 1); - dsl_dataset_rollback_arg_t ddra = { 0 }; - - ddra.ddra_fsname = dsname; - ddra.ddra_result = err_details; - - err = zcp_sync_task(state, dsl_dataset_rollback_check, - dsl_dataset_rollback_sync, &ddra, sync, dsname); - - return (err); -} - -static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_snapshot_info = { - .name = "snapshot", - .func = zcp_synctask_snapshot, - .pargs = { - {.za_name = "filesystem@snapname | volume@snapname", - .za_lua_type = LUA_TSTRING}, - {NULL, 0} - }, - .kwargs = { - {NULL, 0} - }, - .space_check = ZFS_SPACE_CHECK_NORMAL, - .blocks_modified = 3 -}; - -/* ARGSUSED */ -static int -zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) -{ - int err; - dsl_dataset_snapshot_arg_t ddsa = { 0 }; - const char *dsname = lua_tostring(state, 1); - zcp_run_info_t *ri = zcp_run_info(state); - - /* - * On old pools, the ZIL must not be active when a snapshot is created, - * but we can't suspend the ZIL because we're already in syncing - * context. - */ - if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) { - return (ENOTSUP); - } - - /* - * We only allow for a single snapshot rather than a list, so the - * error list output is unnecessary. - */ - ddsa.ddsa_errors = NULL; - ddsa.ddsa_props = NULL; - ddsa.ddsa_cr = ri->zri_cred; - ddsa.ddsa_snaps = fnvlist_alloc(); - fnvlist_add_boolean(ddsa.ddsa_snaps, dsname); - - zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps); - - err = zcp_sync_task(state, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, &ddsa, sync, dsname); - - zcp_deregister_cleanup(state, zch); - fnvlist_free(ddsa.ddsa_snaps); - - return (err); -} - -static int -zcp_synctask_wrapper(lua_State *state) -{ - int err; - zcp_cleanup_handler_t *zch; - int num_ret = 1; - nvlist_t *err_details = fnvlist_alloc(); - - /* - * Make sure err_details is properly freed, even if a fatal error is - * thrown during the synctask. - */ - zch = zcp_register_cleanup(state, - (zcp_cleanup_t *)&fnvlist_free, err_details); - - zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); - boolean_t sync = lua_toboolean(state, lua_upvalueindex(2)); - - zcp_run_info_t *ri = zcp_run_info(state); - dsl_pool_t *dp = ri->zri_pool; - - /* MOS space is triple-dittoed, so we multiply by 3. 
*/ - uint64_t funcspace = (info->blocks_modified << DST_AVG_BLKSHIFT) * 3; - - zcp_parse_args(state, info->name, info->pargs, info->kwargs); - - err = 0; - if (info->space_check != ZFS_SPACE_CHECK_NONE) { - uint64_t quota = dsl_pool_unreserved_space(dp, - info->space_check); - uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes + - ri->zri_space_used; - - if (used + funcspace > quota) { - err = SET_ERROR(ENOSPC); - } - } - - if (err == 0) { - err = info->func(state, sync, err_details); - } - - if (err == 0) { - ri->zri_space_used += funcspace; - } - - lua_pushnumber(state, (lua_Number)err); - if (fnvlist_num_pairs(err_details) > 0) { - (void) zcp_nvlist_to_lua(state, err_details, NULL, 0); - num_ret++; - } - - zcp_deregister_cleanup(state, zch); - fnvlist_free(err_details); - - return (num_ret); -} - -int -zcp_load_synctask_lib(lua_State *state, boolean_t sync) -{ - int i; - zcp_synctask_info_t *zcp_synctask_funcs[] = { - &zcp_synctask_destroy_info, - &zcp_synctask_promote_info, - &zcp_synctask_rollback_info, - &zcp_synctask_snapshot_info, - NULL - }; - - lua_newtable(state); - - for (i = 0; zcp_synctask_funcs[i] != NULL; i++) { - zcp_synctask_info_t *info = zcp_synctask_funcs[i]; - lua_pushlightuserdata(state, info); - lua_pushboolean(state, sync); - lua_pushcclosure(state, &zcp_synctask_wrapper, 2); - lua_setfield(state, -2, info->name); - info++; - } - - return (1); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c +++ /dev/null @@ -1,505 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include "zfeature_common.h" -#include - -/* - * ZFS Feature Flags - * ----------------- - * - * ZFS feature flags are used to provide fine-grained versioning to the ZFS - * on-disk format. Once enabled on a pool feature flags replace the old - * spa_version() number. - * - * Each new on-disk format change will be given a uniquely identifying string - * guid rather than a version number. This avoids the problem of different - * organizations creating new on-disk formats with the same version number. To - * keep feature guids unique they should consist of the reverse dns name of the - * organization which implemented the feature and a short name for the feature, - * separated by a colon (e.g. com.delphix:async_destroy). - * - * Reference Counts - * ---------------- - * - * Within each pool features can be in one of three states: disabled, enabled, - * or active. 
These states are differentiated by a reference count stored on - * disk for each feature: - * - * 1) If there is no reference count stored on disk the feature is disabled. - * 2) If the reference count is 0 a system administrator has enabled the - * feature, but the feature has not been used yet, so no on-disk - * format changes have been made. - * 3) If the reference count is greater than 0 the feature is active. - * The format changes required by the feature are currently on disk. - * Note that if the feature's format changes are reversed the feature - * may choose to set its reference count back to 0. - * - * Feature flags makes no differentiation between non-zero reference counts - * for an active feature (e.g. a reference count of 1 means the same thing as a - * reference count of 27834721), but feature implementations may choose to use - * the reference count to store meaningful information. For example, a new RAID - * implementation might set the reference count to the number of vdevs using - * it. If all those disks are removed from the pool the feature goes back to - * having a reference count of 0. - * - * It is the responsibility of the individual features to maintain a non-zero - * reference count as long as the feature's format changes are present on disk. - * - * Dependencies - * ------------ - * - * Each feature may depend on other features. The only effect of this - * relationship is that when a feature is enabled all of its dependencies are - * automatically enabled as well. Any future work to support disabling of - * features would need to ensure that features cannot be disabled if other - * enabled features depend on them. - * - * On-disk Format - * -------------- - * - * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES - * (5000). In order for this to work the pool is automatically upgraded to - * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk - * format changes will be in use. - * - * Information about features is stored in 3 ZAP objects in the pool's MOS. - * These objects are linked to by the following names in the pool directory - * object: - * - * 1) features_for_read: feature guid -> reference count - * Features needed to open the pool for reading. - * 2) features_for_write: feature guid -> reference count - * Features needed to open the pool for writing. - * 3) feature_descriptions: feature guid -> descriptive string - * A human readable string. - * - * All enabled features appear in either features_for_read or - * features_for_write, but not both. - * - * To open a pool in read-only mode only the features listed in - * features_for_read need to be supported. - * - * To open the pool in read-write mode features in both features_for_read and - * features_for_write need to be supported. - * - * Some features may be required to read the ZAP objects containing feature - * information. To allow software to check for compatibility with these features - * before the pool is opened their names must be stored in the label in a - * new "features_for_read" entry (note that features that are only required - * to write to a pool never need to be stored in the label since the - * features_for_write ZAP object can be read before the pool is written to). - * To save space in the label features must be explicitly marked as needing to - * be written to the label. Also, reference counts are not stored in the label, - * instead any feature whose reference count drops to 0 is removed from the - * label. 
- * - * Adding New Features - * ------------------- - * - * Features must be registered in zpool_feature_init() function in - * zfeature_common.c using the zfeature_register() function. This function - * has arguments to specify if the feature should be stored in the - * features_for_read or features_for_write ZAP object and if it needs to be - * written to the label when active. - * - * Once a feature is registered it will appear as a "feature@" - * property which can be set by an administrator. Feature implementors should - * use the spa_feature_is_enabled() and spa_feature_is_active() functions to - * query the state of a feature and the spa_feature_incr() and - * spa_feature_decr() functions to change an enabled feature's reference count. - * Reference counts may only be updated in the syncing context. - * - * Features may not perform enable-time initialization. Instead, any such - * initialization should occur when the feature is first used. This design - * enforces that on-disk changes be made only when features are used. Code - * should only check if a feature is enabled using spa_feature_is_enabled(), - * not by relying on any feature specific metadata existing. If a feature is - * enabled, but the feature's metadata is not on disk yet then it should be - * created as needed. - * - * As an example, consider the com.delphix:async_destroy feature. This feature - * relies on the existence of a bptree in the MOS that store blocks for - * asynchronous freeing. This bptree is not created when async_destroy is - * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is - * called to check if async_destroy is enabled. If it is and the bptree object - * does not exist yet, the bptree object is created as part of the dataset - * destroy and async_destroy's reference count is incremented to indicate it - * has made an on-disk format change. Later, after the destroyed dataset's - * blocks have all been asynchronously freed there is no longer any use for the - * bptree object, so it is destroyed and async_destroy's reference count is - * decremented back to 0 to indicate that it has undone its on-disk format - * changes. - */ - -typedef enum { - FEATURE_ACTION_INCR, - FEATURE_ACTION_DECR, -} feature_action_t; - -/* - * Checks that the active features in the pool are supported by - * this software. Adds each unsupported feature (name -> description) to - * the supplied nvlist. - */ -boolean_t -spa_features_check(spa_t *spa, boolean_t for_write, - nvlist_t *unsup_feat, nvlist_t *enabled_feat) -{ - objset_t *os = spa->spa_meta_objset; - boolean_t supported; - zap_cursor_t zc; - zap_attribute_t za; - uint64_t obj = for_write ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - supported = B_TRUE; - for (zap_cursor_init(&zc, os, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - ASSERT(za.za_integer_length == sizeof (uint64_t) && - za.za_num_integers == 1); - - if (NULL != enabled_feat) { - fnvlist_add_uint64(enabled_feat, za.za_name, - za.za_first_integer); - } - - if (za.za_first_integer != 0 && - !zfeature_is_supported(za.za_name)) { - supported = B_FALSE; - - if (NULL != unsup_feat) { - char *desc = ""; - char buf[MAXPATHLEN]; - - if (zap_lookup(os, spa->spa_feat_desc_obj, - za.za_name, 1, sizeof (buf), buf) == 0) - desc = buf; - - VERIFY(nvlist_add_string(unsup_feat, za.za_name, - desc) == 0); - } - } - } - zap_cursor_fini(&zc); - - return (supported); -} - -/* - * Use an in-memory cache of feature refcounts for quick retrieval. 
- * - * Note: well-designed features will not need to use this; they should - * use spa_feature_is_enabled() and spa_feature_is_active() instead. - * However, this is non-static for zdb, zhack, and spa_add_feature_stats(). - */ -int -feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) -{ - ASSERT(VALID_FEATURE_FID(feature->fi_feature)); - if (spa->spa_feat_refcount_cache[feature->fi_feature] == - SPA_FEATURE_DISABLED) { - return (SET_ERROR(ENOTSUP)); - } - *res = spa->spa_feat_refcount_cache[feature->fi_feature]; - return (0); -} - -/* - * Note: well-designed features will not need to use this; they should - * use spa_feature_is_enabled() and spa_feature_is_active() instead. - * However, this is non-static for zdb and zhack. - */ -int -feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, - uint64_t *res) -{ - int err; - uint64_t refcount; - uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - /* - * If the pool is currently being created, the feature objects may not - * have been allocated yet. Act as though all features are disabled. - */ - if (zapobj == 0) - return (SET_ERROR(ENOTSUP)); - - err = zap_lookup(spa->spa_meta_objset, zapobj, - feature->fi_guid, sizeof (uint64_t), 1, &refcount); - if (err != 0) { - if (err == ENOENT) - return (SET_ERROR(ENOTSUP)); - else - return (err); - } - *res = refcount; - return (0); -} - - -static int -feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) -{ - uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj; - - ASSERT(zfeature_depends_on(feature->fi_feature, - SPA_FEATURE_ENABLED_TXG)); - - if (!spa_feature_is_enabled(spa, feature->fi_feature)) { - return (SET_ERROR(ENOTSUP)); - } - - ASSERT(enabled_txg_obj != 0); - - VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, - feature->fi_guid, sizeof (uint64_t), 1, res)); - - return (0); -} - -/* - * This function is non-static for zhack; it should otherwise not be used - * outside this file. - */ -void -feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, - dmu_tx_t *tx) -{ - ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); - uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, - sizeof (uint64_t), 1, &refcount, tx)); - - /* - * feature_sync is called directly from zhack, allowing the - * creation of arbitrary features whose fi_feature field may - * be greater than SPA_FEATURES. When called from zhack, the - * zfeature_info_t object's fi_feature field will be set to - * SPA_FEATURE_NONE. - */ - if (feature->fi_feature != SPA_FEATURE_NONE) { - uint64_t *refcount_cache = - &spa->spa_feat_refcount_cache[feature->fi_feature]; - VERIFY3U(*refcount_cache, ==, - atomic_swap_64(refcount_cache, refcount)); - } - - if (refcount == 0) - spa_deactivate_mos_feature(spa, feature->fi_guid); - else if (feature->fi_flags & ZFEATURE_FLAG_MOS) - spa_activate_mos_feature(spa, feature->fi_guid, tx); -} - -/* - * This function is non-static for zhack; it should otherwise not be used - * outside this file. - */ -void -feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) -{ - uint64_t initial_refcount = - (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0; - uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
- spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - ASSERT(0 != zapobj); - ASSERT(zfeature_is_valid_guid(feature->fi_guid)); - ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); - - /* - * If the feature is already enabled, ignore the request. - */ - if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0) - return; - - for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) - spa_feature_enable(spa, feature->fi_depends[i], tx); - - VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj, - feature->fi_guid, 1, strlen(feature->fi_desc) + 1, - feature->fi_desc, tx)); - - feature_sync(spa, feature, initial_refcount, tx); - - if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { - uint64_t enabling_txg = dmu_tx_get_txg(tx); - - if (spa->spa_feat_enabled_txg_obj == 0ULL) { - spa->spa_feat_enabled_txg_obj = - zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FEATURE_ENABLED_TXG, tx); - } - spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx); - - VERIFY0(zap_add(spa->spa_meta_objset, - spa->spa_feat_enabled_txg_obj, feature->fi_guid, - sizeof (uint64_t), 1, &enabling_txg, tx)); - } -} - -static void -feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, - dmu_tx_t *tx) -{ - uint64_t refcount; - zfeature_info_t *feature = &spa_feature_table[fid]; - uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; - - ASSERT(VALID_FEATURE_FID(fid)); - ASSERT(0 != zapobj); - ASSERT(zfeature_is_valid_guid(feature->fi_guid)); - - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); - - VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); - - switch (action) { - case FEATURE_ACTION_INCR: - VERIFY3U(refcount, !=, UINT64_MAX); - refcount++; - break; - case FEATURE_ACTION_DECR: - VERIFY3U(refcount, !=, 0); - refcount--; - break; - default: - ASSERT(0); - break; - } - - feature_sync(spa, feature, refcount, tx); -} - -void -spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) -{ - /* - * We create feature flags ZAP objects in two instances: during pool - * creation and during pool upgrade. - */ - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && - tx->tx_txg == TXG_INITIAL)); - - spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FEATURES_FOR_READ, tx); - spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FEATURES_FOR_WRITE, tx); - spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, - DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FEATURE_DESCRIPTIONS, tx); -} - -/* - * Enable any required dependencies, then enable the requested feature. 
- */ -void -spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) -{ - ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); - ASSERT(VALID_FEATURE_FID(fid)); - feature_enable_sync(spa, &spa_feature_table[fid], tx); -} - -void -spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) -{ - feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx); -} - -void -spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) -{ - feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx); -} - -boolean_t -spa_feature_is_enabled(spa_t *spa, spa_feature_t fid) -{ - int err; - uint64_t refcount; - - ASSERT(VALID_FEATURE_FID(fid)); - if (spa_version(spa) < SPA_VERSION_FEATURES) - return (B_FALSE); - - err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); - ASSERT(err == 0 || err == ENOTSUP); - return (err == 0); -} - -boolean_t -spa_feature_is_active(spa_t *spa, spa_feature_t fid) -{ - int err; - uint64_t refcount; - - ASSERT(VALID_FEATURE_FID(fid)); - if (spa_version(spa) < SPA_VERSION_FEATURES) - return (B_FALSE); - - err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); - ASSERT(err == 0 || err == ENOTSUP); - return (err == 0 && refcount > 0); -} - -/* - * For the feature specified by fid (which must depend on - * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the - * OUT txg argument. - * - * Returns B_TRUE if the feature is enabled, in which case txg will be filled - * with the transaction group in which the specified feature was enabled. - * Returns B_FALSE otherwise (i.e. if the feature is not enabled). - */ -boolean_t -spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) -{ - int err; - - ASSERT(VALID_FEATURE_FID(fid)); - if (spa_version(spa) < SPA_VERSION_FEATURES) - return (B_FALSE); - - err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg); - ASSERT(err == 0 || err == ENOTSUP); - - return (err == 0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf +++ /dev/null @@ -1,28 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# -# -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# -# ident "%Z%%M% %I% %E% SMI" -# -name="zfs" parent="pseudo"; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ /dev/null @@ -1,2778 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE -#define DENY ACE_ACCESS_DENIED_ACE_TYPE -#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE -#define MIN_ACE_TYPE ALLOW - -#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) -#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ - ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) -#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) - -#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ - ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ - ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ - ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) - -#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ - ACE_DELETE|ACE_DELETE_CHILD) -#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) - -#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) - -#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) - -#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) - -#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ - ZFS_ACL_PROTECTED) - -#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ - ZFS_ACL_OBJ_ACE) - -#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) - -static uint16_t -zfs_ace_v0_get_type(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_type); -} - -static uint16_t -zfs_ace_v0_get_flags(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_flags); -} - -static uint32_t -zfs_ace_v0_get_mask(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_access_mask); -} - -static uint64_t -zfs_ace_v0_get_who(void *acep) -{ - return (((zfs_oldace_t *)acep)->z_fuid); -} - -static void -zfs_ace_v0_set_type(void *acep, uint16_t type) -{ - ((zfs_oldace_t *)acep)->z_type = type; -} - -static void -zfs_ace_v0_set_flags(void *acep, 
uint16_t flags) -{ - ((zfs_oldace_t *)acep)->z_flags = flags; -} - -static void -zfs_ace_v0_set_mask(void *acep, uint32_t mask) -{ - ((zfs_oldace_t *)acep)->z_access_mask = mask; -} - -static void -zfs_ace_v0_set_who(void *acep, uint64_t who) -{ - ((zfs_oldace_t *)acep)->z_fuid = who; -} - -/*ARGSUSED*/ -static size_t -zfs_ace_v0_size(void *acep) -{ - return (sizeof (zfs_oldace_t)); -} - -static size_t -zfs_ace_v0_abstract_size(void) -{ - return (sizeof (zfs_oldace_t)); -} - -static int -zfs_ace_v0_mask_off(void) -{ - return (offsetof(zfs_oldace_t, z_access_mask)); -} - -/*ARGSUSED*/ -static int -zfs_ace_v0_data(void *acep, void **datap) -{ - *datap = NULL; - return (0); -} - -static acl_ops_t zfs_acl_v0_ops = { - zfs_ace_v0_get_mask, - zfs_ace_v0_set_mask, - zfs_ace_v0_get_flags, - zfs_ace_v0_set_flags, - zfs_ace_v0_get_type, - zfs_ace_v0_set_type, - zfs_ace_v0_get_who, - zfs_ace_v0_set_who, - zfs_ace_v0_size, - zfs_ace_v0_abstract_size, - zfs_ace_v0_mask_off, - zfs_ace_v0_data -}; - -static uint16_t -zfs_ace_fuid_get_type(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_type); -} - -static uint16_t -zfs_ace_fuid_get_flags(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_flags); -} - -static uint32_t -zfs_ace_fuid_get_mask(void *acep) -{ - return (((zfs_ace_hdr_t *)acep)->z_access_mask); -} - -static uint64_t -zfs_ace_fuid_get_who(void *args) -{ - uint16_t entry_type; - zfs_ace_t *acep = args; - - entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; - - if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE) - return (-1); - return (((zfs_ace_t *)acep)->z_fuid); -} - -static void -zfs_ace_fuid_set_type(void *acep, uint16_t type) -{ - ((zfs_ace_hdr_t *)acep)->z_type = type; -} - -static void -zfs_ace_fuid_set_flags(void *acep, uint16_t flags) -{ - ((zfs_ace_hdr_t *)acep)->z_flags = flags; -} - -static void -zfs_ace_fuid_set_mask(void *acep, uint32_t mask) -{ - ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; -} - -static void -zfs_ace_fuid_set_who(void *arg, uint64_t who) -{ - zfs_ace_t *acep = arg; - - uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; - - if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE) - return; - acep->z_fuid = who; -} - -static size_t -zfs_ace_fuid_size(void *acep) -{ - zfs_ace_hdr_t *zacep = acep; - uint16_t entry_type; - - switch (zacep->z_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - return (sizeof (zfs_object_ace_t)); - case ALLOW: - case DENY: - entry_type = - (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); - if (entry_type == ACE_OWNER || - entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE) - return (sizeof (zfs_ace_hdr_t)); - /*FALLTHROUGH*/ - default: - return (sizeof (zfs_ace_t)); - } -} - -static size_t -zfs_ace_fuid_abstract_size(void) -{ - return (sizeof (zfs_ace_hdr_t)); -} - -static int -zfs_ace_fuid_mask_off(void) -{ - return (offsetof(zfs_ace_hdr_t, z_access_mask)); -} - -static int -zfs_ace_fuid_data(void *acep, void **datap) -{ - zfs_ace_t *zacep = acep; - zfs_object_ace_t *zobjp; - - switch (zacep->z_hdr.z_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - zobjp = acep; - *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); - return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); - default: - *datap = 
NULL; - return (0); - } -} - -static acl_ops_t zfs_acl_fuid_ops = { - zfs_ace_fuid_get_mask, - zfs_ace_fuid_set_mask, - zfs_ace_fuid_get_flags, - zfs_ace_fuid_set_flags, - zfs_ace_fuid_get_type, - zfs_ace_fuid_set_type, - zfs_ace_fuid_get_who, - zfs_ace_fuid_set_who, - zfs_ace_fuid_size, - zfs_ace_fuid_abstract_size, - zfs_ace_fuid_mask_off, - zfs_ace_fuid_data -}; - -/* - * The following three functions are provided for compatibility with - * older ZPL version in order to determine if the file use to have - * an external ACL and what version of ACL previously existed on the - * file. Would really be nice to not need this, sigh. - */ -uint64_t -zfs_external_acl(znode_t *zp) -{ - zfs_acl_phys_t acl_phys; - int error; - - if (zp->z_is_sa) - return (0); - - /* - * Need to deal with a potential - * race where zfs_sa_upgrade could cause - * z_isa_sa to change. - * - * If the lookup fails then the state of z_is_sa should have - * changed. - */ - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), - &acl_phys, sizeof (acl_phys))) == 0) - return (acl_phys.z_acl_extern_obj); - else { - /* - * after upgrade the SA_ZPL_ZNODE_ACL should have been - * removed - */ - VERIFY(zp->z_is_sa && error == ENOENT); - return (0); - } -} - -/* - * Determine size of ACL in bytes - * - * This is more complicated than it should be since we have to deal - * with old external ACLs. - */ -static int -zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, - zfs_acl_phys_t *aclphys) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t acl_count; - int size; - int error; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - if (zp->z_is_sa) { - if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), - &size)) != 0) - return (error); - *aclsize = size; - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), - &acl_count, sizeof (acl_count))) != 0) - return (error); - *aclcount = acl_count; - } else { - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), - aclphys, sizeof (*aclphys))) != 0) - return (error); - - if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { - *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); - *aclcount = aclphys->z_acl_size; - } else { - *aclsize = aclphys->z_acl_size; - *aclcount = aclphys->z_acl_count; - } - } - return (0); -} - -int -zfs_znode_acl_version(znode_t *zp) -{ - zfs_acl_phys_t acl_phys; - - if (zp->z_is_sa) - return (ZFS_ACL_VERSION_FUID); - else { - int error; - - /* - * Need to deal with a potential - * race where zfs_sa_upgrade could cause - * z_isa_sa to change. - * - * If the lookup fails then the state of z_is_sa should have - * changed. - */ - if ((error = sa_lookup(zp->z_sa_hdl, - SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), - &acl_phys, sizeof (acl_phys))) == 0) - return (acl_phys.z_acl_version); - else { - /* - * After upgrade SA_ZPL_ZNODE_ACL should have - * been removed. 
- */ - VERIFY(zp->z_is_sa && error == ENOENT); - return (ZFS_ACL_VERSION_FUID); - } - } -} - -static int -zfs_acl_version(int version) -{ - if (version < ZPL_VERSION_FUID) - return (ZFS_ACL_VERSION_INITIAL); - else - return (ZFS_ACL_VERSION_FUID); -} - -static int -zfs_acl_version_zp(znode_t *zp) -{ - return (zfs_acl_version(zp->z_zfsvfs->z_version)); -} - -zfs_acl_t * -zfs_acl_alloc(int vers) -{ - zfs_acl_t *aclp; - - aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); - list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), - offsetof(zfs_acl_node_t, z_next)); - aclp->z_version = vers; - if (vers == ZFS_ACL_VERSION_FUID) - aclp->z_ops = zfs_acl_fuid_ops; - else - aclp->z_ops = zfs_acl_v0_ops; - return (aclp); -} - -zfs_acl_node_t * -zfs_acl_node_alloc(size_t bytes) -{ - zfs_acl_node_t *aclnode; - - aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); - if (bytes) { - aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); - aclnode->z_allocdata = aclnode->z_acldata; - aclnode->z_allocsize = bytes; - aclnode->z_size = bytes; - } - - return (aclnode); -} - -static void -zfs_acl_node_free(zfs_acl_node_t *aclnode) -{ - if (aclnode->z_allocsize) - kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); - kmem_free(aclnode, sizeof (zfs_acl_node_t)); -} - -static void -zfs_acl_release_nodes(zfs_acl_t *aclp) -{ - zfs_acl_node_t *aclnode; - - while (aclnode = list_head(&aclp->z_acl)) { - list_remove(&aclp->z_acl, aclnode); - zfs_acl_node_free(aclnode); - } - aclp->z_acl_count = 0; - aclp->z_acl_bytes = 0; -} - -void -zfs_acl_free(zfs_acl_t *aclp) -{ - zfs_acl_release_nodes(aclp); - list_destroy(&aclp->z_acl); - kmem_free(aclp, sizeof (zfs_acl_t)); -} - -static boolean_t -zfs_acl_valid_ace_type(uint_t type, uint_t flags) -{ - uint16_t entry_type; - - switch (type) { - case ALLOW: - case DENY: - case ACE_SYSTEM_AUDIT_ACE_TYPE: - case ACE_SYSTEM_ALARM_ACE_TYPE: - entry_type = flags & ACE_TYPE_FLAGS; - return (entry_type == ACE_OWNER || - entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE || entry_type == 0 || - entry_type == ACE_IDENTIFIER_GROUP); - default: - if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) - return (B_TRUE); - } - return (B_FALSE); -} - -static boolean_t -zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) -{ - /* - * first check type of entry - */ - - if (!zfs_acl_valid_ace_type(type, iflags)) - return (B_FALSE); - - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - if (aclp->z_version < ZFS_ACL_VERSION_FUID) - return (B_FALSE); - aclp->z_hints |= ZFS_ACL_OBJ_ACE; - } - - /* - * next check inheritance level flags - */ - - if (obj_type == VDIR && - (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { - if ((iflags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) == 0) { - return (B_FALSE); - } - } - - return (B_TRUE); -} - -static void * -zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, - uint32_t *access_mask, uint16_t *iflags, uint16_t *type) -{ - zfs_acl_node_t *aclnode; - - ASSERT(aclp); - - if (start == NULL) { - aclnode = list_head(&aclp->z_acl); - if (aclnode == NULL) - return (NULL); - - aclp->z_next_ace = aclnode->z_acldata; - aclp->z_curr_node = aclnode; - aclnode->z_ace_idx = 0; - } - - aclnode = aclp->z_curr_node; - - if (aclnode == NULL) - return (NULL); - - if 
(aclnode->z_ace_idx >= aclnode->z_ace_count) { - aclnode = list_next(&aclp->z_acl, aclnode); - if (aclnode == NULL) - return (NULL); - else { - aclp->z_curr_node = aclnode; - aclnode->z_ace_idx = 0; - aclp->z_next_ace = aclnode->z_acldata; - } - } - - if (aclnode->z_ace_idx < aclnode->z_ace_count) { - void *acep = aclp->z_next_ace; - size_t ace_size; - - /* - * Make sure we don't overstep our bounds - */ - ace_size = aclp->z_ops.ace_size(acep); - - if (((caddr_t)acep + ace_size) > - ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { - return (NULL); - } - - *iflags = aclp->z_ops.ace_flags_get(acep); - *type = aclp->z_ops.ace_type_get(acep); - *access_mask = aclp->z_ops.ace_mask_get(acep); - *who = aclp->z_ops.ace_who_get(acep); - aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; - aclnode->z_ace_idx++; - - return ((void *)acep); - } - return (NULL); -} - -/*ARGSUSED*/ -static uint64_t -zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, - uint16_t *flags, uint16_t *type, uint32_t *mask) -{ - zfs_acl_t *aclp = datap; - zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; - uint64_t who; - - acep = zfs_acl_next_ace(aclp, acep, &who, mask, - flags, type); - return ((uint64_t)(uintptr_t)acep); -} - -static zfs_acl_node_t * -zfs_acl_curr_node(zfs_acl_t *aclp) -{ - ASSERT(aclp->z_curr_node); - return (aclp->z_curr_node); -} - -/* - * Copy ACE to internal ZFS format. - * While processing the ACL each ACE will be validated for correctness. - * ACE FUIDs will be created later. - */ -int -zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, - void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, - zfs_fuid_info_t **fuidp, cred_t *cr) -{ - int i; - uint16_t entry_type; - zfs_ace_t *aceptr = z_acl; - ace_t *acep = datap; - zfs_object_ace_t *zobjacep; - ace_object_t *aceobjp; - - for (i = 0; i != aclcnt; i++) { - aceptr->z_hdr.z_access_mask = acep->a_access_mask; - aceptr->z_hdr.z_flags = acep->a_flags; - aceptr->z_hdr.z_type = acep->a_type; - entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; - if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && - entry_type != ACE_EVERYONE) { - aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, - cr, (entry_type == 0) ? 
- ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); - } - - /* - * Make sure ACE is valid - */ - if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, - aceptr->z_hdr.z_flags) != B_TRUE) - return (SET_ERROR(EINVAL)); - - switch (acep->a_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - zobjacep = (zfs_object_ace_t *)aceptr; - aceobjp = (ace_object_t *)acep; - - bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, - sizeof (aceobjp->a_obj_type)); - bcopy(aceobjp->a_inherit_obj_type, - zobjacep->z_inherit_type, - sizeof (aceobjp->a_inherit_obj_type)); - acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); - break; - default: - acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); - } - - aceptr = (zfs_ace_t *)((caddr_t)aceptr + - aclp->z_ops.ace_size(aceptr)); - } - - *size = (caddr_t)aceptr - (caddr_t)z_acl; - - return (0); -} - -/* - * Copy ZFS ACEs to fixed size ace_t layout - */ -static void -zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, - void *datap, int filter) -{ - uint64_t who; - uint32_t access_mask; - uint16_t iflags, type; - zfs_ace_hdr_t *zacep = NULL; - ace_t *acep = datap; - ace_object_t *objacep; - zfs_object_ace_t *zobjacep; - size_t ace_size; - uint16_t entry_type; - - while (zacep = zfs_acl_next_ace(aclp, zacep, - &who, &access_mask, &iflags, &type)) { - - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - if (filter) { - continue; - } - zobjacep = (zfs_object_ace_t *)zacep; - objacep = (ace_object_t *)acep; - bcopy(zobjacep->z_object_type, - objacep->a_obj_type, - sizeof (zobjacep->z_object_type)); - bcopy(zobjacep->z_inherit_type, - objacep->a_inherit_obj_type, - sizeof (zobjacep->z_inherit_type)); - ace_size = sizeof (ace_object_t); - break; - default: - ace_size = sizeof (ace_t); - break; - } - - entry_type = (iflags & ACE_TYPE_FLAGS); - if ((entry_type != ACE_OWNER && - entry_type != OWNING_GROUP && - entry_type != ACE_EVERYONE)) { - acep->a_who = zfs_fuid_map_id(zfsvfs, who, - cr, (entry_type & ACE_IDENTIFIER_GROUP) ? - ZFS_ACE_GROUP : ZFS_ACE_USER); - } else { - acep->a_who = (uid_t)(int64_t)who; - } - acep->a_access_mask = access_mask; - acep->a_flags = iflags; - acep->a_type = type; - acep = (ace_t *)((caddr_t)acep + ace_size); - } -} - -static int -zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, - zfs_oldace_t *z_acl, int aclcnt, size_t *size) -{ - int i; - zfs_oldace_t *aceptr = z_acl; - - for (i = 0; i != aclcnt; i++, aceptr++) { - aceptr->z_access_mask = acep[i].a_access_mask; - aceptr->z_type = acep[i].a_type; - aceptr->z_flags = acep[i].a_flags; - aceptr->z_fuid = acep[i].a_who; - /* - * Make sure ACE is valid - */ - if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, - aceptr->z_flags) != B_TRUE) - return (SET_ERROR(EINVAL)); - } - *size = (caddr_t)aceptr - (caddr_t)z_acl; - return (0); -} - -/* - * convert old ACL format to new - */ -void -zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) -{ - zfs_oldace_t *oldaclp; - int i; - uint16_t type, iflags; - uint32_t access_mask; - uint64_t who; - void *cookie = NULL; - zfs_acl_node_t *newaclnode; - - ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); - /* - * First create the ACE in a contiguous piece of memory - * for zfs_copy_ace_2_fuid(). - * - * We only convert an ACL once, so this won't happen - * everytime. 
- */ - oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, - KM_SLEEP); - i = 0; - while (cookie = zfs_acl_next_ace(aclp, cookie, &who, - &access_mask, &iflags, &type)) { - oldaclp[i].z_flags = iflags; - oldaclp[i].z_type = type; - oldaclp[i].z_fuid = who; - oldaclp[i++].z_access_mask = access_mask; - } - - newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * - sizeof (zfs_object_ace_t)); - aclp->z_ops = zfs_acl_fuid_ops; - VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, - oldaclp, newaclnode->z_acldata, aclp->z_acl_count, - &newaclnode->z_size, NULL, cr) == 0); - newaclnode->z_ace_count = aclp->z_acl_count; - aclp->z_version = ZFS_ACL_VERSION; - kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); - - /* - * Release all previous ACL nodes - */ - - zfs_acl_release_nodes(aclp); - - list_insert_head(&aclp->z_acl, newaclnode); - - aclp->z_acl_bytes = newaclnode->z_size; - aclp->z_acl_count = newaclnode->z_ace_count; - -} - -/* - * Convert unix access mask to v4 access mask - */ -static uint32_t -zfs_unix_to_v4(uint32_t access_mask) -{ - uint32_t new_mask = 0; - - if (access_mask & S_IXOTH) - new_mask |= ACE_EXECUTE; - if (access_mask & S_IWOTH) - new_mask |= ACE_WRITE_DATA; - if (access_mask & S_IROTH) - new_mask |= ACE_READ_DATA; - return (new_mask); -} - -static void -zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, - uint16_t access_type, uint64_t fuid, uint16_t entry_type) -{ - uint16_t type = entry_type & ACE_TYPE_FLAGS; - - aclp->z_ops.ace_mask_set(acep, access_mask); - aclp->z_ops.ace_type_set(acep, access_type); - aclp->z_ops.ace_flags_set(acep, entry_type); - if ((type != ACE_OWNER && type != OWNING_GROUP && - type != ACE_EVERYONE)) - aclp->z_ops.ace_who_set(acep, fuid); -} - -/* - * Determine mode of file based on ACL. 
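 *
 * Roughly, the first owner@/group@/everyone@ ACE that mentions a given
 * permission decides the corresponding mode bit ("first match wins",
 * tracked by the 'seen' mask below).  For a hypothetical ACL:
 *
 *	owner@:read_data/write_data:allow	-> u+rw
 *	group@:read_data:allow			-> g+r
 *	everyone@:read_data:deny		-> o bits stay clear
 *
 * A DENY entry marks the bit as decided without setting it; in addition,
 * an execute denial for a specific user or group clears
 * ZFS_NO_EXECS_DENIED in *pflags.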
- */ -uint64_t -zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, - uint64_t *pflags, uint64_t fuid, uint64_t fgid) -{ - int entry_type; - mode_t mode; - mode_t seen = 0; - zfs_ace_hdr_t *acep = NULL; - uint64_t who; - uint16_t iflags, type; - uint32_t access_mask; - boolean_t an_exec_denied = B_FALSE; - - mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); - - while (acep = zfs_acl_next_ace(aclp, acep, &who, - &access_mask, &iflags, &type)) { - - if (!zfs_acl_valid_ace_type(type, iflags)) - continue; - - entry_type = (iflags & ACE_TYPE_FLAGS); - - /* - * Skip over any inherit_only ACEs - */ - if (iflags & ACE_INHERIT_ONLY_ACE) - continue; - - if (entry_type == ACE_OWNER || (entry_type == 0 && - who == fuid)) { - if ((access_mask & ACE_READ_DATA) && - (!(seen & S_IRUSR))) { - seen |= S_IRUSR; - if (type == ALLOW) { - mode |= S_IRUSR; - } - } - if ((access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWUSR))) { - seen |= S_IWUSR; - if (type == ALLOW) { - mode |= S_IWUSR; - } - } - if ((access_mask & ACE_EXECUTE) && - (!(seen & S_IXUSR))) { - seen |= S_IXUSR; - if (type == ALLOW) { - mode |= S_IXUSR; - } - } - } else if (entry_type == OWNING_GROUP || - (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { - if ((access_mask & ACE_READ_DATA) && - (!(seen & S_IRGRP))) { - seen |= S_IRGRP; - if (type == ALLOW) { - mode |= S_IRGRP; - } - } - if ((access_mask & ACE_WRITE_DATA) && - (!(seen & S_IWGRP))) { - seen |= S_IWGRP; - if (type == ALLOW) { - mode |= S_IWGRP; - } - } - if ((access_mask & ACE_EXECUTE) && - (!(seen & S_IXGRP))) { - seen |= S_IXGRP; - if (type == ALLOW) { - mode |= S_IXGRP; - } - } - } else if (entry_type == ACE_EVERYONE) { - if ((access_mask & ACE_READ_DATA)) { - if (!(seen & S_IRUSR)) { - seen |= S_IRUSR; - if (type == ALLOW) { - mode |= S_IRUSR; - } - } - if (!(seen & S_IRGRP)) { - seen |= S_IRGRP; - if (type == ALLOW) { - mode |= S_IRGRP; - } - } - if (!(seen & S_IROTH)) { - seen |= S_IROTH; - if (type == ALLOW) { - mode |= S_IROTH; - } - } - } - if ((access_mask & ACE_WRITE_DATA)) { - if (!(seen & S_IWUSR)) { - seen |= S_IWUSR; - if (type == ALLOW) { - mode |= S_IWUSR; - } - } - if (!(seen & S_IWGRP)) { - seen |= S_IWGRP; - if (type == ALLOW) { - mode |= S_IWGRP; - } - } - if (!(seen & S_IWOTH)) { - seen |= S_IWOTH; - if (type == ALLOW) { - mode |= S_IWOTH; - } - } - } - if ((access_mask & ACE_EXECUTE)) { - if (!(seen & S_IXUSR)) { - seen |= S_IXUSR; - if (type == ALLOW) { - mode |= S_IXUSR; - } - } - if (!(seen & S_IXGRP)) { - seen |= S_IXGRP; - if (type == ALLOW) { - mode |= S_IXGRP; - } - } - if (!(seen & S_IXOTH)) { - seen |= S_IXOTH; - if (type == ALLOW) { - mode |= S_IXOTH; - } - } - } - } else { - /* - * Only care if this IDENTIFIER_GROUP or - * USER ACE denies execute access to someone, - * mode is not affected - */ - if ((access_mask & ACE_EXECUTE) && type == DENY) - an_exec_denied = B_TRUE; - } - } - - /* - * Failure to allow is effectively a deny, so execute permission - * is denied if it was never mentioned or if we explicitly - * weren't allowed it. - */ - if (!an_exec_denied && - ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || - (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) - an_exec_denied = B_TRUE; - - if (an_exec_denied) - *pflags &= ~ZFS_NO_EXECS_DENIED; - else - *pflags |= ZFS_NO_EXECS_DENIED; - - return (mode); -} - -/* - * Read an external acl object. If the intent is to modify, always - * create a new acl and leave any cached acl in place. 
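 *
 * In other words, plain readers share zp->z_acl_cached (populated here
 * under z_acl_lock), while a will_modify caller always gets a freshly
 * read, private copy; the cache itself is invalidated and refreshed by
 * the update path (zfs_aclset_common()/zfs_setacl()).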
- */ -static int -zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) -{ - zfs_acl_t *aclp; - int aclsize; - int acl_count; - zfs_acl_node_t *aclnode; - zfs_acl_phys_t znode_acl; - int version; - int error; - - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT_VOP_LOCKED(ZTOV(zp), __func__); - - if (zp->z_acl_cached && !will_modify) { - *aclpp = zp->z_acl_cached; - return (0); - } - - version = zfs_znode_acl_version(zp); - - if ((error = zfs_acl_znode_info(zp, &aclsize, - &acl_count, &znode_acl)) != 0) { - goto done; - } - - aclp = zfs_acl_alloc(version); - - aclp->z_acl_count = acl_count; - aclp->z_acl_bytes = aclsize; - - aclnode = zfs_acl_node_alloc(aclsize); - aclnode->z_ace_count = aclp->z_acl_count; - aclnode->z_size = aclsize; - - if (!zp->z_is_sa) { - if (znode_acl.z_acl_extern_obj) { - error = dmu_read(zp->z_zfsvfs->z_os, - znode_acl.z_acl_extern_obj, 0, aclnode->z_size, - aclnode->z_acldata, DMU_READ_PREFETCH); - } else { - bcopy(znode_acl.z_ace_data, aclnode->z_acldata, - aclnode->z_size); - } - } else { - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), - aclnode->z_acldata, aclnode->z_size); - } - - if (error != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - goto done; - } - - list_insert_head(&aclp->z_acl, aclnode); - - *aclpp = aclp; - if (!will_modify) - zp->z_acl_cached = aclp; -done: - return (error); -} - -/*ARGSUSED*/ -void -zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, - boolean_t start, void *userdata) -{ - zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; - - if (start) { - cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); - } else { - cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, - cb->cb_acl_node); - } - *dataptr = cb->cb_acl_node->z_acldata; - *length = cb->cb_acl_node->z_size; -} - -int -zfs_acl_chown_setattr(znode_t *zp) -{ - int error; - zfs_acl_t *aclp; - - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT_VOP_IN_SEQC(ZTOV(zp)); - - if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0) - zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, - &zp->z_pflags, zp->z_uid, zp->z_gid); - return (error); -} - -/* - * common code for setting ACLs. - * - * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. - * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's - * already checked the acl and knows whether to inherit. - */ -int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) -{ - int error; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_object_type_t otype; - zfs_acl_locator_cb_t locate = { 0 }; - uint64_t mode; - sa_bulk_attr_t bulk[5]; - uint64_t ctime[2]; - int count = 0; - zfs_acl_phys_t acl_phys; - - ASSERT_VOP_IN_SEQC(ZTOV(zp)); - - mode = zp->z_mode; - - mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, - zp->z_uid, zp->z_gid); - - zp->z_mode = mode; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &mode, sizeof (mode)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, sizeof (ctime)); - - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; - } - - /* - * Upgrade needed? 
- */ - if (!zfsvfs->z_use_fuids) { - otype = DMU_OT_OLDACL; - } else { - if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && - (zfsvfs->z_version >= ZPL_VERSION_FUID)) - zfs_acl_xform(zp, aclp, cr); - ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); - otype = DMU_OT_ACL; - } - - /* - * Arrgh, we have to handle old on disk format - * as well as newer (preferred) SA format. - */ - - if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ - locate.cb_aclp = aclp; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), - zfs_acl_data_locator, &locate, aclp->z_acl_bytes); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), - NULL, &aclp->z_acl_count, sizeof (uint64_t)); - } else { /* Painful legacy way */ - zfs_acl_node_t *aclnode; - uint64_t off = 0; - uint64_t aoid; - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), - &acl_phys, sizeof (acl_phys))) != 0) - return (error); - - aoid = acl_phys.z_acl_extern_obj; - - if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - /* - * If ACL was previously external and we are now - * converting to new ACL format then release old - * ACL object and create a new one. - */ - if (aoid && - aclp->z_version != acl_phys.z_acl_version) { - error = dmu_object_free(zfsvfs->z_os, aoid, tx); - if (error) - return (error); - aoid = 0; - } - if (aoid == 0) { - aoid = dmu_object_alloc(zfsvfs->z_os, - otype, aclp->z_acl_bytes, - otype == DMU_OT_ACL ? - DMU_OT_SYSACL : DMU_OT_NONE, - otype == DMU_OT_ACL ? - DN_OLD_MAX_BONUSLEN : 0, tx); - } else { - (void) dmu_object_set_blocksize(zfsvfs->z_os, - aoid, aclp->z_acl_bytes, 0, tx); - } - acl_phys.z_acl_extern_obj = aoid; - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); - off += aclnode->z_size; - } - } else { - void *start = acl_phys.z_ace_data; - /* - * Migrating back embedded? - */ - if (acl_phys.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - acl_phys.z_acl_extern_obj, tx); - if (error) - return (error); - acl_phys.z_acl_extern_obj = 0; - } - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - bcopy(aclnode->z_acldata, start, - aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } - } - /* - * If Old version then swap count/bytes to match old - * layout of znode_acl_phys_t. - */ - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - acl_phys.z_acl_size = aclp->z_acl_count; - acl_phys.z_acl_count = aclp->z_acl_bytes; - } else { - acl_phys.z_acl_size = aclp->z_acl_bytes; - acl_phys.z_acl_count = aclp->z_acl_count; - } - acl_phys.z_acl_version = aclp->z_version; - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, - &acl_phys, sizeof (acl_phys)); - } - - /* - * Replace ACL wide bits, but first clear them. 
- */ - zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; - - zp->z_pflags |= aclp->z_hints; - - if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) - zp->z_pflags |= ZFS_ACL_TRIVIAL; - - zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE); - return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); -} - -static void -zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, - zfs_acl_t *aclp) -{ - void *acep = NULL; - uint64_t who; - int new_count, new_bytes; - int ace_size; - int entry_type; - uint16_t iflags, type; - uint32_t access_mask; - zfs_acl_node_t *newnode; - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - void *zacep; - boolean_t isdir; - trivial_acl_t masks; - - new_count = new_bytes = 0; - - isdir = (vtype == VDIR); - - acl_trivial_access_masks((mode_t)mode, isdir, &masks); - - newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); - - zacep = newnode->z_acldata; - if (masks.allow0) { - zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); - zacep = (void *)((uintptr_t)zacep + abstract_size); - new_count++; - new_bytes += abstract_size; - } - if (masks.deny1) { - zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); - zacep = (void *)((uintptr_t)zacep + abstract_size); - new_count++; - new_bytes += abstract_size; - } - if (masks.deny2) { - zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); - zacep = (void *)((uintptr_t)zacep + abstract_size); - new_count++; - new_bytes += abstract_size; - } - - while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, - &iflags, &type)) { - entry_type = (iflags & ACE_TYPE_FLAGS); - /* - * ACEs used to represent the file mode may be divided - * into an equivalent pair of inherit-only and regular - * ACEs, if they are inheritable. - * Skip regular ACEs, which are replaced by the new mode. - */ - if (split && (entry_type == ACE_OWNER || - entry_type == OWNING_GROUP || - entry_type == ACE_EVERYONE)) { - if (!isdir || !(iflags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) - continue; - /* - * We preserve owner@, group@, or @everyone - * permissions, if they are inheritable, by - * copying them to inherit_only ACEs. This - * prevents inheritable permissions from being - * altered along with the file mode. - */ - iflags |= ACE_INHERIT_ONLY_ACE; - } - - /* - * If this ACL has any inheritable ACEs, mark that in - * the hints (which are later masked into the pflags) - * so create knows to do inheritance. - */ - if (isdir && (iflags & - (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if ((type != ALLOW && type != DENY) || - (iflags & ACE_INHERIT_ONLY_ACE)) { - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - aclp->z_hints |= ZFS_ACL_OBJ_ACE; - break; - } - } else { - /* - * Limit permissions granted by ACEs to be no greater - * than permissions of the requested group mode. - * Applies when the "aclmode" property is set to - * "groupmask". 
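 *
 * For example, with a requested mode of 0750 the group class is r-x,
 * so a hypothetical inherited entry such as
 *
 *	user:joe:read_data/write_data:allow
 *
 * would be trimmed to read_data only.  DENY entries and inherit-only
 * entries are carried over untouched.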
- */ - if ((type == ALLOW) && trim) - access_mask &= masks.group; - } - zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); - ace_size = aclp->z_ops.ace_size(acep); - zacep = (void *)((uintptr_t)zacep + ace_size); - new_count++; - new_bytes += ace_size; - } - zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); - zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); - zacep = (void *)((uintptr_t)zacep + abstract_size); - zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); - - new_count += 3; - new_bytes += abstract_size * 3; - zfs_acl_release_nodes(aclp); - aclp->z_acl_count = new_count; - aclp->z_acl_bytes = new_bytes; - newnode->z_ace_count = new_count; - newnode->z_size = new_bytes; - list_insert_tail(&aclp->z_acl, newnode); -} - -int -zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) -{ - int error = 0; - - mutex_enter(&zp->z_acl_lock); - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) - *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); - else - error = zfs_acl_node_read(zp, aclp, B_TRUE); - - if (error == 0) { - (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; - zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, - (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); - } - mutex_exit(&zp->z_acl_lock); - - return (error); -} - -/* - * Should ACE be inherited? - */ -static int -zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) -{ - int iflags = (acep_flags & 0xf); - - if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) - return (1); - else if (iflags & ACE_FILE_INHERIT_ACE) - return (!((vtype == VDIR) && - (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); - return (0); -} - -/* - * inherit inheritable ACEs from parent - */ -static zfs_acl_t * -zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, - uint64_t mode, boolean_t *need_chmod) -{ - void *pacep = NULL; - void *acep; - zfs_acl_node_t *aclnode; - zfs_acl_t *aclp = NULL; - uint64_t who; - uint32_t access_mask; - uint16_t iflags, newflags, type; - size_t ace_size; - void *data1, *data2; - size_t data1sz, data2sz; - uint_t aclinherit; - boolean_t isdir = (vtype == VDIR); - boolean_t isreg = (vtype == VREG); - - *need_chmod = B_TRUE; - - aclp = zfs_acl_alloc(paclp->z_version); - aclinherit = zfsvfs->z_acl_inherit; - if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) - return (aclp); - - while (pacep = zfs_acl_next_ace(paclp, pacep, &who, - &access_mask, &iflags, &type)) { - - /* - * don't inherit bogus ACEs - */ - if (!zfs_acl_valid_ace_type(type, iflags)) - continue; - - /* - * Check if ACE is inheritable by this vnode - */ - if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || - !zfs_ace_can_use(vtype, iflags)) - continue; - - /* - * If owner@, group@, or everyone@ inheritable - * then zfs_acl_chmod() isn't needed. 
- */ - if ((aclinherit == ZFS_ACL_PASSTHROUGH || - aclinherit == ZFS_ACL_PASSTHROUGH_X) && - ((iflags & (ACE_OWNER|ACE_EVERYONE)) || - ((iflags & OWNING_GROUP) == OWNING_GROUP)) && - (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) - *need_chmod = B_FALSE; - - /* - * Strip inherited execute permission from file if - * not in mode - */ - if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && - !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { - access_mask &= ~ACE_EXECUTE; - } - - /* - * Strip write_acl and write_owner from permissions - * when inheriting an ACE - */ - if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { - access_mask &= ~RESTRICTED_CLEAR; - } - - ace_size = aclp->z_ops.ace_size(pacep); - aclnode = zfs_acl_node_alloc(ace_size); - list_insert_tail(&aclp->z_acl, aclnode); - acep = aclnode->z_acldata; - - zfs_set_ace(aclp, acep, access_mask, type, - who, iflags|ACE_INHERITED_ACE); - - /* - * Copy special opaque data if any - */ - if ((data1sz = paclp->z_ops.ace_data(pacep, &data1)) != 0) { - VERIFY((data2sz = aclp->z_ops.ace_data(acep, - &data2)) == data1sz); - bcopy(data1, data2, data2sz); - } - - aclp->z_acl_count++; - aclnode->z_ace_count++; - aclp->z_acl_bytes += aclnode->z_size; - newflags = aclp->z_ops.ace_flags_get(acep); - - /* - * If ACE is not to be inherited further, or if the vnode is - * not a directory, remove all inheritance flags - */ - if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { - newflags &= ~ALL_INHERIT; - aclp->z_ops.ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - continue; - } - - /* - * This directory has an inheritable ACE - */ - aclp->z_hints |= ZFS_INHERIT_ACE; - - /* - * If only FILE_INHERIT is set then turn on - * inherit_only - */ - if ((iflags & (ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { - newflags |= ACE_INHERIT_ONLY_ACE; - aclp->z_ops.ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - } else { - newflags &= ~ACE_INHERIT_ONLY_ACE; - aclp->z_ops.ace_flags_set(acep, - newflags|ACE_INHERITED_ACE); - } - } - - return (aclp); -} - -/* - * Create file system object initial permissions - * including inheritable ACEs. - * Also, create FUIDs for owner and group. - */ -int -zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, - vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) -{ - int error; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_acl_t *paclp; - gid_t gid; - boolean_t need_chmod = B_TRUE; - boolean_t trim = B_FALSE; - boolean_t inherited = B_FALSE; - - if ((flag & IS_ROOT_NODE) == 0) - ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); - else - ASSERT(dzp->z_vnode == NULL); - bzero(acl_ids, sizeof (zfs_acl_ids_t)); - acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); - - if (vsecp) - if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, - &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) - return (error); - /* - * Determine uid and gid. 
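 *
 * Replay, the root node, and extended-attribute directories take
 * va_uid/va_gid as given.  Otherwise the uid comes from the caller's
 * credential and the gid from AT_GID when permitted; failing that it
 * falls back to the parent directory's group (on FreeBSD this fallback
 * is taken regardless of the parent's set-gid bit).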
- */ - if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || - ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - acl_ids->z_fuid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_uid, cr, - ZFS_OWNER, &acl_ids->z_fuidp); - acl_ids->z_fgid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_gid, cr, - ZFS_GROUP, &acl_ids->z_fuidp); - gid = vap->va_gid; - } else { - acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, - cr, &acl_ids->z_fuidp); - acl_ids->z_fgid = 0; - if (vap->va_mask & AT_GID) { - acl_ids->z_fgid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_gid, - cr, ZFS_GROUP, &acl_ids->z_fuidp); - gid = vap->va_gid; - if (acl_ids->z_fgid != dzp->z_gid && - !groupmember(vap->va_gid, cr) && - secpolicy_vnode_create_gid(cr) != 0) - acl_ids->z_fgid = 0; - } - if (acl_ids->z_fgid == 0) { -#ifndef __FreeBSD_kernel__ - if (dzp->z_mode & S_ISGID) { -#endif - char *domain; - uint32_t rid; - - acl_ids->z_fgid = dzp->z_gid; - gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, - cr, ZFS_GROUP); - - if (zfsvfs->z_use_fuids && - IS_EPHEMERAL(acl_ids->z_fgid)) { - domain = zfs_fuid_idx_domain( - &zfsvfs->z_fuid_idx, - FUID_INDEX(acl_ids->z_fgid)); - rid = FUID_RID(acl_ids->z_fgid); - zfs_fuid_node_add(&acl_ids->z_fuidp, - domain, rid, - FUID_INDEX(acl_ids->z_fgid), - acl_ids->z_fgid, ZFS_GROUP); - } -#ifndef __FreeBSD_kernel__ - } else { - acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, - ZFS_GROUP, cr, &acl_ids->z_fuidp); - gid = crgetgid(cr); - } -#endif - } - } - - /* - * If we're creating a directory, and the parent directory has the - * set-GID bit set, set in on the new directory. - * Otherwise, if the user is neither privileged nor a member of the - * file's new group, clear the file's set-GID bit. - */ - - if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && - (vap->va_type == VDIR)) { - acl_ids->z_mode |= S_ISGID; - } else { - if ((acl_ids->z_mode & S_ISGID) && - secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) - acl_ids->z_mode &= ~S_ISGID; - } - - if (acl_ids->z_aclp == NULL) { - mutex_enter(&dzp->z_acl_lock); - if (!(flag & IS_ROOT_NODE) && - (dzp->z_pflags & ZFS_INHERIT_ACE) && - !(dzp->z_pflags & ZFS_XATTR)) { - VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); - acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, - vap->va_type, paclp, acl_ids->z_mode, &need_chmod); - inherited = B_TRUE; - } else { - acl_ids->z_aclp = - zfs_acl_alloc(zfs_acl_version_zp(dzp)); - acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; - } - mutex_exit(&dzp->z_acl_lock); - - if (need_chmod) { - if (vap->va_type == VDIR) - acl_ids->z_aclp->z_hints |= - ZFS_ACL_AUTO_INHERIT; - - if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && - zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && - zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) - trim = B_TRUE; - zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, - trim, acl_ids->z_aclp); - } - } - - if (inherited || vsecp) { - acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, - acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, - acl_ids->z_fuid, acl_ids->z_fgid); - if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) - acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; - } - - return (0); -} - -/* - * Free ACL and fuid_infop, but not the acl_ids structure - */ -void -zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) -{ - if (acl_ids->z_aclp) - zfs_acl_free(acl_ids->z_aclp); - if (acl_ids->z_fuidp) - zfs_fuid_info_free(acl_ids->z_fuidp); - acl_ids->z_aclp = NULL; - acl_ids->z_fuidp = NULL; -} - -boolean_t -zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) -{ - return 
(zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || - zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); -} - -/* - * Retrieve a file's ACL - */ -int -zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) -{ - zfs_acl_t *aclp; - ulong_t mask; - int error; - int count = 0; - int largeace = 0; - - mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | - VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); - - if (mask == 0) - return (SET_ERROR(ENOSYS)); - - if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) - return (error); - - mutex_enter(&zp->z_acl_lock); - - ASSERT_VOP_LOCKED(ZTOV(zp), __func__); - error = zfs_acl_node_read(zp, &aclp, B_FALSE); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - /* - * Scan ACL to determine number of ACEs - */ - if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { - void *zacep = NULL; - uint64_t who; - uint32_t access_mask; - uint16_t type, iflags; - - while (zacep = zfs_acl_next_ace(aclp, zacep, - &who, &access_mask, &iflags, &type)) { - switch (type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - largeace++; - continue; - default: - count++; - } - } - vsecp->vsa_aclcnt = count; - } else - count = (int)aclp->z_acl_count; - - if (mask & VSA_ACECNT) { - vsecp->vsa_aclcnt = count; - } - - if (mask & VSA_ACE) { - size_t aclsz; - - aclsz = count * sizeof (ace_t) + - sizeof (ace_object_t) * largeace; - - vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); - vsecp->vsa_aclentsz = aclsz; - - if (aclp->z_version == ZFS_ACL_VERSION_FUID) - zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, - vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); - else { - zfs_acl_node_t *aclnode; - void *start = vsecp->vsa_aclentp; - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - bcopy(aclnode->z_acldata, start, - aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } - ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == - aclp->z_acl_bytes); - } - } - if (mask & VSA_ACE_ACLFLAGS) { - vsecp->vsa_aclflags = 0; - if (zp->z_pflags & ZFS_ACL_DEFAULTED) - vsecp->vsa_aclflags |= ACL_DEFAULTED; - if (zp->z_pflags & ZFS_ACL_PROTECTED) - vsecp->vsa_aclflags |= ACL_PROTECTED; - if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) - vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; - } - - mutex_exit(&zp->z_acl_lock); - - return (0); -} - -int -zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, - vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) -{ - zfs_acl_t *aclp; - zfs_acl_node_t *aclnode; - int aclcnt = vsecp->vsa_aclcnt; - int error; - - if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) - return (SET_ERROR(EINVAL)); - - aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); - - aclp->z_hints = 0; - aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, - (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, - aclcnt, &aclnode->z_size)) != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - return (error); - } - } else { - if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, - vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, - &aclnode->z_size, fuidp, cr)) != 0) { - zfs_acl_free(aclp); - zfs_acl_node_free(aclnode); - return (error); - } - } - aclp->z_acl_bytes = aclnode->z_size; - aclnode->z_ace_count = 
aclcnt; - aclp->z_acl_count = aclcnt; - list_insert_head(&aclp->z_acl, aclnode); - - /* - * If flags are being set then add them to z_hints - */ - if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { - if (vsecp->vsa_aclflags & ACL_PROTECTED) - aclp->z_hints |= ZFS_ACL_PROTECTED; - if (vsecp->vsa_aclflags & ACL_DEFAULTED) - aclp->z_hints |= ZFS_ACL_DEFAULTED; - if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) - aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; - } - - *zaclp = aclp; - - return (0); -} - -/* - * Set a file's ACL - */ -int -zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); - dmu_tx_t *tx; - int error; - zfs_acl_t *aclp; - zfs_fuid_info_t *fuidp = NULL; - boolean_t fuid_dirtied; - uint64_t acl_obj; - - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - if (mask == 0) - return (SET_ERROR(ENOSYS)); - - if (zp->z_pflags & ZFS_IMMUTABLE) - return (SET_ERROR(EPERM)); - - if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) - return (error); - - error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, - &aclp); - if (error) - return (error); - - /* - * If ACL wide flags aren't being set then preserve any - * existing flags. - */ - if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { - aclp->z_hints |= - (zp->z_pflags & V4_ACL_WIDE_FLAGS); - } -top: - mutex_enter(&zp->z_acl_lock); - - tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - - /* - * If old version and ACL won't fit in bonus and we aren't - * upgrading then take out necessary DMU holds - */ - - if ((acl_obj = zfs_external_acl(zp)) != 0) { - if (zfsvfs->z_version >= ZPL_VERSION_FUID && - zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, acl_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); - } - } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); - } - - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_NOWAIT); - if (error) { - mutex_exit(&zp->z_acl_lock); - - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - zfs_acl_free(aclp); - return (error); - } - - error = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT(error == 0); - ASSERT(zp->z_acl_cached == NULL); - zp->z_acl_cached = aclp; - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - zfs_log_acl(zilog, tx, zp, vsecp, fuidp); - - if (fuidp) - zfs_fuid_info_free(fuidp); - dmu_tx_commit(tx); - mutex_exit(&zp->z_acl_lock); - - return (error); -} - -/* - * Check accesses of interest (AoI) against attributes of the dataset - * such as read-only. Returns zero if no AoI conflict with dataset - * attributes, otherwise an appropriate errno is returned. - */ -static int -zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) -{ - if ((v4_mode & WRITE_MASK) && - (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && - (!IS_DEVVP(ZTOV(zp)) || - (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { - return (SET_ERROR(EROFS)); - } - - /* - * Intentionally allow ZFS_READONLY through here. - * See zfs_zaccess_common(). 
- */ - if ((v4_mode & WRITE_MASK_DATA) && - (zp->z_pflags & ZFS_IMMUTABLE)) { - return (SET_ERROR(EPERM)); - } - -#ifdef illumos - if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && - (zp->z_pflags & ZFS_NOUNLINK)) { - return (SET_ERROR(EPERM)); - } -#else - /* - * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK - * (sunlnk) is set. We just don't allow directory removal, which is - * handled in zfs_zaccess_delete(). - */ - if ((v4_mode & ACE_DELETE) && - (zp->z_pflags & ZFS_NOUNLINK)) { - return (EPERM); - } -#endif - - if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && - (zp->z_pflags & ZFS_AV_QUARANTINED))) { - return (SET_ERROR(EACCES)); - } - - return (0); -} - -/* - * The primary usage of this function is to loop through all of the - * ACEs in the znode, determining what accesses of interest (AoI) to - * the caller are allowed or denied. The AoI are expressed as bits in - * the working_mode parameter. As each ACE is processed, bits covered - * by that ACE are removed from the working_mode. This removal - * facilitates two things. The first is that when the working mode is - * empty (= 0), we know we've looked at all the AoI. The second is - * that the ACE interpretation rules don't allow a later ACE to undo - * something granted or denied by an earlier ACE. Removing the - * discovered access or denial enforces this rule. At the end of - * processing the ACEs, all AoI that were found to be denied are - * placed into the working_mode, giving the caller a mask of denied - * accesses. Returns: - * 0 if all AoI granted - * EACCESS if the denied mask is non-zero - * other error if abnormal failure (e.g., IO error) - * - * A secondary usage of the function is to determine if any of the - * AoI are granted. If an ACE grants any access in - * the working_mode, we immediately short circuit out of the function. - * This mode is chosen by setting anyaccess to B_TRUE. The - * working_mode is not a denied access mask upon exit if the function - * is used in this manner. 
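 *
 * A sketch of the normal (anyaccess == B_FALSE) case, for a request of
 * read and write made by the file's owner against a hypothetical ACL:
 *
 *	working_mode = READ_DATA|WRITE_DATA
 *	owner@:read_data:allow   matches READ_DATA  -> bit cleared
 *	owner@:write_data:deny   matches WRITE_DATA -> bit cleared and
 *	                                               recorded in deny_mask
 *	end of ACL: deny_mask is put back into working_mode and EACCES
 *	is returned (the caller may still override this via privileges).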
- */ -static int -zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - boolean_t anyaccess, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zfs_acl_t *aclp; - int error; - uid_t uid = crgetuid(cr); - uint64_t who; - uint16_t type, iflags; - uint16_t entry_type; - uint32_t access_mask; - uint32_t deny_mask = 0; - zfs_ace_hdr_t *acep = NULL; - boolean_t checkit; - uid_t gowner; - uid_t fowner; - - zfs_fuid_map_ids(zp, cr, &fowner, &gowner); - - mutex_enter(&zp->z_acl_lock); - - ASSERT_VOP_LOCKED(ZTOV(zp), __func__); - error = zfs_acl_node_read(zp, &aclp, B_FALSE); - if (error != 0) { - mutex_exit(&zp->z_acl_lock); - return (error); - } - - ASSERT(zp->z_acl_cached); - - while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, - &iflags, &type)) { - uint32_t mask_matched; - - if (!zfs_acl_valid_ace_type(type, iflags)) - continue; - - if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) - continue; - - /* Skip ACE if it does not affect any AoI */ - mask_matched = (access_mask & *working_mode); - if (!mask_matched) - continue; - - entry_type = (iflags & ACE_TYPE_FLAGS); - - checkit = B_FALSE; - - switch (entry_type) { - case ACE_OWNER: - if (uid == fowner) - checkit = B_TRUE; - break; - case OWNING_GROUP: - who = gowner; - /*FALLTHROUGH*/ - case ACE_IDENTIFIER_GROUP: - checkit = zfs_groupmember(zfsvfs, who, cr); - break; - case ACE_EVERYONE: - checkit = B_TRUE; - break; - - /* USER Entry */ - default: - if (entry_type == 0) { - uid_t newid; - - newid = zfs_fuid_map_id(zfsvfs, who, cr, - ZFS_ACE_USER); - if (newid != IDMAP_WK_CREATOR_OWNER_UID && - uid == newid) - checkit = B_TRUE; - break; - } else { - mutex_exit(&zp->z_acl_lock); - return (SET_ERROR(EIO)); - } - } - - if (checkit) { - if (type == DENY) { - DTRACE_PROBE3(zfs__ace__denies, - znode_t *, zp, - zfs_ace_hdr_t *, acep, - uint32_t, mask_matched); - deny_mask |= mask_matched; - } else { - DTRACE_PROBE3(zfs__ace__allows, - znode_t *, zp, - zfs_ace_hdr_t *, acep, - uint32_t, mask_matched); - if (anyaccess) { - mutex_exit(&zp->z_acl_lock); - return (0); - } - } - *working_mode &= ~mask_matched; - } - - /* Are we done? */ - if (*working_mode == 0) - break; - } - - mutex_exit(&zp->z_acl_lock); - - /* Put the found 'denies' back on the working mode */ - if (deny_mask) { - *working_mode |= deny_mask; - return (SET_ERROR(EACCES)); - } else if (*working_mode) { - return (-1); - } - - return (0); -} - -/* - * Return true if any access whatsoever granted, we don't actually - * care what access is granted. - */ -boolean_t -zfs_has_access(znode_t *zp, cred_t *cr) -{ - uint32_t have = ACE_ALL_PERMS; - - if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { - uid_t owner; - - owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); - return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); - } - return (B_TRUE); -} - -static int -zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int err; - - *working_mode = v4_mode; - *check_privs = B_TRUE; - - /* - * Short circuit empty requests - */ - if (v4_mode == 0 || zfsvfs->z_replay) { - *working_mode = 0; - return (0); - } - - if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { - *check_privs = B_FALSE; - return (err); - } - - /* - * The caller requested that the ACL check be skipped. This - * would only happen if the caller checked VOP_ACCESS() with a - * 32 bit ACE mask and already had the appropriate permissions. 
- */ - if (skipaclchk) { - *working_mode = 0; - return (0); - } - - /* - * Note: ZFS_READONLY represents the "DOS R/O" attribute. - * When that flag is set, we should behave as if write access - * were not granted by anything in the ACL. In particular: - * We _must_ allow writes after opening the file r/w, then - * setting the DOS R/O attribute, and writing some more. - * (Similar to how you can write after fchmod(fd, 0444).) - * - * Therefore ZFS_READONLY is ignored in the dataset check - * above, and checked here as if part of the ACL check. - * Also note: DOS R/O is ignored for directories. - */ - if ((v4_mode & WRITE_MASK_DATA) && - (ZTOV(zp)->v_type != VDIR) && - (zp->z_pflags & ZFS_READONLY)) { - return (SET_ERROR(EPERM)); - } - - return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); -} - -static int -zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, - cred_t *cr) -{ - if (*working_mode != ACE_WRITE_DATA) - return (SET_ERROR(EACCES)); - - return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, - check_privs, B_FALSE, cr)); -} - -/* - * Check if VEXEC is allowed. - * - * This routine is based on zfs_fastaccesschk_execute which has slowpath - * calling zfs_zaccess. This would be incorrect on FreeBSD (see - * zfs_freebsd_access for the difference). Thus this variant let's the - * caller handle the slowpath (if necessary). - * - * We only check for ZFS_NO_EXECS_DENIED and fail early. This routine can - * be extended to cover more cases, but the flag covers the majority. - */ -int -zfs_freebsd_fastaccesschk_execute(struct vnode *vp, cred_t *cr) -{ - boolean_t is_attr; - znode_t *zdp = VTOZ(vp); - - ASSERT_VOP_LOCKED(vp, __func__); - - if (zdp->z_pflags & ZFS_AV_QUARANTINED) - return (1); - - is_attr = ((zdp->z_pflags & ZFS_XATTR) && - (ZTOV(zdp)->v_type == VDIR)); - if (is_attr) - return (1); - - if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) - return (0); - - return (1); -} - -#ifdef illumos -int -zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) -{ - boolean_t owner = B_FALSE; - boolean_t groupmbr = B_FALSE; - boolean_t is_attr; - uid_t uid = crgetuid(cr); - int error; - - if (zdp->z_pflags & ZFS_AV_QUARANTINED) - return (SET_ERROR(EACCES)); - - is_attr = ((zdp->z_pflags & ZFS_XATTR) && - (ZTOV(zdp)->v_type == VDIR)); - if (is_attr) - goto slow; - - - mutex_enter(&zdp->z_acl_lock); - - if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } - - if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { - mutex_exit(&zdp->z_acl_lock); - goto slow; - } - - if (uid == zdp->z_uid) { - owner = B_TRUE; - if (zdp->z_mode & S_IXUSR) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } else { - mutex_exit(&zdp->z_acl_lock); - goto slow; - } - } - if (groupmember(zdp->z_gid, cr)) { - groupmbr = B_TRUE; - if (zdp->z_mode & S_IXGRP) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } else { - mutex_exit(&zdp->z_acl_lock); - goto slow; - } - } - if (!owner && !groupmbr) { - if (zdp->z_mode & S_IXOTH) { - mutex_exit(&zdp->z_acl_lock); - return (0); - } - } - - mutex_exit(&zdp->z_acl_lock); - -slow: - DTRACE_PROBE(zfs__fastpath__execute__access__miss); - ZFS_ENTER(zdp->z_zfsvfs); - error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); - ZFS_EXIT(zdp->z_zfsvfs); - return (error); -} -#endif - -/* - * Determine whether Access should be granted/denied. - * - * The least priv subsystem is always consulted as a basic privilege - * can define any form of access. 
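 *
 * The requested NFSv4 bits are also folded into classic vnode bits for
 * that privilege check, roughly:
 *
 *	ACE_READ_DATA, ACE_READ_ACL, ...	-> VREAD
 *	ACE_WRITE_DATA, ACE_APPEND_DATA, ...	-> VWRITE
 *	ACE_EXECUTE				-> VEXEC
 *
 * so a sufficiently privileged caller can pass secpolicy_vnode_access2()
 * even when the ACL itself denies the access.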
- */ -int -zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) -{ - uint32_t working_mode; - int error; - int is_attr; - boolean_t check_privs; - znode_t *xzp; - znode_t *check_zp = zp; - mode_t needed_bits; - uid_t owner; - - is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); - -#ifdef __FreeBSD_kernel__ - /* - * In FreeBSD, we don't care about permissions of individual ADS. - * Note that not checking them is not just an optimization - without - * this shortcut, EA operations may bogusly fail with EACCES. - */ - if (zp->z_pflags & ZFS_XATTR) - return (0); -#else - /* - * If attribute then validate against base file - */ - if (is_attr) { - uint64_t parent; - - if ((error = sa_lookup(zp->z_sa_hdl, - SA_ZPL_PARENT(zp->z_zfsvfs), &parent, - sizeof (parent))) != 0) - return (error); - - if ((error = zfs_zget(zp->z_zfsvfs, - parent, &xzp)) != 0) { - return (error); - } - - check_zp = xzp; - - /* - * fixup mode to map to xattr perms - */ - - if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { - mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - mode |= ACE_WRITE_NAMED_ATTRS; - } - - if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { - mode &= ~(ACE_READ_DATA|ACE_EXECUTE); - mode |= ACE_READ_NAMED_ATTRS; - } - } -#endif - - owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); - /* - * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC - * in needed_bits. Map the bits mapped by working_mode (currently - * missing) in missing_bits. - * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), - * needed_bits. - */ - needed_bits = 0; - - working_mode = mode; - if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && - owner == crgetuid(cr)) - working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); - - if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| - ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) - needed_bits |= VREAD; - if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| - ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) - needed_bits |= VWRITE; - if (working_mode & ACE_EXECUTE) - needed_bits |= VEXEC; - - if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, - &check_privs, skipaclchk, cr)) == 0) { - if (is_attr) - VN_RELE(ZTOV(xzp)); - return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, - needed_bits, needed_bits)); - } - - if (error && !check_privs) { - if (is_attr) - VN_RELE(ZTOV(xzp)); - return (error); - } - - if (error && (flags & V_APPEND)) { - error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); - } - - if (error && check_privs) { - mode_t checkmode = 0; - - /* - * First check for implicit owner permission on - * read_acl/read_attributes - */ - - error = 0; - ASSERT(working_mode != 0); - - if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && - owner == crgetuid(cr))) - working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); - - if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| - ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) - checkmode |= VREAD; - if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| - ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) - checkmode |= VWRITE; - if (working_mode & ACE_EXECUTE) - checkmode |= VEXEC; - - error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, - needed_bits & ~checkmode, needed_bits); - - if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); - if (error == 0 && (working_mode & ACE_WRITE_ACL)) - error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner); - - 
if (error == 0 && (working_mode & - (ACE_DELETE|ACE_DELETE_CHILD))) - error = secpolicy_vnode_remove(ZTOV(check_zp), cr); - - if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); - } - if (error == 0) { - /* - * See if any bits other than those already checked - * for are still present. If so then return EACCES - */ - if (working_mode & ~(ZFS_CHECKED_MASKS)) { - error = SET_ERROR(EACCES); - } - } - } else if (error == 0) { - error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, - needed_bits, needed_bits); - } - - - if (is_attr) - VN_RELE(ZTOV(xzp)); - - return (error); -} - -/* - * Translate traditional unix VREAD/VWRITE/VEXEC mode into - * native ACL format and call zfs_zaccess() - */ -int -zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) -{ - return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); -} - -/* - * Access function for secpolicy_vnode_setattr - */ -int -zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) -{ - int v4_mode = zfs_unix_to_v4(mode >> 6); - - return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); -} - -static int -zfs_delete_final_check(znode_t *zp, znode_t *dzp, - mode_t available_perms, cred_t *cr) -{ - int error; - uid_t downer; - - downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); - - error = secpolicy_vnode_access2(cr, ZTOV(dzp), - downer, available_perms, VWRITE|VEXEC); - - if (error == 0) - error = zfs_sticky_remove_access(dzp, zp, cr); - - return (error); -} - -/* - * Determine whether Access should be granted/deny, without - * consulting least priv subsystem. - * - * The following chart is the recommended NFSv4 enforcement for - * ability to delete an object. - * - * ------------------------------------------------------- - * | Parent Dir | Target Object Permissions | - * | permissions | | - * ------------------------------------------------------- - * | | ACL Allows | ACL Denies| Delete | - * | | Delete | Delete | unspecified| - * ------------------------------------------------------- - * | ACL Allows | Permit | Permit | Permit | - * | DELETE_CHILD | | - * ------------------------------------------------------- - * | ACL Denies | Permit | Deny | Deny | - * | DELETE_CHILD | | | | - * ------------------------------------------------------- - * | ACL specifies | | | | - * | only allow | Permit | Permit | Permit | - * | write and | | | | - * | execute | | | | - * ------------------------------------------------------- - * | ACL denies | | | | - * | write and | Permit | Deny | Deny | - * | execute | | | | - * ------------------------------------------------------- - * ^ - * | - * No search privilege, can't even look up file? - * - */ -int -zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) -{ - uint32_t dzp_working_mode = 0; - uint32_t zp_working_mode = 0; - int dzp_error, zp_error; - mode_t available_perms; - boolean_t dzpcheck_privs = B_TRUE; - boolean_t zpcheck_privs = B_TRUE; - - /* - * We want specific DELETE permissions to - * take precedence over WRITE/EXECUTE. We don't - * want an ACL such as this to mess us up. - * user:joe:write_data:deny,user:joe:delete:allow - * - * However, deny permissions may ultimately be overridden - * by secpolicy_vnode_access(). - * - * We will ask for all of the necessary permissions and then - * look at the working modes from the directory and target object - * to determine what was found. 
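 *
 * Worked example for a hypothetical user "joe" removing a file whose
 * parent directory carries user:joe:delete_child:deny:
 *
 *	target allows delete		-> removal permitted
 *					   (specific DELETE wins)
 *	target denies delete, or is
 *	silent about it			-> removal denied unless
 *					   secpolicy_vnode_remove() grants it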
- */ - - if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) - return (SET_ERROR(EPERM)); - - /* - * First row - * If the directory permissions allow the delete, we are done. - */ - if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - /* - * If target object has delete permission then we are done - */ - if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, - &zpcheck_privs, B_FALSE, cr)) == 0) - return (0); - - ASSERT(dzp_error && zp_error); - - if (!dzpcheck_privs) - return (dzp_error); - if (!zpcheck_privs) - return (zp_error); - - /* - * Second row - * - * If directory returns EACCES then delete_child was denied - * due to deny delete_child. In this case send the request through - * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() - * since that *could* allow the delete based on write/execute permission - * and we want delete permissions to override write/execute. - */ - - if (dzp_error == EACCES) - return (secpolicy_vnode_remove(ZTOV(dzp), cr)); /* XXXPJD: s/dzp/zp/ ? */ - - /* - * Third Row - * only need to see if we have write/execute on directory. - */ - - dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); - - if (dzp_error != 0 && !dzpcheck_privs) - return (dzp_error); - - /* - * Fourth row - */ - - available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; - available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; - - return (zfs_delete_final_check(zp, dzp, available_perms, cr)); - -} - -int -zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr) -{ - int add_perm; - int error; - - if (szp->z_pflags & ZFS_AV_QUARANTINED) - return (SET_ERROR(EACCES)); - - add_perm = (ZTOV(szp)->v_type == VDIR) ? - ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; - - /* - * Rename permissions are combination of delete permission + - * add file/subdir permission. - * - * BSD operating systems also require write permission - * on the directory being moved from one parent directory - * to another. - */ - if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { - if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)) - return (error); - } - - /* - * first make sure we do the delete portion. - * - * If that succeeds then check for add_file/add_subdir permissions - */ - - if (error = zfs_zaccess_delete(sdzp, szp, cr)) - return (error); - - /* - * If we have a tzp, see if we can delete it? - */ - if (tzp) { - if (error = zfs_zaccess_delete(tdzp, tzp, cr)) - return (error); - } - - /* - * Now check for add permissions - */ - error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); - - return (error); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include -#include -#include - -void -zfs_oldace_byteswap(ace_t *ace, int ace_cnt) -{ - int i; - - for (i = 0; i != ace_cnt; i++, ace++) { - ace->a_who = BSWAP_32(ace->a_who); - ace->a_access_mask = BSWAP_32(ace->a_access_mask); - ace->a_flags = BSWAP_16(ace->a_flags); - ace->a_type = BSWAP_16(ace->a_type); - } -} - -/* - * swap ace_t and ace_oject_t - */ -void -zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) -{ - caddr_t end; - caddr_t ptr; - zfs_ace_t *zacep = NULL; - ace_t *acep; - uint16_t entry_type; - size_t entry_size; - int ace_type; - - end = (caddr_t)buf + size; - ptr = buf; - - while (ptr < end) { - if (zfs_layout) { - /* - * Avoid overrun. Embedded aces can have one - * of several sizes. We don't know exactly - * how many our present, only the size of the - * buffer containing them. That size may be - * larger than needed to hold the aces - * present. As long as we do not do any - * swapping beyond the end of our block we are - * okay. It it safe to swap any non-ace data - * within the block since it is just zeros. - */ - if (ptr + sizeof (zfs_ace_hdr_t) > end) { - break; - } - zacep = (zfs_ace_t *)ptr; - zacep->z_hdr.z_access_mask = - BSWAP_32(zacep->z_hdr.z_access_mask); - zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags); - ace_type = zacep->z_hdr.z_type = - BSWAP_16(zacep->z_hdr.z_type); - entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; - } else { - /* Overrun avoidance */ - if (ptr + sizeof (ace_t) > end) { - break; - } - acep = (ace_t *)ptr; - acep->a_access_mask = BSWAP_32(acep->a_access_mask); - acep->a_flags = BSWAP_16(acep->a_flags); - ace_type = acep->a_type = BSWAP_16(acep->a_type); - acep->a_who = BSWAP_32(acep->a_who); - entry_type = acep->a_flags & ACE_TYPE_FLAGS; - } - switch (entry_type) { - case ACE_OWNER: - case ACE_EVERYONE: - case (ACE_IDENTIFIER_GROUP | ACE_GROUP): - entry_size = zfs_layout ? - sizeof (zfs_ace_hdr_t) : sizeof (ace_t); - break; - case ACE_IDENTIFIER_GROUP: - default: - /* Overrun avoidance */ - if (zfs_layout) { - if (ptr + sizeof (zfs_ace_t) <= end) { - zacep->z_fuid = BSWAP_64(zacep->z_fuid); - } else { - entry_size = sizeof (zfs_ace_t); - break; - } - } - switch (ace_type) { - case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: - case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: - case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: - case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: - entry_size = zfs_layout ? - sizeof (zfs_object_ace_t) : - sizeof (ace_object_t); - break; - default: - entry_size = zfs_layout ? 
sizeof (zfs_ace_t) : - sizeof (ace_t); - break; - } - } - ptr = ptr + entry_size; - } -} - -/* ARGSUSED */ -void -zfs_oldacl_byteswap(void *buf, size_t size) -{ - int cnt; - - /* - * Arggh, since we don't know how many ACEs are in - * the array, we have to swap the entire block - */ - - cnt = size / sizeof (ace_t); - - zfs_oldace_byteswap((ace_t *)buf, cnt); -} - -/* ARGSUSED */ -void -zfs_acl_byteswap(void *buf, size_t size) -{ - zfs_ace_byteswap(buf, size, B_TRUE); -} - -void -zfs_znode_byteswap(void *buf, size_t size) -{ - znode_phys_t *zp = buf; - - ASSERT(size >= sizeof (znode_phys_t)); - - zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); - zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); - zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); - zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); - zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); - zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); - zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); - zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); - zp->zp_gen = BSWAP_64(zp->zp_gen); - zp->zp_mode = BSWAP_64(zp->zp_mode); - zp->zp_size = BSWAP_64(zp->zp_size); - zp->zp_parent = BSWAP_64(zp->zp_parent); - zp->zp_links = BSWAP_64(zp->zp_links); - zp->zp_xattr = BSWAP_64(zp->zp_xattr); - zp->zp_rdev = BSWAP_64(zp->zp_rdev); - zp->zp_flags = BSWAP_64(zp->zp_flags); - zp->zp_uid = BSWAP_64(zp->zp_uid); - zp->zp_gid = BSWAP_64(zp->zp_gid); - zp->zp_zap = BSWAP_64(zp->zp_zap); - zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); - zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); - zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); - - zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); - zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size); - zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); - zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count); - if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { - zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], - ZFS_ACE_SPACE); - } else { - zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], - ACE_SLOT_CNT); - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ /dev/null @@ -1,1364 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. - */ - -/* - * ZFS control directory (a.k.a. ".zfs") - * - * This directory provides a common location for all ZFS meta-objects. - * Currently, this is only the 'snapshot' directory, but this may expand in the - * future. 
The elements are built using the GFS primitives, as the hierarchy - * does not actually exist on disk. - * - * For 'snapshot', we don't want to have all snapshots always mounted, because - * this would take up a huge amount of space in /etc/mnttab. We have three - * types of objects: - * - * ctldir ------> snapshotdir -------> snapshot - * | - * | - * V - * mounted fs - * - * The 'snapshot' node contains just enough information to lookup '..' and act - * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we - * perform an automount of the underlying filesystem and return the - * corresponding vnode. - * - * All mounts are handled automatically by the kernel, but unmounts are - * (currently) handled from user land. The main reason is that there is no - * reliable way to auto-unmount the filesystem when it's "no longer in use". - * When the user unmounts a filesystem, we call zfsctl_unmount(), which - * unmounts any snapshots within the snapshot directory. - * - * The '.zfs', '.zfs/snapshot', and all directories created under - * '.zfs/snapshot' (ie: '.zfs/snapshot/') are all GFS nodes and - * share the same vfs_t as the head filesystem (what '.zfs' lives under). - * - * File systems mounted ontop of the GFS nodes '.zfs/snapshot/' - * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. - * However, vnodes within these mounted on file systems have their v_vfsp - * fields set to the head filesystem to make NFS happy (see - * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t - * so that it cannot be freed until all snapshots have been unmounted. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_namecheck.h" - -/* Common access mode for all virtual directories under the ctldir */ -const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | - S_IROTH | S_IXOTH; - -/* - * "Synthetic" filesystem implementation. - */ - -/* - * Assert that A implies B. - */ -#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); - -static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); - -typedef struct sfs_node { - char sn_name[ZFS_MAX_DATASET_NAME_LEN]; - uint64_t sn_parent_id; - uint64_t sn_id; -} sfs_node_t; - -/* - * Check the parent's ID as well as the node's to account for a chance - * that IDs originating from different domains (snapshot IDs, artifical - * IDs, znode IDs) may clash. - */ -static int -sfs_compare_ids(struct vnode *vp, void *arg) -{ - sfs_node_t *n1 = vp->v_data; - sfs_node_t *n2 = arg; - bool equal; - - equal = n1->sn_id == n2->sn_id && - n1->sn_parent_id == n2->sn_parent_id; - - /* Zero means equality. 
*/ - return (!equal); -} - -static int -sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id, - uint64_t id, struct vnode **vpp) -{ - sfs_node_t search; - int err; - - search.sn_id = id; - search.sn_parent_id = parent_id; - err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp, - sfs_compare_ids, &search); - return (err); -} - -static int -sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id, - uint64_t id, struct vnode **vpp) -{ - int err; - - KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data")); - err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp, - sfs_compare_ids, vp->v_data); - return (err); -} - -static void -sfs_vnode_remove(struct vnode *vp) -{ - vfs_hash_remove(vp); -} - -typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg); - -static int -sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, - const char *tag, struct vop_vector *vops, - sfs_vnode_setup_fn setup, void *arg, - struct vnode **vpp) -{ - struct vnode *vp; - int error; - - error = sfs_vnode_get(mp, flags, parent_id, id, vpp); - if (error != 0 || *vpp != NULL) { - KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, - "sfs vnode with no data"); - return (error); - } - - /* Allocate a new vnode/inode. */ - error = getnewvnode(tag, mp, vops, &vp); - if (error != 0) { - *vpp = NULL; - return (error); - } - - /* - * Exclusively lock the vnode vnode while it's being constructed. - */ - lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); - error = insmntque(vp, mp); - if (error != 0) { - *vpp = NULL; - return (error); - } - - setup(vp, arg); - - error = sfs_vnode_insert(vp, flags, parent_id, id, vpp); - if (error != 0 || *vpp != NULL) { - KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, - "sfs vnode with no data"); - return (error); - } - - *vpp = vp; - return (0); -} - -static void -sfs_print_node(sfs_node_t *node) -{ - printf("\tname = %s\n", node->sn_name); - printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id); - printf("\tid = %ju\n", (uintmax_t)node->sn_id); -} - -static sfs_node_t * -sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id) -{ - struct sfs_node *node; - - KASSERT(strlen(name) < sizeof(node->sn_name), - ("sfs node name is too long")); - KASSERT(size >= sizeof(*node), ("sfs node size is too small")); - node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO); - strlcpy(node->sn_name, name, sizeof(node->sn_name)); - node->sn_parent_id = parent_id; - node->sn_id = id; - - return (node); -} - -static void -sfs_destroy_node(sfs_node_t *node) -{ - free(node, M_SFSNODES); -} - -static void * -sfs_reclaim_vnode(vnode_t *vp) -{ - sfs_node_t *node; - void *data; - - sfs_vnode_remove(vp); - data = vp->v_data; - vp->v_data = NULL; - return (data); -} - -static int -sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, - uio_t *uio, off_t *offp) -{ - struct dirent entry; - int error; - - /* Reset ncookies for subsequent use of vfs_read_dirent. 
*/ - if (ap->a_ncookies != NULL) - *ap->a_ncookies = 0; - - if (uio->uio_resid < sizeof(entry)) - return (SET_ERROR(EINVAL)); - - if (uio->uio_offset < 0) - return (SET_ERROR(EINVAL)); - if (uio->uio_offset == 0) { - entry.d_fileno = id; - entry.d_type = DT_DIR; - entry.d_name[0] = '.'; - entry.d_namlen = 1; - entry.d_reclen = sizeof(entry); - dirent_terminate(&entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); - if (error != 0) - return (SET_ERROR(error)); - } - - if (uio->uio_offset < sizeof(entry)) - return (SET_ERROR(EINVAL)); - if (uio->uio_offset == sizeof(entry)) { - entry.d_fileno = parent_id; - entry.d_type = DT_DIR; - entry.d_name[0] = '.'; - entry.d_name[1] = '.'; - entry.d_namlen = 2; - entry.d_reclen = sizeof(entry); - dirent_terminate(&entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); - if (error != 0) - return (SET_ERROR(error)); - } - - if (offp != NULL) - *offp = 2 * sizeof(entry); - return (0); -} - - -/* - * .zfs inode namespace - * - * We need to generate unique inode numbers for all files and directories - * within the .zfs pseudo-filesystem. We use the following scheme: - * - * ENTRY ZFSCTL_INODE - * .zfs 1 - * .zfs/snapshot 2 - * .zfs/snapshot/ objectid(snap) - */ -#define ZFSCTL_INO_SNAP(id) (id) - -static struct vop_vector zfsctl_ops_root; -static struct vop_vector zfsctl_ops_snapdir; -static struct vop_vector zfsctl_ops_snapshot; -static struct vop_vector zfsctl_ops_shares_dir; - -void -zfsctl_init(void) -{ -} - -void -zfsctl_fini(void) -{ -} - -boolean_t -zfsctl_is_node(vnode_t *vp) -{ - return (vn_matchops(vp, zfsctl_ops_root) || - vn_matchops(vp, zfsctl_ops_snapdir) || - vn_matchops(vp, zfsctl_ops_snapshot) || - vn_matchops(vp, zfsctl_ops_shares_dir)); - -} - -typedef struct zfsctl_root { - sfs_node_t node; - sfs_node_t *snapdir; - timestruc_t cmtime; -} zfsctl_root_t; - - -/* - * Create the '.zfs' directory. - */ -void -zfsctl_create(zfsvfs_t *zfsvfs) -{ - zfsctl_root_t *dot_zfs; - sfs_node_t *snapdir; - vnode_t *rvp; - uint64_t crtime[2]; - - ASSERT(zfsvfs->z_ctldir == NULL); - - snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT, - ZFSCTL_INO_SNAPDIR); - dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0, - ZFSCTL_INO_ROOT); - dot_zfs->snapdir = snapdir; - - VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0); - VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), - &crtime, sizeof(crtime))); - ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime); - vput(rvp); - - zfsvfs->z_ctldir = dot_zfs; -} - -/* - * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. - * The nodes must not have any associated vnodes by now as they should be - * vflush-ed. - */ -void -zfsctl_destroy(zfsvfs_t *zfsvfs) -{ - sfs_destroy_node(zfsvfs->z_ctldir->snapdir); - sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir); - zfsvfs->z_ctldir = NULL; -} - -static int -zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags, - struct vnode **vpp) -{ - return (VFS_ROOT(mp, flags, vpp)); -} - -static void -zfsctl_common_vnode_setup(vnode_t *vp, void *arg) -{ - ASSERT_VOP_ELOCKED(vp, __func__); - - /* We support shared locking. 
*/ - VN_LOCK_ASHARE(vp); - vp->v_type = VDIR; - vp->v_data = arg; -} - -static int -zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags, - struct vnode **vpp) -{ - void *node; - int err; - - node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir; - err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root, - zfsctl_common_vnode_setup, node, vpp); - return (err); -} - -static int -zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags, - struct vnode **vpp) -{ - void *node; - int err; - - node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir; - err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs", - &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp); - return (err); -} - -/* - * Given a root znode, retrieve the associated .zfs directory. - * Add a hold to the vnode and return it. - */ -int -zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp) -{ - vnode_t *vp; - int error; - - error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp); - return (error); -} - -/* - * Common open routine. Disallow any write access. - */ -static int -zfsctl_common_open(struct vop_open_args *ap) -{ - int flags = ap->a_mode; - - if (flags & FWRITE) - return (SET_ERROR(EACCES)); - - return (0); -} - -/* - * Common close routine. Nothing to do here. - */ -/* ARGSUSED */ -static int -zfsctl_common_close(struct vop_close_args *ap) -{ - return (0); -} - -/* - * Common access routine. Disallow writes. - */ -static int -zfsctl_common_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - accmode_t a_accmode; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - accmode_t accmode = ap->a_accmode; - - if (accmode & VWRITE) - return (SET_ERROR(EACCES)); - return (0); -} - -/* - * Common getattr function. Fill in basic information. - */ -static void -zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) -{ - timestruc_t now; - sfs_node_t *node; - - node = vp->v_data; - - vap->va_uid = 0; - vap->va_gid = 0; - vap->va_rdev = 0; - /* - * We are a purely virtual object, so we have no - * blocksize or allocated blocks. - */ - vap->va_blksize = 0; - vap->va_nblocks = 0; - vap->va_seq = 0; - vn_fsid(vp, vap); - vap->va_mode = zfsctl_ctldir_mode; - vap->va_type = VDIR; - /* - * We live in the now (for atime). - */ - gethrestime(&now); - vap->va_atime = now; - /* FreeBSD: Reset chflags(2) flags. */ - vap->va_flags = 0; - - vap->va_nodeid = node->sn_id; - - /* At least '.' and '..'. */ - vap->va_nlink = 2; -} - -static int -zfsctl_common_fid(ap) - struct vop_fid_args /* { - struct vnode *a_vp; - struct fid *a_fid; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - fid_t *fidp = (void *)ap->a_fid; - sfs_node_t *node = vp->v_data; - uint64_t object = node->sn_id; - zfid_short_t *zfid; - int i; - - zfid = (zfid_short_t *)fidp; - zfid->zf_len = SHORT_FID_LEN; - - for (i = 0; i < sizeof(zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* .zfs nodes always have a generation number of 0 */ - for (i = 0; i < sizeof(zfid->zf_gen); i++) - zfid->zf_gen[i] = 0; - - return (0); -} - -static int -zfsctl_common_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - - (void) sfs_reclaim_vnode(vp); - return (0); -} - -static int -zfsctl_common_print(ap) - struct vop_print_args /* { - struct vnode *a_vp; - } */ *ap; -{ - sfs_print_node(ap->a_vp->v_data); - return (0); -} - -/* - * Get root directory attributes. 
- */ -static int -zfsctl_root_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct vattr *vap = ap->a_vap; - zfsctl_root_t *node = vp->v_data; - - zfsctl_common_getattr(vp, vap); - vap->va_ctime = node->cmtime; - vap->va_mtime = vap->va_ctime; - vap->va_birthtime = vap->va_ctime; - vap->va_nlink += 1; /* snapdir */ - vap->va_size = vap->va_nlink; - return (0); -} - -/* - * When we lookup "." we still can be asked to lock it - * differently, can't we? - */ -int -zfsctl_relock_dot(vnode_t *dvp, int ltype) -{ - vref(dvp); - if (ltype != VOP_ISLOCKED(dvp)) { - if (ltype == LK_EXCLUSIVE) - vn_lock(dvp, LK_UPGRADE | LK_RETRY); - else /* if (ltype == LK_SHARED) */ - vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); - - /* Relock for the "." case may left us with reclaimed vnode. */ - if (VN_IS_DOOMED(dvp)) { - vrele(dvp); - return (SET_ERROR(ENOENT)); - } - } - return (0); -} - -/* - * Special case the handling of "..". - */ -int -zfsctl_root_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - vnode_t *dvp = ap->a_dvp; - vnode_t **vpp = ap->a_vpp; - cred_t *cr = ap->a_cnp->cn_cred; - int flags = ap->a_cnp->cn_flags; - int lkflags = ap->a_cnp->cn_lkflags; - int nameiop = ap->a_cnp->cn_nameiop; - int err; - int ltype; - - ASSERT(dvp->v_type == VDIR); - - if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) - return (SET_ERROR(ENOTSUP)); - - if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { - err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); - if (err == 0) - *vpp = dvp; - } else if ((flags & ISDOTDOT) != 0) { - err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, - lkflags, vpp); - } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { - err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); - } else { - err = SET_ERROR(ENOENT); - } - if (err != 0) - *vpp = NULL; - return (err); -} - -static int -zfsctl_root_readdir(ap) - struct vop_readdir_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - int *a_eofflag; - int *ncookies; - u_long **a_cookies; - } */ *ap; -{ - struct dirent entry; - vnode_t *vp = ap->a_vp; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - zfsctl_root_t *node = vp->v_data; - uio_t *uio = ap->a_uio; - int *eofp = ap->a_eofflag; - off_t dots_offset; - int error; - - ASSERT(vp->v_type == VDIR); - - error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio, - &dots_offset); - if (error != 0) { - if (error == ENAMETOOLONG) /* ran out of destination space */ - error = 0; - return (error); - } - if (uio->uio_offset != dots_offset) - return (SET_ERROR(EINVAL)); - - CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name)); - entry.d_fileno = node->snapdir->sn_id; - entry.d_type = DT_DIR; - strcpy(entry.d_name, node->snapdir->sn_name); - entry.d_namlen = strlen(entry.d_name); - entry.d_reclen = sizeof(entry); - dirent_terminate(&entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); - if (error != 0) { - if (error == ENAMETOOLONG) - error = 0; - return (SET_ERROR(error)); - } - if (eofp != NULL) - *eofp = 1; - return (0); -} - -static int -zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) -{ - static const char dotzfs_name[4] = ".zfs"; - vnode_t *dvp; - int error; - - if (*ap->a_buflen < sizeof (dotzfs_name)) - return (SET_ERROR(ENOMEM)); - - error = vn_vget_ino_gen(ap->a_vp, 
zfsctl_fs_root_vnode, NULL, - LK_SHARED, &dvp); - if (error != 0) - return (SET_ERROR(error)); - - VOP_UNLOCK(dvp); - *ap->a_vpp = dvp; - *ap->a_buflen -= sizeof (dotzfs_name); - bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); - return (0); -} - -static int -zfsctl_common_pathconf(ap) - struct vop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - int *a_retval; - } */ *ap; -{ - /* - * We care about ACL variables so that user land utilities like ls - * can display them correctly. Since the ctldir's st_dev is set to be - * the same as the parent dataset, we must support all variables that - * it supports. - */ - switch (ap->a_name) { - case _PC_LINK_MAX: - *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX); - return (0); - - case _PC_FILESIZEBITS: - *ap->a_retval = 64; - return (0); - - case _PC_MIN_HOLE_SIZE: - *ap->a_retval = (int)SPA_MINBLOCKSIZE; - return (0); - - case _PC_ACL_NFS4: - *ap->a_retval = 1; - return (0); - - case _PC_ACL_PATH_MAX: - *ap->a_retval = ACL_MAX_ENTRIES; - return (0); - - case _PC_NAME_MAX: - *ap->a_retval = NAME_MAX; - return (0); - - default: - return (vop_stdpathconf(ap)); - } -} - -/** - * Returns a trivial ACL - */ -int -zfsctl_common_getacl(ap) - struct vop_getacl_args /* { - struct vnode *vp; - acl_type_t a_type; - struct acl *a_aclp; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - int i; - - if (ap->a_type != ACL_TYPE_NFS4) - return (EINVAL); - - acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0); - /* - * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify - * attributes. That is not the case for the ctldir, so we must clear - * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs - * aren't supported by the ctldir. - */ - for (i = 0; i < ap->a_aclp->acl_cnt; i++) { - struct acl_entry *entry; - entry = &(ap->a_aclp->acl_entry[i]); - uint32_t old_perm = entry->ae_perm; - entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | - ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS | - ACL_READ_NAMED_ATTRS ); - } - - return (0); -} - -static struct vop_vector zfsctl_ops_root = { - .vop_default = &default_vnodeops, - .vop_open = zfsctl_common_open, - .vop_close = zfsctl_common_close, - .vop_ioctl = VOP_EINVAL, - .vop_getattr = zfsctl_root_getattr, - .vop_access = zfsctl_common_access, - .vop_readdir = zfsctl_root_readdir, - .vop_lookup = zfsctl_root_lookup, - .vop_inactive = VOP_NULL, - .vop_reclaim = zfsctl_common_reclaim, - .vop_fid = zfsctl_common_fid, - .vop_print = zfsctl_common_print, - .vop_vptocnp = zfsctl_root_vptocnp, - .vop_pathconf = zfsctl_common_pathconf, - .vop_getacl = zfsctl_common_getacl, -}; -VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root); - -static int -zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) -{ - objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; - - dmu_objset_name(os, zname); - if (strlen(zname) + 1 + strlen(name) >= len) - return (SET_ERROR(ENAMETOOLONG)); - (void) strcat(zname, "@"); - (void) strcat(zname, name); - return (0); -} - -static int -zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) -{ - objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; - int err; - - err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); - return (err); -} - -/* - * Given a vnode get a root vnode of a filesystem mounted on top of - * the vnode, if any. The root vnode is referenced and locked. - * If no filesystem is mounted then the orinal vnode remains referenced - * and locked. 
If any error happens the orinal vnode is unlocked and - * released. - */ -static int -zfsctl_mounted_here(vnode_t **vpp, int flags) -{ - struct mount *mp; - int err; - - ASSERT_VOP_LOCKED(*vpp, __func__); - ASSERT3S((*vpp)->v_type, ==, VDIR); - - if ((mp = (*vpp)->v_mountedhere) != NULL) { - err = vfs_busy(mp, 0); - KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err)); - KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint")); - vput(*vpp); - err = VFS_ROOT(mp, flags, vpp); - vfs_unbusy(mp); - return (err); - } - return (EJUSTRETURN); -} - -typedef struct { - const char *snap_name; - uint64_t snap_id; -} snapshot_setup_arg_t; - -static void -zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg) -{ - snapshot_setup_arg_t *ssa = arg; - sfs_node_t *node; - - ASSERT_VOP_ELOCKED(vp, __func__); - - node = sfs_alloc_node(sizeof(sfs_node_t), - ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id); - zfsctl_common_vnode_setup(vp, node); - - /* We have to support recursive locking. */ - VN_LOCK_AREC(vp); -} - -/* - * Lookup entry point for the 'snapshot' directory. Try to open the - * snapshot if it exist, creating the pseudo filesystem vnode as necessary. - * Perform a mount of the associated dataset on top of the vnode. - * There are four possibilities: - * - the snapshot node and vnode do not exist - * - the snapshot vnode is covered by the mounted snapshot - * - the snapshot vnode is not covered yet, the mount operation is in progress - * - the snapshot vnode is not covered, because the snapshot has been unmounted - * The last two states are transient and should be relatively short-lived. - */ -int -zfsctl_snapdir_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - vnode_t *dvp = ap->a_dvp; - vnode_t **vpp = ap->a_vpp; - struct componentname *cnp = ap->a_cnp; - char name[NAME_MAX + 1]; - char fullname[ZFS_MAX_DATASET_NAME_LEN]; - char *mountpoint; - size_t mountpoint_len; - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - uint64_t snap_id; - int nameiop = cnp->cn_nameiop; - int lkflags = cnp->cn_lkflags; - int flags = cnp->cn_flags; - int err; - - ASSERT(dvp->v_type == VDIR); - - if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) - return (SET_ERROR(ENOTSUP)); - - if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { - err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); - if (err == 0) - *vpp = dvp; - return (err); - } - if (flags & ISDOTDOT) { - err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, - vpp); - return (err); - } - - if (cnp->cn_namelen >= sizeof(name)) - return (SET_ERROR(ENAMETOOLONG)); - - strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); - err = zfsctl_snapshot_lookup(dvp, name, &snap_id); - if (err != 0) - return (SET_ERROR(ENOENT)); - - for (;;) { - snapshot_setup_arg_t ssa; - - ssa.snap_name = name; - ssa.snap_id = snap_id; - err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, - snap_id, "zfs", &zfsctl_ops_snapshot, - zfsctl_snapshot_vnode_setup, &ssa, vpp); - if (err != 0) - return (err); - - /* Check if a new vnode has just been created. */ - if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) - break; - - /* - * Check if a snapshot is already mounted on top of the vnode. - */ - err = zfsctl_mounted_here(vpp, lkflags); - if (err != EJUSTRETURN) - return (err); - - /* - * If the vnode is not covered, then either the mount operation - * is in progress or the snapshot has already been unmounted - * but the vnode hasn't been inactivated and reclaimed yet. 
- * We can try to re-use the vnode in the latter case. - */ - VI_LOCK(*vpp); - if (((*vpp)->v_iflag & VI_MOUNT) == 0) { - /* Upgrade to exclusive lock in order to: - * - avoid race conditions - * - satisfy the contract of mount_snapshot() - */ - err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); - if (err == 0) - break; - } else { - VI_UNLOCK(*vpp); - } - - /* - * In this state we can loop on uncontested locks and starve - * the thread doing the lengthy, non-trivial mount operation. - * So, yield to prevent that from happening. - */ - vput(*vpp); - kern_yield(PRI_USER); - } - - VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname)); - - mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + - strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; - mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); - (void) snprintf(mountpoint, mountpoint_len, - "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", - dvp->v_vfsp->mnt_stat.f_mntonname, name); - - err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); - kmem_free(mountpoint, mountpoint_len); - if (err == 0) { - /* - * Fix up the root vnode mounted on .zfs/snapshot/. - * - * This is where we lie about our v_vfsp in order to - * make .zfs/snapshot/ accessible over NFS - * without requiring manual mounts of . - */ - ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); - VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; - - /* Clear the root flag (set via VFS_ROOT) as well. */ - (*vpp)->v_vflag &= ~VV_ROOT; - } - - if (err != 0) - *vpp = NULL; - return (err); -} - -static int -zfsctl_snapdir_readdir(ap) - struct vop_readdir_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - int *a_eofflag; - int *ncookies; - u_long **a_cookies; - } */ *ap; -{ - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - struct dirent entry; - vnode_t *vp = ap->a_vp; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - uio_t *uio = ap->a_uio; - int *eofp = ap->a_eofflag; - off_t dots_offset; - int error; - - ASSERT(vp->v_type == VDIR); - - error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio, - &dots_offset); - if (error != 0) { - if (error == ENAMETOOLONG) /* ran out of destination space */ - error = 0; - return (error); - } - - ZFS_ENTER(zfsvfs); - for (;;) { - uint64_t cookie; - uint64_t id; - - cookie = uio->uio_offset - dots_offset; - - dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); - error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), - snapname, &id, &cookie, NULL); - dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); - if (error != 0) { - if (error == ENOENT) { - if (eofp != NULL) - *eofp = 1; - error = 0; - } - ZFS_EXIT(zfsvfs); - return (error); - } - - entry.d_fileno = id; - entry.d_type = DT_DIR; - strcpy(entry.d_name, snapname); - entry.d_namlen = strlen(entry.d_name); - entry.d_reclen = sizeof(entry); - /* NOTE: d_off is the offset for the *next* entry. 
*/ - entry.d_off = cookie + dots_offset; - dirent_terminate(&entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); - if (error != 0) { - if (error == ENAMETOOLONG) - error = 0; - ZFS_EXIT(zfsvfs); - return (SET_ERROR(error)); - } - uio->uio_offset = cookie + dots_offset; - } - /* NOTREACHED */ -} - -static int -zfsctl_snapdir_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - vattr_t *vap = ap->a_vap; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - dsl_dataset_t *ds; - sfs_node_t *node = vp->v_data; - uint64_t snap_count; - int err; - - ZFS_ENTER(zfsvfs); - ds = dmu_objset_ds(zfsvfs->z_os); - zfsctl_common_getattr(vp, vap); - vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); - vap->va_mtime = vap->va_ctime; - vap->va_birthtime = vap->va_ctime; - if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { - err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); - if (err != 0) { - ZFS_EXIT(zfsvfs); - return (err); - } - vap->va_nlink += snap_count; - } - vap->va_size = vap->va_nlink; - - ZFS_EXIT(zfsvfs); - return (0); -} - -static struct vop_vector zfsctl_ops_snapdir = { - .vop_default = &default_vnodeops, - .vop_open = zfsctl_common_open, - .vop_close = zfsctl_common_close, - .vop_getattr = zfsctl_snapdir_getattr, - .vop_access = zfsctl_common_access, - .vop_readdir = zfsctl_snapdir_readdir, - .vop_lookup = zfsctl_snapdir_lookup, - .vop_reclaim = zfsctl_common_reclaim, - .vop_fid = zfsctl_common_fid, - .vop_print = zfsctl_common_print, - .vop_pathconf = zfsctl_common_pathconf, - .vop_getacl = zfsctl_common_getacl, -}; -VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir); - -static int -zfsctl_snapshot_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - - VERIFY(vrecycle(vp) == 1); - return (0); -} - -static int -zfsctl_snapshot_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - void *data = vp->v_data; - - sfs_reclaim_vnode(vp); - sfs_destroy_node(data); - return (0); -} - -static int -zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) -{ - struct mount *mp; - vnode_t *dvp; - vnode_t *vp; - sfs_node_t *node; - size_t len; - enum vgetstate vs; - int locked; - int error; - - vp = ap->a_vp; - node = vp->v_data; - len = strlen(node->sn_name); - if (*ap->a_buflen < len) - return (SET_ERROR(ENOMEM)); - - /* - * Prevent unmounting of the snapshot while the vnode lock - * is not held. That is not strictly required, but allows - * us to assert that an uncovered snapshot vnode is never - * "leaked". - */ - mp = vp->v_mountedhere; - if (mp == NULL) - return (SET_ERROR(ENOENT)); - error = vfs_busy(mp, 0); - KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error)); - - /* - * We can vput the vnode as we can now depend on the reference owned - * by the busied mp. But we also need to hold the vnode, because - * the reference may go after vfs_unbusy() which has to be called - * before we can lock the vnode again. - */ - locked = VOP_ISLOCKED(vp); - vs = vget_prep(vp); - vput(vp); - - /* Look up .zfs/snapshot, our parent. 
*/ - error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp); - if (error == 0) { - VOP_UNLOCK(dvp); - *ap->a_vpp = dvp; - *ap->a_buflen -= len; - bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len); - } - vfs_unbusy(mp); - vget_finish(vp, locked | LK_RETRY, vs); - return (error); -} - -/* - * These VP's should never see the light of day. They should always - * be covered. - */ -static struct vop_vector zfsctl_ops_snapshot = { - .vop_default = NULL, /* ensure very restricted access */ - .vop_inactive = zfsctl_snapshot_inactive, - .vop_need_inactive = vop_stdneed_inactive, - .vop_reclaim = zfsctl_snapshot_reclaim, - .vop_vptocnp = zfsctl_snapshot_vptocnp, - .vop_lock1 = vop_stdlock, - .vop_unlock = vop_stdunlock, - .vop_islocked = vop_stdislocked, - .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */ - .vop_print = zfsctl_common_print, -}; -VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot); - -int -zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) -{ - struct mount *mp; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - vnode_t *vp; - int error; - - ASSERT(zfsvfs->z_ctldir != NULL); - *zfsvfsp = NULL; - error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, - ZFSCTL_INO_SNAPDIR, objsetid, &vp); - if (error == 0 && vp != NULL) { - /* - * XXX Probably need to at least reference, if not busy, the mp. - */ - if (vp->v_mountedhere != NULL) - *zfsvfsp = vp->v_mountedhere->mnt_data; - vput(vp); - } - if (*zfsvfsp == NULL) - return (SET_ERROR(EINVAL)); - return (0); -} - -/* - * Unmount any snapshots for the given filesystem. This is called from - * zfs_umount() - if we have a ctldir, then go through and unmount all the - * snapshots. - */ -int -zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) -{ - char snapname[ZFS_MAX_DATASET_NAME_LEN]; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - struct mount *mp; - vnode_t *dvp; - vnode_t *vp; - sfs_node_t *node; - sfs_node_t *snap; - uint64_t cookie; - int error; - - ASSERT(zfsvfs->z_ctldir != NULL); - - cookie = 0; - for (;;) { - uint64_t id; - - dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); - error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), - snapname, &id, &cookie, NULL); - dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); - if (error != 0) { - if (error == ENOENT) - error = 0; - break; - } - - for (;;) { - error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, - ZFSCTL_INO_SNAPDIR, id, &vp); - if (error != 0 || vp == NULL) - break; - - mp = vp->v_mountedhere; - - /* - * v_mountedhere being NULL means that the - * (uncovered) vnode is in a transient state - * (mounting or unmounting), so loop until it - * settles down. - */ - if (mp != NULL) - break; - vput(vp); - } - if (error != 0) - break; - if (vp == NULL) - continue; /* no mountpoint, nothing to do */ - - /* - * The mount-point vnode is kept locked to avoid spurious EBUSY - * from a concurrent umount. - * The vnode lock must have recursive locking enabled. 
- */ - vfs_ref(mp); - error = dounmount(mp, fflags, curthread); - KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1, - ("extra references after unmount")); - vput(vp); - if (error != 0) - break; - } - KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0, - ("force unmounting failed")); - return (error); -} - Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - */ - -#include - -list_t zfs_dbgmsgs; -int zfs_dbgmsg_size; -kmutex_t zfs_dbgmsgs_lock; -int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ - -void -zfs_dbgmsg_init(void) -{ - list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), - offsetof(zfs_dbgmsg_t, zdm_node)); - mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); -} - -void -zfs_dbgmsg_fini(void) -{ - zfs_dbgmsg_t *zdm; - - while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { - int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); - kmem_free(zdm, size); - zfs_dbgmsg_size -= size; - } - mutex_destroy(&zfs_dbgmsgs_lock); - ASSERT0(zfs_dbgmsg_size); -} - -/* - * Print these messages by running: - * echo ::zfs_dbgmsg | mdb -k - * - * Monitor these messages by running: - * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' - * - * When used with libzpool, monitor with: - * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' - */ -void -zfs_dbgmsg(const char *fmt, ...) -{ - int size; - va_list adx; - zfs_dbgmsg_t *zdm; - - va_start(adx, fmt); - size = vsnprintf(NULL, 0, fmt, adx); - va_end(adx); - - /* - * There is one byte of string in sizeof (zfs_dbgmsg_t), used - * for the terminating null. 
- */ - zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); - zdm->zdm_timestamp = gethrestime_sec(); - - va_start(adx, fmt); - (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); - va_end(adx); - - DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); - - mutex_enter(&zfs_dbgmsgs_lock); - list_insert_tail(&zfs_dbgmsgs, zdm); - zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; - while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { - zdm = list_remove_head(&zfs_dbgmsgs); - size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); - kmem_free(zdm, size); - zfs_dbgmsg_size -= size; - } - mutex_exit(&zfs_dbgmsgs_lock); -} - -void -zfs_dbgmsg_print(const char *tag) -{ - zfs_dbgmsg_t *zdm; - - (void) printf("ZFS_DBGMSG(%s):\n", tag); - mutex_enter(&zfs_dbgmsgs_lock); - for (zdm = list_head(&zfs_dbgmsgs); zdm; - zdm = list_next(&zfs_dbgmsgs, zdm)) - (void) printf("%s\n", zdm->zdm_msg); - mutex_exit(&zfs_dbgmsgs_lock); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ /dev/null @@ -1,968 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups - * of names after deciding which is the appropriate lookup interface. - */ -static int -zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, - matchtype_t mt, uint64_t *zoid) -{ - int error; - - if (zfsvfs->z_norm) { - - /* - * In the non-mixed case we only expect there would ever - * be one match, but we need to use the normalizing lookup. - */ - error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, - zoid, mt, NULL, 0, NULL); - } else { - error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); - } - *zoid = ZFS_DIRENT_OBJ(*zoid); - - return (error); -} - -/* - * Look up a directory entry under a locked vnode. - * dvp being locked gives us a guarantee that there are no concurrent - * modification of the directory and, thus, if a node can be found in - * the directory, then it must not be unlinked. 
- * - * Input arguments: - * dzp - znode for directory - * name - name of entry to lock - * flag - ZNEW: if the entry already exists, fail with EEXIST. - * ZEXISTS: if the entry does not exist, fail with ENOENT. - * ZXATTR: we want dzp's xattr directory - * - * Output arguments: - * zpp - pointer to the znode for the entry (NULL if there isn't one) - * - * Return value: 0 on success or errno on failure. - * - * NOTE: Always checks for, and rejects, '.' and '..'. - */ -int -zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) -{ - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - matchtype_t mt = 0; - uint64_t zoid; - vnode_t *vp = NULL; - int error = 0; - - ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - - *zpp = NULL; - - /* - * Verify that we are not trying to lock '.', '..', or '.zfs' - */ - if (name[0] == '.' && - (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || - zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) - return (SET_ERROR(EEXIST)); - - /* - * Case sensitivity and normalization preferences are set when - * the file system is created. These are stored in the - * zfsvfs->z_case and zfsvfs->z_norm fields. These choices - * affect how we perform zap lookups. - * - * When matching we may need to normalize & change case according to - * FS settings. - * - * Note that a normalized match is necessary for a case insensitive - * filesystem when the lookup request is not exact because normalization - * can fold case independent of normalizing code point sequences. - * - * See the table above zfs_dropname(). - */ - if (zfsvfs->z_norm != 0) { - mt = MT_NORMALIZE; - - /* - * Determine if the match needs to honor the case specified in - * lookup, and if so keep track of that so that during - * normalization we don't fold case. - */ - if (zfsvfs->z_case == ZFS_CASE_MIXED) { - mt |= MT_MATCH_CASE; - } - } - - /* - * Only look in or update the DNLC if we are looking for the - * name on a file system that does not require normalization - * or case folding. We can also look there if we happen to be - * on a non-normalizing, mixed sensitivity file system IF we - * are looking for the exact name. - * - * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE - * because in that case MT_EXACT and MT_FIRST should produce exactly - * the same result. - */ - - if (dzp->z_unlinked && !(flag & ZXATTR)) - return (ENOENT); - if (flag & ZXATTR) { - error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, - sizeof (zoid)); - if (error == 0) - error = (zoid == 0 ? 
ENOENT : 0); - } else { - error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid); - } - if (error) { - if (error != ENOENT || (flag & ZEXISTS)) { - return (error); - } - } else { - if (flag & ZNEW) { - return (SET_ERROR(EEXIST)); - } - error = zfs_zget(zfsvfs, zoid, zpp); - if (error) - return (error); - ASSERT(!(*zpp)->z_unlinked); - } - - return (0); -} - -static int -zfs_dd_lookup(znode_t *dzp, znode_t **zpp) -{ - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - znode_t *zp; - uint64_t parent; - int error; - - ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); - - if (dzp->z_unlinked) - return (ENOENT); - - if ((error = sa_lookup(dzp->z_sa_hdl, - SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) - return (error); - - error = zfs_zget(zfsvfs, parent, &zp); - if (error == 0) - *zpp = zp; - return (error); -} - -int -zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) -{ - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - znode_t *zp; - int error = 0; - - ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); - - if (dzp->z_unlinked) - return (SET_ERROR(ENOENT)); - - if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - *zpp = dzp; - } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { - error = zfs_dd_lookup(dzp, zpp); - } else { - error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); - if (error == 0) { - dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ - *zpp = zp; - } - } - return (error); -} - -/* - * unlinked Set (formerly known as the "delete queue") Error Handling - * - * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we - * don't specify the name of the entry that we will be manipulating. We - * also fib and say that we won't be adding any new entries to the - * unlinked set, even though we might (this is to lower the minimum file - * size that can be deleted in a full filesystem). So on the small - * chance that the nlink list is using a fat zap (ie. has more than - * 2000 entries), we *may* not pre-read a block that's needed. - * Therefore it is remotely possible for some of the assertions - * regarding the unlinked set below to fail due to i/o error. On a - * nondebug system, this will result in the space being leaked. - */ -void -zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ASSERT(zp->z_unlinked); - ASSERT(zp->z_links == 0); - - VERIFY3U(0, ==, - zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); -} - -/* - * Clean up any znodes that had no links when we either crashed or - * (force) umounted the file system. - */ -void -zfs_unlinked_drain(zfsvfs_t *zfsvfs) -{ - zap_cursor_t zc; - zap_attribute_t zap; - dmu_object_info_t doi; - znode_t *zp; - dmu_tx_t *tx; - int error; - - /* - * Interate over the contents of the unlinked set. - */ - for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); - zap_cursor_retrieve(&zc, &zap) == 0; - zap_cursor_advance(&zc)) { - - /* - * See what kind of object we have in list - */ - - error = dmu_object_info(zfsvfs->z_os, - zap.za_first_integer, &doi); - if (error != 0) - continue; - - ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || - (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); - /* - * We need to re-mark these list entries for deletion, - * so we pull them back into core and set zp->z_unlinked. - */ - error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); - - /* - * We may pick up znodes that are already marked for deletion. 
- * This could happen during the purge of an extended attribute - * directory. All we need to do is skip over them, since they - * are already in the system marked z_unlinked. - */ - if (error != 0) - continue; - - vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); -#if defined(__FreeBSD__) - /* - * Due to changes in zfs_rmnode we need to make sure the - * link count is set to zero here. - */ - if (zp->z_links != 0) { - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - vput(ZTOV(zp)); - continue; - } - zp->z_links = 0; - VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &zp->z_links, sizeof (zp->z_links), tx)); - dmu_tx_commit(tx); - } -#endif - zp->z_unlinked = B_TRUE; - vput(ZTOV(zp)); - } - zap_cursor_fini(&zc); -} - -/* - * Delete the entire contents of a directory. Return a count - * of the number of entries that could not be deleted. If we encounter - * an error, return a count of at least one so that the directory stays - * in the unlinked set. - * - * NOTE: this function assumes that the directory is inactive, - * so there is no need to lock its entries before deletion. - * Also, it assumes the directory contents is *only* regular - * files. - */ -static int -zfs_purgedir(znode_t *dzp) -{ - zap_cursor_t zc; - zap_attribute_t zap; - znode_t *xzp; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - int skipped = 0; - int error; - - for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); - (error = zap_cursor_retrieve(&zc, &zap)) == 0; - zap_cursor_advance(&zc)) { - error = zfs_zget(zfsvfs, - ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); - if (error) { - skipped += 1; - continue; - } - - vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); - ASSERT((ZTOV(xzp)->v_type == VREG) || - (ZTOV(xzp)->v_type == VLNK)); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); - dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - /* Is this really needed ? */ - zfs_sa_upgrade_txholds(tx, xzp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - vput(ZTOV(xzp)); - skipped += 1; - continue; - } - - error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); - if (error) - skipped += 1; - dmu_tx_commit(tx); - - vput(ZTOV(xzp)); - } - zap_cursor_fini(&zc); - if (error != ENOENT) - skipped += 1; - return (skipped); -} - -#if defined(__FreeBSD__) -extern taskq_t *zfsvfs_taskq; -#endif - -void -zfs_rmnode(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zfsvfs->z_os; - dmu_tx_t *tx; - uint64_t acl_obj; - uint64_t xattr_obj; - int error; - - ASSERT(zp->z_links == 0); - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - - /* - * If this is an attribute directory, purge its contents. - */ - if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && - (zp->z_pflags & ZFS_XATTR)) { - if (zfs_purgedir(zp) != 0) { - /* - * Not enough space to delete some xattrs. - * Leave it in the unlinked set. - */ - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - } else { - /* - * Free up all the data in the file. We don't do this for - * XATTR directories because we need truncate and remove to be - * in the same tx, like in zfs_znode_delete(). Otherwise, if - * we crash here we'll end up with an inconsistent truncated - * zap object in the delete queue. Note a truncated file is - * harmless since it only contains user data. 
- */ - error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); - if (error) { - /* - * Not enough space or we were interrupted by unmount. - * Leave the file in the unlinked set. - */ - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - } - - /* - * If the file has extended attributes, we're going to unlink - * the xattr dir. - */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error) - xattr_obj = 0; - - acl_obj = zfs_external_acl(zp); - - /* - * Set up the final transaction. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - if (xattr_obj) - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); - if (acl_obj) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - /* - * Not enough space to delete the file. Leave it in the - * unlinked set, leaking it until the fs is remounted (at - * which point we'll call zfs_unlinked_drain() to process it). - */ - dmu_tx_abort(tx); - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - -#if defined(__FreeBSD__) - /* - * FreeBSD's implemention of zfs_zget requires a vnode to back it. - * This means that we could end up calling into getnewvnode while - * calling zfs_rmnode as a result of a prior call to getnewvnode - * trying to clear vnodes out of the cache. If this repeats we can - * recurse enough that we overflow our stack. To avoid this, we - * avoid calling zfs_zget on the xattr znode and instead simply add - * it to the unlinked set and schedule a call to zfs_unlinked_drain. - */ - if (xattr_obj) { - /* Add extended attribute directory to the unlinked set. */ - VERIFY3U(0, ==, - zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx)); - } -#else - if (xzp) { - ASSERT(error == 0); - xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ - xzp->z_links = 0; /* no more links to it */ - VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), - &xzp->z_links, sizeof (xzp->z_links), tx)); - zfs_unlinked_add(xzp, tx); - } -#endif - - /* Remove this znode from the unlinked set */ - VERIFY3U(0, ==, - zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); - - zfs_znode_delete(zp, tx); - - dmu_tx_commit(tx); - -#if defined(__FreeBSD__) - if (xattr_obj) { - /* - * We're using the FreeBSD taskqueue API here instead of - * the Solaris taskq API since the FreeBSD API allows for a - * task to be enqueued multiple times but executed once. - */ - taskqueue_enqueue(zfsvfs_taskq->tq_queue, - &zfsvfs->z_unlinked_drain_task); - } -#endif -} - -static uint64_t -zfs_dirent(znode_t *zp, uint64_t mode) -{ - uint64_t de = zp->z_id; - - if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) - de |= IFTODT(mode) << 60; - return (de); -} - -/* - * Link zp into dzp. Can only fail if zp has been unlinked. 
- */ -int -zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, - int flag) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - vnode_t *vp = ZTOV(zp); - uint64_t value; - int zp_is_dir = (vp->v_type == VDIR); - sa_bulk_attr_t bulk[5]; - uint64_t mtime[2], ctime[2]; - int count = 0; - int error; - - ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); -#ifdef __FreeBSD__ - if (zp_is_dir) { - if (dzp->z_links >= ZFS_LINK_MAX) - return (SET_ERROR(EMLINK)); - } -#endif - if (!(flag & ZRENAMING)) { - if (zp->z_unlinked) { /* no new links to unlinked zp */ - ASSERT(!(flag & (ZNEW | ZEXISTS))); - return (SET_ERROR(ENOENT)); - } -#ifdef __FreeBSD__ - if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) { - return (SET_ERROR(EMLINK)); - } -#endif - zp->z_links++; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &zp->z_links, sizeof (zp->z_links)); - - } else { - ASSERT(zp->z_unlinked == 0); - } - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, - &dzp->z_id, sizeof (dzp->z_id)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - - if (!(flag & ZNEW)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - ctime, sizeof (ctime)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, - ctime, B_TRUE); - } - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT0(error); - - dzp->z_size++; - dzp->z_links += zp_is_dir; - count = 0; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &dzp->z_size, sizeof (dzp->z_size)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &dzp->z_links, sizeof (dzp->z_links)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - mtime, sizeof (mtime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &dzp->z_pflags, sizeof (dzp->z_pflags)); - zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); - error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT0(error); - - value = zfs_dirent(zp, zp->z_mode); - error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, - 8, 1, &value, tx); - VERIFY0(error); - - return (0); -} - -/* - * The match type in the code for this function should conform to: - * - * ------------------------------------------------------------------------ - * fs type | z_norm | lookup type | match type - * ---------|-------------|-------------|---------------------------------- - * CS !norm | 0 | 0 | 0 (exact) - * CS norm | formX | 0 | MT_NORMALIZE - * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE - * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE - * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE - * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE - * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE - * CM !norm | upper | ZCILOOK | MT_NORMALIZE - * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE - * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE - * - * Abbreviations: - * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed - * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) - * formX = unicode normalization form set on fs creation - */ -static int -zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, - int flag) -{ - int error; - - if (zp->z_zfsvfs->z_norm) { - matchtype_t mt = MT_NORMALIZE; - - if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) { - mt |= MT_MATCH_CASE; - } - - error = 
zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, - name, mt, tx); - } else { - error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx); - } - - return (error); -} - -/* - * Unlink zp from dzp, and mark zp for deletion if this was the last link. - * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). - * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. - * If it's non-NULL, we use it to indicate whether the znode needs deletion, - * and it's the caller's job to do it. - */ -int -zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, - int flag, boolean_t *unlinkedp) -{ - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - vnode_t *vp = ZTOV(zp); - int zp_is_dir = (vp->v_type == VDIR); - boolean_t unlinked = B_FALSE; - sa_bulk_attr_t bulk[5]; - uint64_t mtime[2], ctime[2]; - int count = 0; - int error; - - ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - - if (!(flag & ZRENAMING)) { - - if (zp_is_dir && !zfs_dirempty(zp)) { -#ifdef illumos - return (SET_ERROR(EEXIST)); -#else - return (SET_ERROR(ENOTEMPTY)); -#endif - } - - /* - * If we get here, we are going to try to remove the object. - * First try removing the name from the directory; if that - * fails, return the error. - */ - error = zfs_dropname(dzp, name, zp, tx, flag); - if (error != 0) { - return (error); - } - - if (zp->z_links <= zp_is_dir) { - zfs_panic_recover("zfs: link count on vnode %p is %u, " - "should be at least %u", zp->z_vnode, - (int)zp->z_links, - zp_is_dir + 1); - zp->z_links = zp_is_dir + 1; - } - if (--zp->z_links == zp_is_dir) { - zp->z_unlinked = B_TRUE; - zp->z_links = 0; - unlinked = B_TRUE; - } else { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, sizeof (zp->z_pflags)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, - B_TRUE); - } - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), - NULL, &zp->z_links, sizeof (zp->z_links)); - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - count = 0; - ASSERT0(error); - } else { - ASSERT(zp->z_unlinked == 0); - error = zfs_dropname(dzp, name, zp, tx, flag); - if (error != 0) - return (error); - } - - dzp->z_size--; /* one dirent removed */ - dzp->z_links -= zp_is_dir; /* ".." link from zp */ - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), - NULL, &dzp->z_links, sizeof (dzp->z_links)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), - NULL, &dzp->z_size, sizeof (dzp->z_size)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), - NULL, ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), - NULL, mtime, sizeof (mtime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); - zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); - error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT0(error); - - if (unlinkedp != NULL) - *unlinkedp = unlinked; - else if (unlinked) - zfs_unlinked_add(zp, tx); - - return (0); -} - -/* - * Indicate whether the directory is empty. 
- */ -boolean_t -zfs_dirempty(znode_t *dzp) -{ - return (dzp->z_size == 2); -} - -int -zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_t *xzp; - dmu_tx_t *tx; - int error; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - uint64_t parent; - - *xvpp = NULL; - - /* - * In FreeBSD, access checking for creating an EA is being done - * in zfs_setextattr(), - */ -#ifndef __FreeBSD_kernel__ - if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) - return (error); -#endif - - if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, - &acl_ids)) != 0) - return (error); - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - return (SET_ERROR(EDQUOT)); - } - - getnewvnode_reserve(); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - return (error); - } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - -#ifdef DEBUG - error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent)); - ASSERT(error == 0 && parent == zp->z_id); -#endif - - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, - sizeof (xzp->z_id), tx)); - - (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, acl_ids.z_fuidp, vap); - - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - - getnewvnode_drop_reserve(); - - *xvpp = ZTOV(xzp); - - return (0); -} - -/* - * Return a znode for the extended attribute directory for zp. - * ** If the directory does not already exist, it is created ** - * - * IN: zp - znode to obtain attribute directory from - * cr - credentials of caller - * flags - flags from the VOP_LOOKUP call - * - * OUT: xzpp - pointer to extended attribute znode - * - * RETURN: 0 on success - * error number on failure - */ -int -zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_t *xzp; - vattr_t va; - int error; -top: - error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); - if (error) - return (error); - - if (xzp != NULL) { - *xvpp = ZTOV(xzp); - return (0); - } - - - if (!(flags & CREATE_XATTR_DIR)) { -#ifdef illumos - return (SET_ERROR(ENOENT)); -#else - return (SET_ERROR(ENOATTR)); -#endif - } - - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - return (SET_ERROR(EROFS)); - } - - /* - * The ability to 'create' files in an attribute - * directory comes from the write_xattr permission on the base file. - * - * The ability to 'search' an attribute directory requires - * read_xattr permission on the base file. - * - * Once in a directory the ability to read/write attributes - * is controlled by the permissions on the attribute file. 
- */ - va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; - va.va_type = VDIR; - va.va_mode = S_IFDIR | S_ISVTX | 0777; - zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); - - error = zfs_make_xattrdir(zp, &va, xvpp, cr); - - if (error == ERESTART) { - /* NB: we already did dmu_tx_wait() if necessary */ - goto top; - } - if (error == 0) - VOP_UNLOCK(*xvpp); - - return (error); -} - -/* - * Decide whether it is okay to remove within a sticky directory. - * - * In sticky directories, write access is not sufficient; - * you can remove entries from a directory only if: - * - * you own the directory, - * you own the entry, - * the entry is a plain file and you have write access, - * or you are privileged (checked in secpolicy...). - * - * The function returns 0 if remove access is granted. - */ -int -zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) -{ - uid_t uid; - uid_t downer; - uid_t fowner; - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - - if (zdp->z_zfsvfs->z_replay) - return (0); - - if ((zdp->z_mode & S_ISVTX) == 0) - return (0); - - downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); - fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); - - if ((uid = crgetuid(cr)) == downer || uid == fowner || - (ZTOV(zp)->v_type == VREG && - zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) - return (0); - else - return (secpolicy_vnode_remove(ZTOV(zp), cr)); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c +++ /dev/null @@ -1,871 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Copyright (c) 2012 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* - * This general routine is responsible for generating all the different ZFS - * ereports. The payload is dependent on the class, and which arguments are - * supplied to the function: - * - * EREPORT POOL VDEV IO - * block X X X - * data X X - * device X X - * pool X - * - * If we are in a loading state, all errors are chained together by the same - * SPA-wide ENA (Error Numeric Association). - * - * For isolated I/O requests, we get the ENA from the zio_t. The propagation - * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want - * to chain together all ereports associated with a logical piece of data. 
For - * read I/Os, there are basically three 'types' of I/O, which form a roughly - * layered diagram: - * - * +---------------+ - * | Aggregate I/O | No associated logical data or device - * +---------------+ - * | - * V - * +---------------+ Reads associated with a piece of logical data. - * | Read I/O | This includes reads on behalf of RAID-Z, - * +---------------+ mirrors, gang blocks, retries, etc. - * | - * V - * +---------------+ Reads associated with a particular device, but - * | Physical I/O | no logical data. Issued as part of vdev caching - * +---------------+ and I/O aggregation. - * - * Note that 'physical I/O' here is not the same terminology as used in the rest - * of ZIO. Typically, 'physical I/O' simply means that there is no attached - * blockpointer. But I/O with no associated block pointer can still be related - * to a logical piece of data (i.e. RAID-Z requests). - * - * Purely physical I/O always have unique ENAs. They are not related to a - * particular piece of logical data, and therefore cannot be chained together. - * We still generate an ereport, but the DE doesn't correlate it with any - * logical piece of data. When such an I/O fails, the delegated I/O requests - * will issue a retry, which will trigger the 'real' ereport with the correct - * ENA. - * - * We keep track of the ENA for a ZIO chain through the 'io_logical' member. - * When a new logical I/O is issued, we set this to point to itself. Child I/Os - * then inherit this pointer, so that when it is first set subsequent failures - * will use the same ENA. For vdev cache fill and queue aggregation I/O, - * this pointer is set to NULL, and no ereport will be generated (since it - * doesn't actually correspond to any particular device or piece of data, - * and the caller will always retry without caching or queueing anyway). - * - * For checksum errors, we want to include more information about the actual - * error which occurs. Accordingly, we build an ereport when the error is - * noticed, but instead of sending it in immediately, we hang it off of the - * io_cksum_report field of the logical IO. When the logical IO completes - * (successfully or not), zfs_ereport_finish_checksum() is called with the - * good and bad versions of the buffer (if available), and we annotate the - * ereport with information about the differences. - */ -#ifdef _KERNEL -static void -zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, - uint64_t stateoroffset, uint64_t size) -{ - nvlist_t *ereport, *detector; - - uint64_t ena; - char class[64]; - - /* - * If we are doing a spa_tryimport() or in recovery mode, - * ignore errors. - */ - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER) - return; - - /* - * If we are in the middle of opening a pool, and the previous attempt - * failed, don't bother logging any new ereports - we're just going to - * get the same diagnosis anyway. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE && - spa->spa_last_open_failed) - return; - - if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. - */ - if (zio->io_type != ZIO_TYPE_READ && - zio->io_type != ZIO_TYPE_WRITE) - return; - - /* - * Ignore any errors from speculative I/Os, as failure is an - * expected result. - */ - if (zio->io_flags & ZIO_FLAG_SPECULATIVE) - return; - - /* - * If this I/O is not a retry I/O, don't post an ereport. 
- * Otherwise, we risk making bad diagnoses based on B_FAILFAST - * I/Os. - */ - if (zio->io_error == EIO && - !(zio->io_flags & ZIO_FLAG_IO_RETRY)) - return; - - if (vd != NULL) { - /* - * If the vdev has already been marked as failing due - * to a failed probe, then ignore any subsequent I/O - * errors, as the DE will automatically fault the vdev - * on the first such failure. This also catches cases - * where vdev_remove_wanted is set and the device has - * not yet been asynchronously placed into the REMOVED - * state. - */ - if (zio->io_vd == vd && !vdev_accessible(vd, zio)) - return; - - /* - * Ignore checksum errors for reads from DTL regions of - * leaf vdevs. - */ - if (zio->io_type == ZIO_TYPE_READ && - zio->io_error == ECKSUM && - vd->vdev_ops->vdev_op_leaf && - vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) - return; - } - } - - /* - * For probe failure, we want to avoid posting ereports if we've - * already removed the device in the meantime. - */ - if (vd != NULL && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && - (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) - return; - - if ((ereport = fm_nvlist_create(NULL)) == NULL) - return; - - if ((detector = fm_nvlist_create(NULL)) == NULL) { - fm_nvlist_destroy(ereport, FM_NVA_FREE); - return; - } - - /* - * Serialize ereport generation - */ - mutex_enter(&spa->spa_errlist_lock); - - /* - * Determine the ENA to use for this event. If we are in a loading - * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use - * a root zio-wide ENA. Otherwise, simply use a unique ENA. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE) { - if (spa->spa_ena == 0) - spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); - ena = spa->spa_ena; - } else if (zio != NULL && zio->io_logical != NULL) { - if (zio->io_logical->io_ena == 0) - zio->io_logical->io_ena = - fm_ena_generate(0, FM_ENA_FMT1); - ena = zio->io_logical->io_ena; - } else { - ena = fm_ena_generate(0, FM_ENA_FMT1); - } - - /* - * Construct the full class, detector, and other standard FMA fields. - */ - (void) snprintf(class, sizeof (class), "%s.%s", - ZFS_ERROR_CLASS, subclass); - - fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), - vd != NULL ? vd->vdev_guid : 0); - - fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); - - /* - * Construct the per-ereport payload, depending on which parameters are - * passed in. - */ - - /* - * Generic payload members common to all ereports. - */ - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, - DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, - DATA_TYPE_UINT64, spa_guid(spa), - FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, - spa_load_state(spa), NULL); - - if (spa != NULL) { - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, - DATA_TYPE_STRING, - spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? - FM_EREPORT_FAILMODE_WAIT : - spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 
- FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, - NULL); - } - - if (vd != NULL) { - vdev_t *pvd = vd->vdev_parent; - - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, - DATA_TYPE_UINT64, vd->vdev_guid, - FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, - DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); - if (vd->vdev_path != NULL) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, - DATA_TYPE_STRING, vd->vdev_path, NULL); - if (vd->vdev_devid != NULL) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, - DATA_TYPE_STRING, vd->vdev_devid, NULL); - if (vd->vdev_fru != NULL) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, - DATA_TYPE_STRING, vd->vdev_fru, NULL); - - if (pvd != NULL) { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, - DATA_TYPE_UINT64, pvd->vdev_guid, - FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, - DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, - NULL); - if (pvd->vdev_path) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, - DATA_TYPE_STRING, pvd->vdev_path, NULL); - if (pvd->vdev_devid) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, - DATA_TYPE_STRING, pvd->vdev_devid, NULL); - } - } - - if (zio != NULL) { - /* - * Payload common to all I/Os. - */ - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, - DATA_TYPE_INT32, zio->io_error, NULL); - - /* - * If the 'size' parameter is non-zero, it indicates this is a - * RAID-Z or other I/O where the physical offset and length are - * provided for us, instead of within the zio_t. - */ - if (vd != NULL) { - if (size) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, - DATA_TYPE_UINT64, stateoroffset, - FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, - DATA_TYPE_UINT64, size, NULL); - else - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, - DATA_TYPE_UINT64, zio->io_offset, - FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, - DATA_TYPE_UINT64, zio->io_size, NULL); - } - - /* - * Payload for I/Os with corresponding logical information. - */ - if (zio->io_logical != NULL) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, - DATA_TYPE_UINT64, - zio->io_logical->io_bookmark.zb_objset, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, - DATA_TYPE_UINT64, - zio->io_logical->io_bookmark.zb_object, - FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, - DATA_TYPE_INT64, - zio->io_logical->io_bookmark.zb_level, - FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, - DATA_TYPE_UINT64, - zio->io_logical->io_bookmark.zb_blkid, NULL); - } else if (vd != NULL) { - /* - * If we have a vdev but no zio, this is a device fault, and the - * 'stateoroffset' parameter indicates the previous state of the - * vdev. - */ - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, - DATA_TYPE_UINT64, stateoroffset, NULL); - } - - mutex_exit(&spa->spa_errlist_lock); - - *ereport_out = ereport; - *detector_out = detector; -} - -/* if it's <= 128 bytes, save the corruption directly */ -#define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) - -#define MAX_RANGES 16 - -typedef struct zfs_ecksum_info { - /* histograms of set and cleared bits by bit number in a 64-bit word */ - uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; - uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; - - /* inline arrays of bits set and cleared. */ - uint64_t zei_bits_set[ZFM_MAX_INLINE]; - uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; - - /* - * for each range, the number of bits set and cleared. The Hamming - * distance between the good and bad buffers is the sum of them all. 
- */ - uint32_t zei_range_sets[MAX_RANGES]; - uint32_t zei_range_clears[MAX_RANGES]; - - struct zei_ranges { - uint32_t zr_start; - uint32_t zr_end; - } zei_ranges[MAX_RANGES]; - - size_t zei_range_count; - uint32_t zei_mingap; - uint32_t zei_allowed_mingap; - -} zfs_ecksum_info_t; - -static void -update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) -{ - size_t i; - size_t bits = 0; - uint64_t value = BE_64(value_arg); - - /* We store the bits in big-endian (largest-first) order */ - for (i = 0; i < 64; i++) { - if (value & (1ull << i)) { - hist[63 - i]++; - ++bits; - } - } - /* update the count of bits changed */ - *count += bits; -} - -/* - * We've now filled up the range array, and need to increase "mingap" and - * shrink the range list accordingly. zei_mingap is always the smallest - * distance between array entries, so we set the new_allowed_gap to be - * one greater than that. We then go through the list, joining together - * any ranges which are closer than the new_allowed_gap. - * - * By construction, there will be at least one. We also update zei_mingap - * to the new smallest gap, to prepare for our next invocation. - */ -static void -shrink_ranges(zfs_ecksum_info_t *eip) -{ - uint32_t mingap = UINT32_MAX; - uint32_t new_allowed_gap = eip->zei_mingap + 1; - - size_t idx, output; - size_t max = eip->zei_range_count; - - struct zei_ranges *r = eip->zei_ranges; - - ASSERT3U(eip->zei_range_count, >, 0); - ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); - - output = idx = 0; - while (idx < max - 1) { - uint32_t start = r[idx].zr_start; - uint32_t end = r[idx].zr_end; - - while (idx < max - 1) { - idx++; - - uint32_t nstart = r[idx].zr_start; - uint32_t nend = r[idx].zr_end; - - uint32_t gap = nstart - end; - if (gap < new_allowed_gap) { - end = nend; - continue; - } - if (gap < mingap) - mingap = gap; - break; - } - r[output].zr_start = start; - r[output].zr_end = end; - output++; - } - ASSERT3U(output, <, eip->zei_range_count); - eip->zei_range_count = output; - eip->zei_mingap = mingap; - eip->zei_allowed_mingap = new_allowed_gap; -} - -static void -add_range(zfs_ecksum_info_t *eip, int start, int end) -{ - struct zei_ranges *r = eip->zei_ranges; - size_t count = eip->zei_range_count; - - if (count >= MAX_RANGES) { - shrink_ranges(eip); - count = eip->zei_range_count; - } - if (count == 0) { - eip->zei_mingap = UINT32_MAX; - eip->zei_allowed_mingap = 1; - } else { - int gap = start - r[count - 1].zr_end; - - if (gap < eip->zei_allowed_mingap) { - r[count - 1].zr_end = end; - return; - } - if (gap < eip->zei_mingap) - eip->zei_mingap = gap; - } - r[count].zr_start = start; - r[count].zr_end = end; - eip->zei_range_count++; -} - -static size_t -range_total_size(zfs_ecksum_info_t *eip) -{ - struct zei_ranges *r = eip->zei_ranges; - size_t count = eip->zei_range_count; - size_t result = 0; - size_t idx; - - for (idx = 0; idx < count; idx++) - result += (r[idx].zr_end - r[idx].zr_start); - - return (result); -} - -static zfs_ecksum_info_t * -annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, - const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, - boolean_t drop_if_identical) -{ - const uint64_t *good = (const uint64_t *)goodbuf; - const uint64_t *bad = (const uint64_t *)badbuf; - - uint64_t allset = 0; - uint64_t allcleared = 0; - - size_t nui64s = size / sizeof (uint64_t); - - size_t inline_size; - int no_inline = 0; - size_t idx; - size_t range; - - size_t offset = 0; - ssize_t start = -1; - - zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), 
KM_SLEEP); - - /* don't do any annotation for injected checksum errors */ - if (info != NULL && info->zbc_injected) - return (eip); - - if (info != NULL && info->zbc_has_cksum) { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_expected) / sizeof (uint64_t), - (uint64_t *)&info->zbc_expected, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_actual) / sizeof (uint64_t), - (uint64_t *)&info->zbc_actual, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, - DATA_TYPE_STRING, - info->zbc_checksum_name, - NULL); - - if (info->zbc_byteswapped) { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, - DATA_TYPE_BOOLEAN, 1, - NULL); - } - } - - if (badbuf == NULL || goodbuf == NULL) - return (eip); - - ASSERT3U(nui64s, <=, UINT32_MAX); - ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(size, <=, UINT32_MAX); - - /* build up the range list by comparing the two buffers. */ - for (idx = 0; idx < nui64s; idx++) { - if (good[idx] == bad[idx]) { - if (start == -1) - continue; - - add_range(eip, start, idx); - start = -1; - } else { - if (start != -1) - continue; - - start = idx; - } - } - if (start != -1) - add_range(eip, start, idx); - - /* See if it will fit in our inline buffers */ - inline_size = range_total_size(eip); - if (inline_size > ZFM_MAX_INLINE) - no_inline = 1; - - /* - * If there is no change and we want to drop if the buffers are - * identical, do so. - */ - if (inline_size == 0 && drop_if_identical) { - kmem_free(eip, sizeof (*eip)); - return (NULL); - } - - /* - * Now walk through the ranges, filling in the details of the - * differences. Also convert our uint64_t-array offsets to byte - * offsets. - */ - for (range = 0; range < eip->zei_range_count; range++) { - size_t start = eip->zei_ranges[range].zr_start; - size_t end = eip->zei_ranges[range].zr_end; - - for (idx = start; idx < end; idx++) { - uint64_t set, cleared; - - // bits set in bad, but not in good - set = ((~good[idx]) & bad[idx]); - // bits set in good, but not in bad - cleared = (good[idx] & (~bad[idx])); - - allset |= set; - allcleared |= cleared; - - if (!no_inline) { - ASSERT3U(offset, <, inline_size); - eip->zei_bits_set[offset] = set; - eip->zei_bits_cleared[offset] = cleared; - offset++; - } - - update_histogram(set, eip->zei_histogram_set, - &eip->zei_range_sets[range]); - update_histogram(cleared, eip->zei_histogram_cleared, - &eip->zei_range_clears[range]); - } - - /* convert to byte offsets */ - eip->zei_ranges[range].zr_start *= sizeof (uint64_t); - eip->zei_ranges[range].zr_end *= sizeof (uint64_t); - } - eip->zei_allowed_mingap *= sizeof (uint64_t); - inline_size *= sizeof (uint64_t); - - /* fill in ereport */ - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, - DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, - (uint32_t *)eip->zei_ranges, - FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, - DATA_TYPE_UINT32, eip->zei_allowed_mingap, - FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, - DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, - FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, - DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, - NULL); - - if (!no_inline) { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, - DATA_TYPE_UINT8_ARRAY, - inline_size, (uint8_t *)eip->zei_bits_set, - FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, - DATA_TYPE_UINT8_ARRAY, - inline_size, (uint8_t *)eip->zei_bits_cleared, - NULL); - } else 
{ - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_set, - FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, - NULL); - } - return (eip); -} -#endif - -void -zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, - uint64_t stateoroffset, uint64_t size) -{ -#ifdef _KERNEL - nvlist_t *ereport = NULL; - nvlist_t *detector = NULL; - - zfs_ereport_start(&ereport, &detector, - subclass, spa, vd, zio, stateoroffset, size); - - if (ereport == NULL) - return; - - fm_ereport_post(ereport, EVCH_SLEEP); - - fm_nvlist_destroy(ereport, FM_NVA_FREE); - fm_nvlist_destroy(detector, FM_NVA_FREE); -#endif -} - -void -zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t offset, uint64_t length, void *arg, - zio_bad_cksum_t *info) -{ - zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP); - - if (zio->io_vsd != NULL) - zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); - else - zio_vsd_default_cksum_report(zio, report, arg); - - /* copy the checksum failure information if it was provided */ - if (info != NULL) { - report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); - bcopy(info, report->zcr_ckinfo, sizeof (*info)); - } - - report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; - report->zcr_length = length; - -#ifdef _KERNEL - zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, - FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); - - if (report->zcr_ereport == NULL) { - report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); - if (report->zcr_ckinfo != NULL) { - kmem_free(report->zcr_ckinfo, - sizeof (*report->zcr_ckinfo)); - } - kmem_free(report, sizeof (*report)); - return; - } -#endif - - mutex_enter(&spa->spa_errlist_lock); - report->zcr_next = zio->io_logical->io_cksum_report; - zio->io_logical->io_cksum_report = report; - mutex_exit(&spa->spa_errlist_lock); -} - -void -zfs_ereport_finish_checksum(zio_cksum_report_t *report, - const void *good_data, const void *bad_data, boolean_t drop_if_identical) -{ -#ifdef _KERNEL - zfs_ecksum_info_t *info = NULL; - info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, - good_data, bad_data, report->zcr_length, drop_if_identical); - - if (info != NULL) - fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); - - fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE); - fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE); - report->zcr_ereport = report->zcr_detector = NULL; - - if (info != NULL) - kmem_free(info, sizeof (*info)); -#endif -} - -void -zfs_ereport_free_checksum(zio_cksum_report_t *rpt) -{ -#ifdef _KERNEL - if (rpt->zcr_ereport != NULL) { - fm_nvlist_destroy(rpt->zcr_ereport, - FM_NVA_FREE); - fm_nvlist_destroy(rpt->zcr_detector, - FM_NVA_FREE); - } -#endif - rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); - - if (rpt->zcr_ckinfo != NULL) - kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); - - kmem_free(rpt, sizeof (*rpt)); -} - -void -zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) -{ -#ifdef _KERNEL - fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); -#endif -} - -void -zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t offset, uint64_t length, - const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) -{ -#ifdef _KERNEL - nvlist_t *ereport = NULL; - nvlist_t *detector = NULL; - zfs_ecksum_info_t *info; - - zfs_ereport_start(&ereport, 
&detector, - FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); - - if (ereport == NULL) - return; - - info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, - B_FALSE); - - if (info != NULL) - fm_ereport_post(ereport, EVCH_SLEEP); - - fm_nvlist_destroy(ereport, FM_NVA_FREE); - fm_nvlist_destroy(detector, FM_NVA_FREE); - - if (info != NULL) - kmem_free(info, sizeof (*info)); -#endif -} - -static void -zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) -{ -#ifdef _KERNEL - nvlist_t *resource; - char class[64]; - - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) - return; - - if ((resource = fm_nvlist_create(NULL)) == NULL) - return; - - (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, - ZFS_ERROR_CLASS, name); - VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); - VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); - VERIFY(nvlist_add_uint64(resource, - FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); - if (vd) - VERIFY(nvlist_add_uint64(resource, - FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); - - fm_ereport_post(resource, EVCH_SLEEP); - - fm_nvlist_destroy(resource, FM_NVA_FREE); -#endif -} - -/* - * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev - * has been removed from the system. This will cause the DE to ignore any - * recent I/O errors, inferring that they are due to the asynchronous device - * removal. - */ -void -zfs_post_remove(spa_t *spa, vdev_t *vd) -{ - zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); -} - -/* - * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool - * has the 'autoreplace' property set, and therefore any broken vdevs will be - * handled by higher level logic, and no vdev fault should be generated. - */ -void -zfs_post_autoreplace(spa_t *spa, vdev_t *vd) -{ - zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); -} - -/* - * The 'resource.fs.zfs.statechange' event is an internal signal that the - * given vdev has transitioned its state to DEGRADED or HEALTHY. This will - * cause the retire agent to repair any outstanding fault management cases - * open because the device was not found (fault.fs.zfs.device). - */ -void -zfs_post_state_change(spa_t *spa, vdev_t *vd) -{ - zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c +++ /dev/null @@ -1,762 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#include -#include -#include -#endif -#include - -/* - * FUID Domain table(s). - * - * The FUID table is stored as a packed nvlist of an array - * of nvlists which contain an index, domain string and offset - * - * During file system initialization the nvlist(s) are read and - * two AVL trees are created. One tree is keyed by the index number - * and the other by the domain string. Nodes are never removed from - * trees, but new entries may be added. If a new entry is added then - * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then - * be responsible for calling zfs_fuid_sync() to sync the changes to disk. - * - */ - -#define FUID_IDX "fuid_idx" -#define FUID_DOMAIN "fuid_domain" -#define FUID_OFFSET "fuid_offset" -#define FUID_NVP_ARRAY "fuid_nvlist" - -typedef struct fuid_domain { - avl_node_t f_domnode; - avl_node_t f_idxnode; - ksiddomain_t *f_ksid; - uint64_t f_idx; -} fuid_domain_t; - -static char *nulldomain = ""; - -/* - * Compare two indexes. - */ -static int -idx_compare(const void *arg1, const void *arg2) -{ - const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; - const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - - return (AVL_CMP(node1->f_idx, node2->f_idx)); -} - -/* - * Compare two domain strings. - */ -static int -domain_compare(const void *arg1, const void *arg2) -{ - const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; - const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - int val; - - val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - - return (AVL_ISIGN(val)); -} - -void -zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) -{ - avl_create(idx_tree, idx_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); - avl_create(domain_tree, domain_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); -} - -/* - * load initial fuid domain and idx trees. This function is used by - * both the kernel and zdb. 
- */ -uint64_t -zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, - avl_tree_t *domain_tree) -{ - dmu_buf_t *db; - uint64_t fuid_size; - - ASSERT(fuid_obj != 0); - VERIFY(0 == dmu_bonus_hold(os, fuid_obj, - FTAG, &db)); - fuid_size = *(uint64_t *)db->db_data; - dmu_buf_rele(db, FTAG); - - if (fuid_size) { - nvlist_t **fuidnvp; - nvlist_t *nvp = NULL; - uint_t count; - char *packed; - int i; - - packed = kmem_alloc(fuid_size, KM_SLEEP); - VERIFY(dmu_read(os, fuid_obj, 0, - fuid_size, packed, DMU_READ_PREFETCH) == 0); - VERIFY(nvlist_unpack(packed, fuid_size, - &nvp, 0) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, - &fuidnvp, &count) == 0); - - for (i = 0; i != count; i++) { - fuid_domain_t *domnode; - char *domain; - uint64_t idx; - - VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, - &domain) == 0); - VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, - &idx) == 0); - - domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); - - domnode->f_idx = idx; - domnode->f_ksid = ksid_lookupdomain(domain); - avl_add(idx_tree, domnode); - avl_add(domain_tree, domnode); - } - nvlist_free(nvp); - kmem_free(packed, fuid_size); - } - return (fuid_size); -} - -void -zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) -{ - fuid_domain_t *domnode; - void *cookie; - - cookie = NULL; - while (domnode = avl_destroy_nodes(domain_tree, &cookie)) - ksiddomain_rele(domnode->f_ksid); - - avl_destroy(domain_tree); - cookie = NULL; - while (domnode = avl_destroy_nodes(idx_tree, &cookie)) - kmem_free(domnode, sizeof (fuid_domain_t)); - avl_destroy(idx_tree); -} - -char * -zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) -{ - fuid_domain_t searchnode, *findnode; - avl_index_t loc; - - searchnode.f_idx = idx; - - findnode = avl_find(idx_tree, &searchnode, &loc); - - return (findnode ? findnode->f_ksid->kd_name : nulldomain); -} - -#ifdef _KERNEL -/* - * Load the fuid table(s) into memory. - */ -static void -zfs_fuid_init(zfsvfs_t *zfsvfs) -{ - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); - - if (zfsvfs->z_fuid_loaded) { - rw_exit(&zfsvfs->z_fuid_lock); - return; - } - - zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - - (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); - if (zfsvfs->z_fuid_obj != 0) { - zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, - zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, - &zfsvfs->z_fuid_domain); - } - - zfsvfs->z_fuid_loaded = B_TRUE; - rw_exit(&zfsvfs->z_fuid_lock); -} - -/* - * sync out AVL trees to persistent storage. - */ -void -zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) -{ - nvlist_t *nvp; - nvlist_t **fuids; - size_t nvsize = 0; - char *packed; - dmu_buf_t *db; - fuid_domain_t *domnode; - int numnodes; - int i; - - if (!zfsvfs->z_fuid_dirty) { - return; - } - - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); - - /* - * First see if table needs to be created? 
- */ - if (zfsvfs->z_fuid_obj == 0) { - zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, - sizeof (uint64_t), tx); - VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, sizeof (uint64_t), 1, - &zfsvfs->z_fuid_obj, tx) == 0); - } - - VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); - fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); - for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, - domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { - VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, - domnode->f_idx) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); - VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, - domnode->f_ksid->kd_name) == 0); - } - VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, - fuids, numnodes) == 0); - for (i = 0; i != numnodes; i++) - nvlist_free(fuids[i]); - kmem_free(fuids, numnodes * sizeof (void *)); - VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); - packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(nvp, &packed, &nvsize, - NV_ENCODE_XDR, KM_SLEEP) == 0); - nvlist_free(nvp); - zfsvfs->z_fuid_size = nvsize; - dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx); - kmem_free(packed, zfsvfs->z_fuid_size); - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, - FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; - dmu_buf_rele(db, FTAG); - - zfsvfs->z_fuid_dirty = B_FALSE; - rw_exit(&zfsvfs->z_fuid_lock); -} - -/* - * Query domain table for a given domain. - * - * If domain isn't found and addok is set, it is added to AVL trees and - * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be - * necessary for the caller or another thread to detect the dirty table - * and sync out the changes. - */ -int -zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, - char **retdomain, boolean_t addok) -{ - fuid_domain_t searchnode, *findnode; - avl_index_t loc; - krw_t rw = RW_READER; - - /* - * If the dummy "nobody" domain then return an index of 0 - * to cause the created FUID to be a standard POSIX id - * for the user nobody. - */ - if (domain[0] == '\0') { - if (retdomain) - *retdomain = nulldomain; - return (0); - } - - searchnode.f_ksid = ksid_lookupdomain(domain); - if (retdomain) - *retdomain = searchnode.f_ksid->kd_name; - if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs); - -retry: - rw_enter(&zfsvfs->z_fuid_lock, rw); - findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); - - if (findnode) { - rw_exit(&zfsvfs->z_fuid_lock); - ksiddomain_rele(searchnode.f_ksid); - return (findnode->f_idx); - } else if (addok) { - fuid_domain_t *domnode; - uint64_t retidx; - - if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { - rw_exit(&zfsvfs->z_fuid_lock); - rw = RW_WRITER; - goto retry; - } - - domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); - domnode->f_ksid = searchnode.f_ksid; - - retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; - - avl_add(&zfsvfs->z_fuid_domain, domnode); - avl_add(&zfsvfs->z_fuid_idx, domnode); - zfsvfs->z_fuid_dirty = B_TRUE; - rw_exit(&zfsvfs->z_fuid_lock); - return (retidx); - } else { - rw_exit(&zfsvfs->z_fuid_lock); - return (-1); - } -} - -/* - * Query domain table by index, returning domain string - * - * Returns a pointer from an avl node of the domain string. 
- * - */ -const char * -zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) -{ - char *domain; - - if (idx == 0 || !zfsvfs->z_use_fuids) - return (NULL); - - if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs); - - rw_enter(&zfsvfs->z_fuid_lock, RW_READER); - - if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty) - domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); - else - domain = nulldomain; - rw_exit(&zfsvfs->z_fuid_lock); - - ASSERT(domain); - return (domain); -} - -void -zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) -{ - *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); - *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP); -} - -uid_t -zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, - cred_t *cr, zfs_fuid_type_t type) -{ - uint32_t index = FUID_INDEX(fuid); - const char *domain; - uid_t id; - - if (index == 0) - return (fuid); - - domain = zfs_fuid_find_by_idx(zfsvfs, index); - ASSERT(domain != NULL); - -#ifdef illumos - if (type == ZFS_OWNER || type == ZFS_ACE_USER) { - (void) kidmap_getuidbysid(crgetzone(cr), domain, - FUID_RID(fuid), &id); - } else { - (void) kidmap_getgidbysid(crgetzone(cr), domain, - FUID_RID(fuid), &id); - } -#else - id = UID_NOBODY; -#endif - return (id); -} - -/* - * Add a FUID node to the list of fuid's being created for this - * ACL - * - * If ACL has multiple domains, then keep only one copy of each unique - * domain. - */ -void -zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, - uint64_t idx, uint64_t id, zfs_fuid_type_t type) -{ - zfs_fuid_t *fuid; - zfs_fuid_domain_t *fuid_domain; - zfs_fuid_info_t *fuidp; - uint64_t fuididx; - boolean_t found = B_FALSE; - - if (*fuidpp == NULL) - *fuidpp = zfs_fuid_info_alloc(); - - fuidp = *fuidpp; - /* - * First find fuid domain index in linked list - * - * If one isn't found then create an entry. - */ - - for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); - fuid_domain; fuid_domain = list_next(&fuidp->z_domains, - fuid_domain), fuididx++) { - if (idx == fuid_domain->z_domidx) { - found = B_TRUE; - break; - } - } - - if (!found) { - fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); - fuid_domain->z_domain = domain; - fuid_domain->z_domidx = idx; - list_insert_tail(&fuidp->z_domains, fuid_domain); - fuidp->z_domain_str_sz += strlen(domain) + 1; - fuidp->z_domain_cnt++; - } - - if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { - - /* - * Now allocate fuid entry and add it on the end of the list - */ - - fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); - fuid->z_id = id; - fuid->z_domidx = idx; - fuid->z_logfuid = FUID_ENCODE(fuididx, rid); - - list_insert_tail(&fuidp->z_fuids, fuid); - fuidp->z_fuid_cnt++; - } else { - if (type == ZFS_OWNER) - fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); - else - fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); - } -} - -/* - * Create a file system FUID, based on information in the users cred - * - * If cred contains KSID_OWNER then it should be used to determine - * the uid otherwise cred's uid will be used. By default cred's gid - * is used unless it's an ephemeral ID in which case KSID_GROUP will - * be used if it exists. - */ -uint64_t -zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, - cred_t *cr, zfs_fuid_info_t **fuidp) -{ - uint64_t idx; - ksid_t *ksid; - uint32_t rid; - char *kdomain; - const char *domain; - uid_t id; - - VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); - - ksid = crgetsid(cr, (type == ZFS_OWNER) ? 
KSID_OWNER : KSID_GROUP); - - if (!zfsvfs->z_use_fuids || (ksid == NULL)) { - id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); - - if (IS_EPHEMERAL(id)) - return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); - - return ((uint64_t)id); - } - - /* - * ksid is present and FUID is supported - */ - id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr); - - if (!IS_EPHEMERAL(id)) - return ((uint64_t)id); - - if (type == ZFS_GROUP) - id = ksid_getid(ksid); - - rid = ksid_getrid(ksid); - domain = ksid_getdomain(ksid); - - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); - - zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); - - return (FUID_ENCODE(idx, rid)); -} - -/* - * Create a file system FUID for an ACL ace - * or a chown/chgrp of the file. - * This is similar to zfs_fuid_create_cred, except that - * we can't find the domain + rid information in the - * cred. Instead we have to query Winchester for the - * domain and rid. - * - * During replay operations the domain+rid information is - * found in the zfs_fuid_info_t that the replay code has - * attached to the zfsvfs of the file system. - */ -uint64_t -zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, - zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) -{ - const char *domain; - char *kdomain; - uint32_t fuid_idx = FUID_INDEX(id); - uint32_t rid; - idmap_stat status; - uint64_t idx = 0; - zfs_fuid_t *zfuid = NULL; - zfs_fuid_info_t *fuidp = NULL; - - /* - * If POSIX ID, or entry is already a FUID then - * just return the id - * - * We may also be handed an already FUID'ized id via - * chmod. - */ - - if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) - return (id); - - if (zfsvfs->z_replay) { - fuidp = zfsvfs->z_fuid_replay; - - /* - * If we are passed an ephemeral id, but no - * fuid_info was logged then return NOBODY. - * This is most likely a result of idmap service - * not being available. - */ - if (fuidp == NULL) - return (UID_NOBODY); - - VERIFY3U(type, >=, ZFS_OWNER); - VERIFY3U(type, <=, ZFS_ACE_GROUP); - - switch (type) { - case ZFS_ACE_USER: - case ZFS_ACE_GROUP: - zfuid = list_head(&fuidp->z_fuids); - rid = FUID_RID(zfuid->z_logfuid); - idx = FUID_INDEX(zfuid->z_logfuid); - break; - case ZFS_OWNER: - rid = FUID_RID(fuidp->z_fuid_owner); - idx = FUID_INDEX(fuidp->z_fuid_owner); - break; - case ZFS_GROUP: - rid = FUID_RID(fuidp->z_fuid_group); - idx = FUID_INDEX(fuidp->z_fuid_group); - break; - }; - domain = fuidp->z_domain_table[idx - 1]; - } else { - if (type == ZFS_OWNER || type == ZFS_ACE_USER) - status = kidmap_getsidbyuid(crgetzone(cr), id, - &domain, &rid); - else - status = kidmap_getsidbygid(crgetzone(cr), id, - &domain, &rid); - - if (status != 0) { - /* - * When returning nobody we will need to - * make a dummy fuid table entry for logging - * purposes. 
- */ - rid = UID_NOBODY; - domain = nulldomain; - } - } - - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); - - if (!zfsvfs->z_replay) - zfs_fuid_node_add(fuidpp, kdomain, - rid, idx, id, type); - else if (zfuid != NULL) { - list_remove(&fuidp->z_fuids, zfuid); - kmem_free(zfuid, sizeof (zfs_fuid_t)); - } - return (FUID_ENCODE(idx, rid)); -} - -void -zfs_fuid_destroy(zfsvfs_t *zfsvfs) -{ - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); - if (!zfsvfs->z_fuid_loaded) { - rw_exit(&zfsvfs->z_fuid_lock); - return; - } - zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - rw_exit(&zfsvfs->z_fuid_lock); -} - -/* - * Allocate zfs_fuid_info for tracking FUIDs created during - * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() - */ -zfs_fuid_info_t * -zfs_fuid_info_alloc(void) -{ - zfs_fuid_info_t *fuidp; - - fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); - list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), - offsetof(zfs_fuid_domain_t, z_next)); - list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), - offsetof(zfs_fuid_t, z_next)); - return (fuidp); -} - -/* - * Release all memory associated with zfs_fuid_info_t - */ -void -zfs_fuid_info_free(zfs_fuid_info_t *fuidp) -{ - zfs_fuid_t *zfuid; - zfs_fuid_domain_t *zdomain; - - while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { - list_remove(&fuidp->z_fuids, zfuid); - kmem_free(zfuid, sizeof (zfs_fuid_t)); - } - - if (fuidp->z_domain_table != NULL) - kmem_free(fuidp->z_domain_table, - (sizeof (char **)) * fuidp->z_domain_cnt); - - while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { - list_remove(&fuidp->z_domains, zdomain); - kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); - } - - kmem_free(fuidp, sizeof (zfs_fuid_info_t)); -} - -/* - * Check to see if id is a groupmember. If cred - * has ksid info then sidlist is checked first - * and if still not found then POSIX groups are checked - * - * Will use a straight FUID compare when possible. 
- */ -boolean_t -zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) -{ -#ifdef illumos - ksid_t *ksid = crgetsid(cr, KSID_GROUP); - ksidlist_t *ksidlist = crgetsidlist(cr); -#endif - uid_t gid; - -#ifdef illumos - if (ksid && ksidlist) { - int i; - ksid_t *ksid_groups; - uint32_t idx = FUID_INDEX(id); - uint32_t rid = FUID_RID(id); - - ksid_groups = ksidlist->ksl_sids; - - for (i = 0; i != ksidlist->ksl_nsid; i++) { - if (idx == 0) { - if (id != IDMAP_WK_CREATOR_GROUP_GID && - id == ksid_groups[i].ks_id) { - return (B_TRUE); - } - } else { - const char *domain; - - domain = zfs_fuid_find_by_idx(zfsvfs, idx); - ASSERT(domain != NULL); - - if (strcmp(domain, - IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) - return (B_FALSE); - - if ((strcmp(domain, - ksid_groups[i].ks_domain->kd_name) == 0) && - rid == ksid_groups[i].ks_rid) - return (B_TRUE); - } - } - } -#endif /* illumos */ - - /* - * Not found in ksidlist, check posix groups - */ - gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); - return (groupmember(gid, cr)); -} - -void -zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) -{ - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } -} -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ /dev/null @@ -1,7628 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. - * Copyright 2013 Martin Matuska . All rights reserved. - * Copyright 2014 Xin Li . All rights reserved. - * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Toomas Soome - * Copyright 2017 RackTop Systems. - * Copyright (c) 2018, loli10K . All rights reserved. - * Copyright (c) 2019 Datto Inc. - */ - -/* - * ZFS ioctls. - * - * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage - * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. 
- *
- * There are two ways that we handle ioctls: the legacy way where almost
- * all of the logic is in the ioctl callback, and the new way where most
- * of the marshalling is handled in the common entry point, zfsdev_ioctl().
- *
- * Non-legacy ioctls should be registered by calling
- * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked
- * from userland by lzc_ioctl().
- *
- * The registration arguments are as follows:
- *
- * const char *name
- * The name of the ioctl. This is used for history logging. If the
- * ioctl returns successfully (the callback returns 0), and allow_log
- * is true, then a history log entry will be recorded with the input &
- * output nvlists. The log entry can be printed with "zpool history -i".
- *
- * zfs_ioc_t ioc
- * The ioctl request number, which userland will pass to ioctl(2).
- * We want newer versions of libzfs and libzfs_core to run against
- * existing zfs kernel modules (i.e. a deferred reboot after an update).
- * Therefore the ioctl numbers cannot change from release to release.
- *
- * zfs_secpolicy_func_t *secpolicy
- * This function will be called before the zfs_ioc_func_t, to
- * determine if this operation is permitted. It should return EPERM
- * on failure, and 0 on success. Checks include determining if the
- * dataset is visible in this zone, and if the user has either all
- * zfs privileges in the zone (SYS_MOUNT), or has been granted permission
- * to do this operation on this dataset with "zfs allow".
- *
- * zfs_ioc_namecheck_t namecheck
- * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
- * name, a dataset name, or nothing. If the name is not well-formed,
- * the ioctl will fail and the callback will not be called.
- * Therefore, the callback can assume that the name is well-formed
- * (e.g. is null-terminated, doesn't have more than one '@' character,
- * doesn't have invalid characters).
- *
- * zfs_ioc_poolcheck_t pool_check
- * This specifies requirements on the pool state. If the pool does
- * not meet them (is suspended or is readonly), the ioctl will fail
- * and the callback will not be called. If any checks are specified
- * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
- * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
- * POOL_CHECK_READONLY).
- *
- * zfs_ioc_key_t *nvl_keys
- * The list of expected/allowable innvl input keys. This list is used
- * to validate the nvlist input to the ioctl.
- *
- * boolean_t smush_outnvlist
- * If smush_outnvlist is true, then the output is presumed to be a
- * list of errors, and it will be "smushed" down to fit into the
- * caller's buffer, by removing some entries and replacing them with a
- * single "N_MORE_ERRORS" entry indicating how many were removed. See
- * nvlist_smush() for details. If smush_outnvlist is false, and the
- * outnvlist does not fit into the userland-provided buffer, then the
- * ioctl will fail with ENOMEM.
- *
- * zfs_ioc_func_t *func
- * The callback function that will perform the operation.
- *
- * The callback should return 0 on success, or an error number on
- * failure. If the function fails, the userland ioctl will return -1,
- * and errno will be set to the callback's return value. The callback
- * will be called with the following arguments:
- *
- * const char *name
- * The name of the pool or dataset to operate on, from
- * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the
- * expected type (pool, dataset, or none).
- * - * nvlist_t *innvl - * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or - * NULL if no input nvlist was provided. Changes to this nvlist are - * ignored. If the input nvlist could not be deserialized, the - * ioctl will fail and the callback will not be called. - * - * nvlist_t *outnvl - * The output nvlist, initially empty. The callback can fill it in, - * and it will be returned to userland by serializing it into - * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization - * fails (e.g. because the caller didn't supply a large enough - * buffer), then the overall ioctl will fail. See the - * 'smush_nvlist' argument above for additional behaviors. - * - * There are two typical uses of the output nvlist: - * - To return state, e.g. property values. In this case, - * smush_outnvlist should be false. If the buffer was not large - * enough, the caller will reallocate a larger buffer and try - * the ioctl again. - * - * - To return multiple errors from an ioctl which makes on-disk - * changes. In this case, smush_outnvlist should be true. - * Ioctls which make on-disk modifications should generally not - * use the outnvl if they succeed, because the caller can not - * distinguish between the operation failing, and - * deserialization failing. - * - * - * IOCTL Interface Errors - * - * The following ioctl input errors can be returned: - * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel - * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel - * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing - * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type - */ - -#ifdef __FreeBSD__ -#include "opt_kstack_pages.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_namecheck.h" -#include "zfs_prop.h" -#include "zfs_deleg.h" -#include "zfs_comutil.h" -#include "zfs_ioctl_compat.h" - -#include "lua.h" -#include "lauxlib.h" - -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(x) nitems(x) -#endif - -static struct cdev *zfsdev; - -extern void zfs_init(void); -extern void zfs_fini(void); - -uint_t zfs_fsyncer_key; -extern uint_t rrw_tsd_key; -static uint_t zfs_allow_log_key; -extern uint_t zfs_geom_probe_vdev_key; - -typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); -typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); -typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); - -/* - * IOC Keys are used to document and validate user->kernel interface inputs. - * See zfs_keys_recv_new for an example declaration. Any key name that is not - * listed will be rejected as input. - * - * The keyname 'optional' is always allowed, and must be an nvlist if present. - * Arguments which older kernels can safely ignore can be placed under the - * "optional" key. 
- *
- * When adding new keys to an existing ioc for new functionality, consider:
- * 	- adding an entry into zfs_sysfs.c zfs_features[] list
- * 	- updating the libzfs_input_check.c test utility
- *
- * Note: in the ZK_WILDCARDLIST case, the name serves as documentation
- * for the expected name (bookmark, snapshot, property, etc) but there
- * is no validation in the preflight zfs_check_input_nvpairs() check.
- */
-typedef enum {
-	ZK_OPTIONAL = 1 << 0,		/* pair is optional */
-	ZK_WILDCARDLIST = 1 << 1,	/* one or more unspecified key names */
-} ioc_key_flag_t;
-
-/* DATA_TYPE_ANY is used when zkey_type can vary. */
-#define DATA_TYPE_ANY DATA_TYPE_UNKNOWN
-
-typedef struct zfs_ioc_key {
-	const char *zkey_name;
-	data_type_t zkey_type;
-	ioc_key_flag_t zkey_flags;
-} zfs_ioc_key_t;
-
-typedef enum {
-	NO_NAME,
-	POOL_NAME,
-	DATASET_NAME,
-	ENTITY_NAME
-} zfs_ioc_namecheck_t;
-
-typedef enum {
-	POOL_CHECK_NONE = 1 << 0,
-	POOL_CHECK_SUSPENDED = 1 << 1,
-	POOL_CHECK_READONLY = 1 << 2,
-} zfs_ioc_poolcheck_t;
-
-typedef struct zfs_ioc_vec {
-	zfs_ioc_legacy_func_t *zvec_legacy_func;
-	zfs_ioc_func_t *zvec_func;
-	zfs_secpolicy_func_t *zvec_secpolicy;
-	zfs_ioc_namecheck_t zvec_namecheck;
-	boolean_t zvec_allow_log;
-	zfs_ioc_poolcheck_t zvec_pool_check;
-	boolean_t zvec_smush_outnvlist;
-	const char *zvec_name;
-	const zfs_ioc_key_t *zvec_nvl_keys;
-	size_t zvec_nvl_key_count;
-} zfs_ioc_vec_t;
-
-/* This array is indexed by zfs_userquota_prop_t */
-static const char *userquota_perms[] = {
-	ZFS_DELEG_PERM_USERUSED,
-	ZFS_DELEG_PERM_USERQUOTA,
-	ZFS_DELEG_PERM_GROUPUSED,
-	ZFS_DELEG_PERM_GROUPQUOTA,
-};
-
-static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
-static int zfs_check_settable(const char *name, nvpair_t *property,
-    cred_t *cr);
-static int zfs_check_clearable(char *dataset, nvlist_t *props,
-    nvlist_t **errors);
-static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
-    boolean_t *);
-int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
-static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
-
-static void zfsdev_close(void *data);
-
-static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
-
-/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
-void
-__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
-{
-	const char *newfile;
-	char buf[512];
-	va_list adx;
-
-	/*
-	 * Get rid of annoying "../common/" prefix to filename.
- */ - newfile = strrchr(file, '/'); - if (newfile != NULL) { - newfile = newfile + 1; /* Get rid of leading / */ - } else { - newfile = file; - } - - va_start(adx, fmt); - (void) vsnprintf(buf, sizeof (buf), fmt, adx); - va_end(adx); - - /* - * To get this data, use the zfs-dprintf probe as so: - * dtrace -q -n 'zfs-dprintf \ - * /stringof(arg0) == "dbuf.c"/ \ - * {printf("%s: %s", stringof(arg1), stringof(arg3))}' - * arg0 = file name - * arg1 = function name - * arg2 = line number - * arg3 = message - */ - DTRACE_PROBE4(zfs__dprintf, - char *, newfile, char *, func, int, line, char *, buf); -} - -static void -history_str_free(char *buf) -{ - kmem_free(buf, HIS_MAX_RECORD_LEN); -} - -static char * -history_str_get(zfs_cmd_t *zc) -{ - char *buf; - - if (zc->zc_history == 0) - return (NULL); - - buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); - if (copyinstr((void *)(uintptr_t)zc->zc_history, - buf, HIS_MAX_RECORD_LEN, NULL) != 0) { - history_str_free(buf); - return (NULL); - } - - buf[HIS_MAX_RECORD_LEN -1] = '\0'; - - return (buf); -} - -/* - * Check to see if the named dataset is currently defined as bootable - */ -static boolean_t -zfs_is_bootfs(const char *name) -{ - objset_t *os; - - if (dmu_objset_hold(name, FTAG, &os) == 0) { - boolean_t ret; - ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); - dmu_objset_rele(os, FTAG); - return (ret); - } - return (B_FALSE); -} - -/* - * Return non-zero if the spa version is less than requested version. - */ -static int -zfs_earlier_version(const char *name, int version) -{ - spa_t *spa; - - if (spa_open(name, &spa, FTAG) == 0) { - if (spa_version(spa) < version) { - spa_close(spa, FTAG); - return (1); - } - spa_close(spa, FTAG); - } - return (0); -} - -/* - * Return TRUE if the ZPL version is less than requested version. - */ -static boolean_t -zpl_earlier_version(const char *name, int version) -{ - objset_t *os; - boolean_t rc = B_TRUE; - - if (dmu_objset_hold(name, FTAG, &os) == 0) { - uint64_t zplversion; - - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); - return (B_TRUE); - } - /* XXX reading from non-owned objset */ - if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) - rc = zplversion < version; - dmu_objset_rele(os, FTAG); - } - return (rc); -} - -static void -zfs_log_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *buf; - - if ((buf = history_str_get(zc)) == NULL) - return; - - if (spa_open(zc->zc_name, &spa, FTAG) == 0) { - if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) - (void) spa_history_log(spa, buf); - spa_close(spa, FTAG); - } - history_str_free(buf); -} - -/* - * Policy for top-level read operations (list pools). Requires no privileges, - * and can be used in the local zone, as there is no associated dataset. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (0); -} - -/* - * Policy for dataset read operations (list children, get statistics). Requires - * no privileges, but must be visible in the local zone. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - if (INGLOBALZONE(curthread) || - zone_dataset_visible(zc->zc_name, NULL)) - return (0); - - return (SET_ERROR(ENOENT)); -} - -static int -zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) -{ - int writable = 1; - - /* - * The dataset must be visible by this zone -- check this first - * so they don't see EPERM on something they shouldn't know about. 
- */ - if (!INGLOBALZONE(curthread) && - !zone_dataset_visible(dataset, &writable)) - return (SET_ERROR(ENOENT)); - - if (INGLOBALZONE(curthread)) { - /* - * If the fs is zoned, only root can access it from the - * global zone. - */ - if (secpolicy_zfs(cr) && zoned) - return (SET_ERROR(EPERM)); - } else { - /* - * If we are in a local zone, the 'zoned' property must be set. - */ - if (!zoned) - return (SET_ERROR(EPERM)); - - /* must be writable by this zone */ - if (!writable) - return (SET_ERROR(EPERM)); - } - return (0); -} - -static int -zfs_dozonecheck(const char *dataset, cred_t *cr) -{ - uint64_t zoned; - - if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) - return (SET_ERROR(ENOENT)); - - return (zfs_dozonecheck_impl(dataset, zoned, cr)); -} - -static int -zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) -{ - uint64_t zoned; - - if (dsl_prop_get_int_ds(ds, "jailed", &zoned)) - return (SET_ERROR(ENOENT)); - - return (zfs_dozonecheck_impl(dataset, zoned, cr)); -} - -static int -zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, - const char *perm, cred_t *cr) -{ - int error; - - error = zfs_dozonecheck_ds(name, ds, cr); - if (error == 0) { - error = secpolicy_zfs(cr); - if (error != 0) - error = dsl_deleg_access_impl(ds, perm, cr); - } - return (error); -} - -static int -zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) -{ - int error; - dsl_dataset_t *ds; - dsl_pool_t *dp; - - /* - * First do a quick check for root in the global zone, which - * is allowed to do all write_perms. This ensures that zfs_ioc_* - * will get to handle nonexistent datasets. - */ - if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0) - return (0); - - error = dsl_pool_hold(name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, name, FTAG, &ds); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); - - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); -} - -#ifdef SECLABEL -/* - * Policy for setting the security label property. - * - * Returns 0 for success, non-zero for access and other errors. - */ -static int -zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) -{ - char ds_hexsl[MAXNAMELEN]; - bslabel_t ds_sl, new_sl; - boolean_t new_default = FALSE; - uint64_t zoned; - int needed_priv = -1; - int error; - - /* First get the existing dataset label. */ - error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), - 1, sizeof (ds_hexsl), &ds_hexsl, NULL); - if (error != 0) - return (SET_ERROR(EPERM)); - - if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) - new_default = TRUE; - - /* The label must be translatable */ - if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) - return (SET_ERROR(EINVAL)); - - /* - * In a non-global zone, disallow attempts to set a label that - * doesn't match that of the zone; otherwise no other checks - * are needed. - */ - if (!INGLOBALZONE(curproc)) { - if (new_default || !blequal(&new_sl, CR_SL(CRED()))) - return (SET_ERROR(EPERM)); - return (0); - } - - /* - * For global-zone datasets (i.e., those whose zoned property is - * "off", verify that the specified new label is valid for the - * global zone. 
- */ - if (dsl_prop_get_integer(name, - zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) - return (SET_ERROR(EPERM)); - if (!zoned) { - if (zfs_check_global_label(name, strval) != 0) - return (SET_ERROR(EPERM)); - } - - /* - * If the existing dataset label is nondefault, check if the - * dataset is mounted (label cannot be changed while mounted). - * Get the zfsvfs; if there isn't one, then the dataset isn't - * mounted (or isn't a dataset, doesn't exist, ...). - */ - if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { - objset_t *os; - static char *setsl_tag = "setsl_tag"; - - /* - * Try to own the dataset; abort if there is any error, - * (e.g., already mounted, in use, or other error). - */ - error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, - setsl_tag, &os); - if (error != 0) - return (SET_ERROR(EPERM)); - - dmu_objset_disown(os, setsl_tag); - - if (new_default) { - needed_priv = PRIV_FILE_DOWNGRADE_SL; - goto out_check; - } - - if (hexstr_to_label(strval, &new_sl) != 0) - return (SET_ERROR(EPERM)); - - if (blstrictdom(&ds_sl, &new_sl)) - needed_priv = PRIV_FILE_DOWNGRADE_SL; - else if (blstrictdom(&new_sl, &ds_sl)) - needed_priv = PRIV_FILE_UPGRADE_SL; - } else { - /* dataset currently has a default label */ - if (!new_default) - needed_priv = PRIV_FILE_UPGRADE_SL; - } - -out_check: - if (needed_priv != -1) - return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); - return (0); -} -#endif /* SECLABEL */ - -static int -zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - cred_t *cr) -{ - char *strval; - - /* - * Check permissions for special properties. - */ - switch (prop) { - case ZFS_PROP_ZONED: - /* - * Disallow setting of 'zoned' from within a local zone. - */ - if (!INGLOBALZONE(curthread)) - return (SET_ERROR(EPERM)); - break; - - case ZFS_PROP_QUOTA: - case ZFS_PROP_FILESYSTEM_LIMIT: - case ZFS_PROP_SNAPSHOT_LIMIT: - if (!INGLOBALZONE(curthread)) { - uint64_t zoned; - char setpoint[ZFS_MAX_DATASET_NAME_LEN]; - /* - * Unprivileged users are allowed to modify the - * limit on things *under* (ie. contained by) - * the thing they own. - */ - if (dsl_prop_get_integer(dsname, "jailed", &zoned, - setpoint)) - return (SET_ERROR(EPERM)); - if (!zoned || strlen(dsname) <= strlen(setpoint)) - return (SET_ERROR(EPERM)); - } - break; - - case ZFS_PROP_MLSLABEL: -#ifdef SECLABEL - if (!is_system_labeled()) - return (SET_ERROR(EPERM)); - - if (nvpair_value_string(propval, &strval) == 0) { - int err; - - err = zfs_set_slabel_policy(dsname, strval, CRED()); - if (err != 0) - return (err); - } -#else - return (EOPNOTSUPP); -#endif - break; - } - - return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int error; - - error = zfs_dozonecheck(zc->zc_name, cr); - if (error != 0) - return (error); - - /* - * permission to set permissions will be evaluated later in - * dsl_deleg_can_allow() - */ - return (0); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_ROLLBACK, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - char *cp; - int error; - - /* - * Generate the current snapshot name from the given objsetid, then - * use that name for the secpolicy/zone checks. 
- */ - cp = strchr(zc->zc_name, '@'); - if (cp == NULL) - return (SET_ERROR(EINVAL)); - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - dsl_dataset_name(ds, zc->zc_name); - - error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, - ZFS_DELEG_PERM_SEND, cr); - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - vnode_t *vp; - int error; - - if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NO_FOLLOW, NULL, &vp)) != 0) - return (error); - - /* Now make sure mntpnt and dataset are ZFS */ - - if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || - (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), - zc->zc_name) != 0)) { - VN_RELE(vp); - return (SET_ERROR(EPERM)); - } - - VN_RELE(vp); - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_SHARE, cr)); -} - -int -zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - if (!INGLOBALZONE(curthread)) - return (SET_ERROR(EPERM)); - - if (secpolicy_nfs(cr) == 0) { - return (0); - } else { - return (zfs_secpolicy_deleg_share(zc, innvl, cr)); - } -} - -int -zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - if (!INGLOBALZONE(curthread)) - return (SET_ERROR(EPERM)); - - if (secpolicy_smb(cr) == 0) { - return (0); - } else { - return (zfs_secpolicy_deleg_share(zc, innvl, cr)); - } -} - -static int -zfs_get_parent(const char *datasetname, char *parent, int parentsize) -{ - char *cp; - - /* - * Remove the @bla or /bla from the end of the name to get the parent. - */ - (void) strncpy(parent, datasetname, parentsize); - cp = strrchr(parent, '@'); - if (cp != NULL) { - cp[0] = '\0'; - } else { - cp = strrchr(parent, '/'); - if (cp == NULL) - return (SET_ERROR(ENOENT)); - cp[0] = '\0'; - } - - return (0); -} - -int -zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) -{ - int error; - - if ((error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); -} - -/* - * Destroying snapshots with delegated permissions requires - * descendant mount and destroy permissions. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvlist_t *snaps; - nvpair_t *pair, *nextpair; - int error = 0; - - snaps = fnvlist_lookup_nvlist(innvl, "snaps"); - - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nextpair) { - nextpair = nvlist_next_nvpair(snaps, pair); - error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); - if (error == ENOENT) { - /* - * Ignore any snapshots that don't exist (we consider - * them "already destroyed"). Remove the name from the - * nvl here in case the snapshot is created between - * now and when we try to destroy it (in which case - * we don't want to destroy it since we haven't - * checked for permission). 
- */ - fnvlist_remove_nvpair(snaps, pair); - error = 0; - } - if (error != 0) - break; - } - - return (error); -} - -int -zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) -{ - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - int error; - - if ((error = zfs_secpolicy_write_perms(from, - ZFS_DELEG_PERM_RENAME, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(from, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - if ((error = zfs_get_parent(to, parentname, - sizeof (parentname))) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_CREATE, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - char *at = NULL; - char *pound; - int error; - - if ((pound = strchr(zc->zc_name, '#')) != NULL) { - *pound = '\0'; - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_RENAME, cr); - if (error == 0) { - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_BOOKMARK, cr); - } - *pound = '#'; - return (error); - } - - if ((zc->zc_cookie & 1) != 0) { - /* - * This is recursive rename, so the starting snapshot might - * not exist. Check file system or volume permission instead. - */ - at = strchr(zc->zc_name, '@'); - if (at == NULL) - return (EINVAL); - *at = '\0'; - } - - error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr); - - if (at != NULL) - *at = '@'; - - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - dsl_pool_t *dp; - dsl_dataset_t *clone; - int error; - - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_PROMOTE, cr); - if (error != 0) - return (error); - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); - - if (error == 0) { - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_t *origin = NULL; - dsl_dir_t *dd; - dd = clone->ds_dir; - - error = dsl_dataset_hold_obj(dd->dd_pool, - dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); - if (error != 0) { - dsl_dataset_rele(clone, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, - ZFS_DELEG_PERM_MOUNT, cr); - - dsl_dataset_name(origin, parentname); - if (error == 0) { - error = zfs_secpolicy_write_perms_ds(parentname, origin, - ZFS_DELEG_PERM_PROMOTE, cr); - } - dsl_dataset_rele(clone, FTAG); - dsl_dataset_rele(origin, FTAG); - } - dsl_pool_rele(dp, FTAG); - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int error; - - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_RECEIVE, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr)) != 0) - return (error); - - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_CREATE, cr)); -} - -int -zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_SNAPSHOT, cr)); -} - -/* - * Check for permission to create each snapshot in the nvlist. 
- */ -/* ARGSUSED */ -static int -zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvlist_t *snaps; - int error; - nvpair_t *pair; - - snaps = fnvlist_lookup_nvlist(innvl, "snaps"); - - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - char *name = nvpair_name(pair); - char *atp = strchr(name, '@'); - - if (atp == NULL) { - error = SET_ERROR(EINVAL); - break; - } - *atp = '\0'; - error = zfs_secpolicy_snapshot_perms(name, cr); - *atp = '@'; - if (error != 0) - break; - } - return (error); -} - -/* - * Check for permission to create each bookmark in the nvlist. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int error = 0; - - for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *name = nvpair_name(pair); - char *hashp = strchr(name, '#'); - - if (hashp == NULL) { - error = SET_ERROR(EINVAL); - break; - } - *hashp = '\0'; - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_BOOKMARK, cr); - *hashp = '#'; - if (error != 0) - break; - } - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_REMAP, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvpair_t *pair, *nextpair; - int error = 0; - - for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; - pair = nextpair) { - char *name = nvpair_name(pair); - char *hashp = strchr(name, '#'); - nextpair = nvlist_next_nvpair(innvl, pair); - - if (hashp == NULL) { - error = SET_ERROR(EINVAL); - break; - } - - *hashp = '\0'; - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_DESTROY, cr); - *hashp = '#'; - if (error == ENOENT) { - /* - * Ignore any filesystems that don't exist (we consider - * their bookmarks "already destroyed"). Remove - * the name from the nvl here in case the filesystem - * is created between now and when we try to destroy - * the bookmark (in which case we don't want to - * destroy it since we haven't checked for permission). - */ - fnvlist_remove_nvpair(innvl, pair); - error = 0; - } - if (error != 0) - break; - } - - return (error); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - /* - * Even root must have a proper TSD so that we know what pool - * to log to. - */ - if (tsd_get(zfs_allow_log_key) == NULL) - return (SET_ERROR(EPERM)); - return (0); -} - -static int -zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - int error; - char *origin; - - if ((error = zfs_get_parent(zc->zc_name, parentname, - sizeof (parentname))) != 0) - return (error); - - if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && - (error = zfs_secpolicy_write_perms(origin, - ZFS_DELEG_PERM_CLONE, cr)) != 0) - return (error); - - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_CREATE, cr)) != 0) - return (error); - - return (zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_MOUNT, cr)); -} - -/* - * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires - * SYS_CONFIG privilege, which is not available in a local zone. 
- */ -/* ARGSUSED */ -static int -zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - if (secpolicy_sys_config(cr, B_FALSE) != 0) - return (SET_ERROR(EPERM)); - - return (0); -} - -/* - * Policy for object to name lookups. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int error; - - if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) - return (0); - - error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); - return (error); -} - -/* - * Policy for fault injection. Requires all privileges. - */ -/* ARGSUSED */ -static int -zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (secpolicy_zinject(cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); - - if (prop == ZPROP_INVAL) { - if (!zfs_prop_user(zc->zc_value)) - return (SET_ERROR(EINVAL)); - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_USERPROP, cr)); - } else { - return (zfs_secpolicy_setprop(zc->zc_name, prop, - NULL, cr)); - } -} - -static int -zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int err = zfs_secpolicy_read(zc, innvl, cr); - if (err) - return (err); - - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) - return (SET_ERROR(EINVAL)); - - if (zc->zc_value[0] == 0) { - /* - * They are asking about a posix uid/gid. If it's - * themself, allow it. - */ - if (zc->zc_objset_type == ZFS_PROP_USERUSED || - zc->zc_objset_type == ZFS_PROP_USERQUOTA) { - if (zc->zc_guid == crgetuid(cr)) - return (0); - } else { - if (groupmember(zc->zc_guid, cr)) - return (0); - } - } - - return (zfs_secpolicy_write_perms(zc->zc_name, - userquota_perms[zc->zc_objset_type], cr)); -} - -static int -zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - int err = zfs_secpolicy_read(zc, innvl, cr); - if (err) - return (err); - - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) - return (SET_ERROR(EINVAL)); - - return (zfs_secpolicy_write_perms(zc->zc_name, - userquota_perms[zc->zc_objset_type], cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, - NULL, cr)); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvpair_t *pair; - nvlist_t *holds; - int error; - - holds = fnvlist_lookup_nvlist(innvl, "holds"); - - for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - char fsname[ZFS_MAX_DATASET_NAME_LEN]; - error = dmu_fsname(nvpair_name(pair), fsname); - if (error != 0) - return (error); - error = zfs_secpolicy_write_perms(fsname, - ZFS_DELEG_PERM_HOLD, cr); - if (error != 0) - return (error); - } - return (0); -} - -/* ARGSUSED */ -static int -zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - nvpair_t *pair; - int error; - - for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; - pair = nvlist_next_nvpair(innvl, pair)) { - char fsname[ZFS_MAX_DATASET_NAME_LEN]; - error = dmu_fsname(nvpair_name(pair), fsname); - if (error != 0) - return (error); - error = zfs_secpolicy_write_perms(fsname, - ZFS_DELEG_PERM_RELEASE, cr); - if (error != 0) - return (error); - } - return (0); -} - -/* - * Policy for allowing temporary snapshots to be taken or released - */ -static int -zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, 
nvlist_t *innvl, cred_t *cr) -{ - /* - * A temporary snapshot is the same as a snapshot, - * hold, destroy and release all rolled into one. - * Delegated diff alone is sufficient that we allow this. - */ - int error; - - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_DIFF, cr)) == 0) - return (0); - - error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); - if (innvl != NULL) { - if (error == 0) - error = zfs_secpolicy_hold(zc, innvl, cr); - if (error == 0) - error = zfs_secpolicy_release(zc, innvl, cr); - if (error == 0) - error = zfs_secpolicy_destroy(zc, innvl, cr); - } - return (error); -} - -/* - * Returns the nvlist as specified by the user in the zfs_cmd_t. - */ -static int -get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) -{ - char *packed; - int error; - nvlist_t *list = NULL; - - /* - * Read in and unpack the user-supplied nvlist. - */ - if (size == 0) - return (SET_ERROR(EINVAL)); - - packed = kmem_alloc(size, KM_SLEEP); - - if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, - iflag)) != 0) { - kmem_free(packed, size); - return (SET_ERROR(EFAULT)); - } - - if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { - kmem_free(packed, size); - return (error); - } - - kmem_free(packed, size); - - *nvp = list; - return (0); -} - -/* - * Reduce the size of this nvlist until it can be serialized in 'max' bytes. - * Entries will be removed from the end of the nvlist, and one int32 entry - * named "N_MORE_ERRORS" will be added indicating how many entries were - * removed. - */ -static int -nvlist_smush(nvlist_t *errors, size_t max) -{ - size_t size; - - size = fnvlist_size(errors); - - if (size > max) { - nvpair_t *more_errors; - int n = 0; - - if (max < 1024) - return (SET_ERROR(ENOMEM)); - - fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); - more_errors = nvlist_prev_nvpair(errors, NULL); - - do { - nvpair_t *pair = nvlist_prev_nvpair(errors, - more_errors); - fnvlist_remove_nvpair(errors, pair); - n++; - size = fnvlist_size(errors); - } while (size > max); - - fnvlist_remove_nvpair(errors, more_errors); - fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); - ASSERT3U(fnvlist_size(errors), <=, max); - } - - return (0); -} - -static int -put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) -{ - char *packed = NULL; - int error = 0; - size_t size; - - size = fnvlist_size(nvl); - - if (size > zc->zc_nvlist_dst_size) { - /* - * Solaris returns ENOMEM here, because even if an error is - * returned from an ioctl(2), new zc_nvlist_dst_size will be - * passed to the userland. This is not the case for FreeBSD. - * We need to return 0, so the kernel will copy the - * zc_nvlist_dst_size back and the userland can discover that a - * bigger buffer is needed. 
- */ - error = 0; - } else { - packed = fnvlist_pack(nvl, &size); - if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size, zc->zc_iflags) != 0) - error = SET_ERROR(EFAULT); - fnvlist_pack_free(packed, size); - } - - zc->zc_nvlist_dst_size = size; - zc->zc_nvlist_dst_filled = B_TRUE; - return (error); -} - -int -getzfsvfs_impl(objset_t *os, vfs_t **vfsp) -{ - zfsvfs_t *zfvp; - int error = 0; - - if (dmu_objset_type(os) != DMU_OST_ZFS) { - return (SET_ERROR(EINVAL)); - } - - mutex_enter(&os->os_user_ptr_lock); - zfvp = dmu_objset_get_user(os); - if (zfvp) { - *vfsp = zfvp->z_vfs; - vfs_ref(zfvp->z_vfs); - } else { - error = SET_ERROR(ESRCH); - } - mutex_exit(&os->os_user_ptr_lock); - return (error); -} - -int -getzfsvfs(const char *dsname, zfsvfs_t **zfvp) -{ - objset_t *os; - vfs_t *vfsp; - int error; - - error = dmu_objset_hold(dsname, FTAG, &os); - if (error != 0) - return (error); - error = getzfsvfs_impl(os, &vfsp); - dmu_objset_rele(os, FTAG); - if (error != 0) - return (error); - - error = vfs_busy(vfsp, 0); - vfs_rel(vfsp); - if (error != 0) { - *zfvp = NULL; - error = SET_ERROR(ESRCH); - } else { - *zfvp = vfsp->vfs_data; - } - return (error); -} - -/* - * Find a zfsvfs_t for a mounted filesystem, or create our own, in which - * case its z_vfs will be NULL, and it will be opened as the owner. - * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, - * which prevents all vnode ops from running. - */ -static int -zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) -{ - int error = 0; - - if (getzfsvfs(name, zfvp) != 0) - error = zfsvfs_create(name, zfvp); - if (error == 0) { - rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : - RW_READER, tag); -#ifdef illumos - if ((*zfvp)->z_unmounted) { - /* - * XXX we could probably try again, since the unmounting - * thread should be just about to disassociate the - * objset from the zfsvfs. - */ - rrm_exit(&(*zfvp)->z_teardown_lock, tag); - return (SET_ERROR(EBUSY)); - } -#else - /* - * vfs_busy() ensures that the filesystem is not and - * can not be unmounted. 
- */ - ASSERT(!(*zfvp)->z_unmounted); -#endif - } - return (error); -} - -static void -zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) -{ - rrm_exit(&zfsvfs->z_teardown_lock, tag); - - if (zfsvfs->z_vfs) { -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#else - vfs_unbusy(zfsvfs->z_vfs); -#endif - } else { - dmu_objset_disown(zfsvfs->z_os, zfsvfs); - zfsvfs_free(zfsvfs); - } -} - -static int -zfs_ioc_pool_create(zfs_cmd_t *zc) -{ - int error; - nvlist_t *config, *props = NULL; - nvlist_t *rootprops = NULL; - nvlist_t *zplprops = NULL; - char *spa_name = zc->zc_name; - - if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &config)) - return (error); - - if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props))) { - nvlist_free(config); - return (error); - } - - if (props) { - nvlist_t *nvl = NULL; - uint64_t version = SPA_VERSION; - char *tname; - - (void) nvlist_lookup_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); - if (!SPA_VERSION_IS_SUPPORTED(version)) { - error = SET_ERROR(EINVAL); - goto pool_props_bad; - } - (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); - if (nvl) { - error = nvlist_dup(nvl, &rootprops, KM_SLEEP); - if (error != 0) { - nvlist_free(config); - nvlist_free(props); - return (error); - } - (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); - } - VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); - error = zfs_fill_zplprops_root(version, rootprops, - zplprops, NULL); - if (error != 0) - goto pool_props_bad; - - if (nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) - spa_name = tname; - } - - error = spa_create(zc->zc_name, config, props, zplprops); - - /* - * Set the remaining root properties - */ - if (!error && (error = zfs_set_prop_nvlist(spa_name, - ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) - (void) spa_destroy(spa_name); - -pool_props_bad: - nvlist_free(rootprops); - nvlist_free(zplprops); - nvlist_free(config); - nvlist_free(props); - - return (error); -} - -static int -zfs_ioc_pool_destroy(zfs_cmd_t *zc) -{ - int error; - zfs_log_history(zc); - error = spa_destroy(zc->zc_name); -#ifndef __FreeBSD__ - if (error == 0) - zvol_remove_minors(zc->zc_name); -#endif - return (error); -} - -static int -zfs_ioc_pool_import(zfs_cmd_t *zc) -{ - nvlist_t *config, *props = NULL; - uint64_t guid; - int error; - - if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &config)) != 0) - return (error); - - if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props))) { - nvlist_free(config); - return (error); - } - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || - guid != zc->zc_guid) - error = SET_ERROR(EINVAL); - else - error = spa_import(zc->zc_name, config, props, zc->zc_cookie); - - if (zc->zc_nvlist_dst != 0) { - int err; - - if ((err = put_nvlist(zc, config)) != 0) - error = err; - } - - nvlist_free(config); - - nvlist_free(props); - - return (error); -} - -static int -zfs_ioc_pool_export(zfs_cmd_t *zc) -{ - int error; - boolean_t force = (boolean_t)zc->zc_cookie; - boolean_t hardforce = (boolean_t)zc->zc_guid; - - zfs_log_history(zc); - error = spa_export(zc->zc_name, NULL, force, hardforce); -#ifndef __FreeBSD__ - if (error == 0) - zvol_remove_minors(zc->zc_name); -#endif - return (error); -} - -static int -zfs_ioc_pool_configs(zfs_cmd_t *zc) -{ - nvlist_t *configs; - int error; - - 
if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (SET_ERROR(EEXIST)); - - error = put_nvlist(zc, configs); - - nvlist_free(configs); - - return (error); -} - -/* - * inputs: - * zc_name name of the pool - * - * outputs: - * zc_cookie real errno - * zc_nvlist_dst config nvlist - * zc_nvlist_dst_size size of config nvlist - */ -static int -zfs_ioc_pool_stats(zfs_cmd_t *zc) -{ - nvlist_t *config; - int error; - int ret = 0; - - error = spa_get_stats(zc->zc_name, &config, zc->zc_value, - sizeof (zc->zc_value)); - - if (config != NULL) { - ret = put_nvlist(zc, config); - nvlist_free(config); - - /* - * The config may be present even if 'error' is non-zero. - * In this case we return success, and preserve the real errno - * in 'zc_cookie'. - */ - zc->zc_cookie = error; - } else { - ret = error; - } - - return (ret); -} - -/* - * Try to import the given pool, returning pool stats as appropriate so that - * user land knows which devices are available and overall pool health. - */ -static int -zfs_ioc_pool_tryimport(zfs_cmd_t *zc) -{ - nvlist_t *tryconfig, *config; - int error; - - if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &tryconfig)) != 0) - return (error); - - config = spa_tryimport(tryconfig); - - nvlist_free(tryconfig); - - if (config == NULL) - return (SET_ERROR(EINVAL)); - - error = put_nvlist(zc, config); - nvlist_free(config); - - return (error); -} - -/* - * inputs: - * zc_name name of the pool - * zc_cookie scan func (pool_scan_func_t) - * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) - */ -static int -zfs_ioc_pool_scan(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) - return (SET_ERROR(EINVAL)); - - if (zc->zc_flags == POOL_SCRUB_PAUSE) - error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); - else if (zc->zc_cookie == POOL_SCAN_NONE) - error = spa_scan_stop(spa); - else - error = spa_scan(spa, zc->zc_cookie); - - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_freeze(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error == 0) { - spa_freeze(spa); - spa_close(spa, FTAG); - } - return (error); -} - -static int -zfs_ioc_pool_upgrade(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (zc->zc_cookie < spa_version(spa) || - !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { - spa_close(spa, FTAG); - return (SET_ERROR(EINVAL)); - } - - spa_upgrade(spa, zc->zc_cookie); - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_get_history(zfs_cmd_t *zc) -{ - spa_t *spa; - char *hist_buf; - uint64_t size; - int error; - - if ((size = zc->zc_history_len) == 0) - return (SET_ERROR(EINVAL)); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - hist_buf = kmem_alloc(size, KM_SLEEP); - if ((error = spa_history_get(spa, &zc->zc_history_offset, - &zc->zc_history_len, hist_buf)) == 0) { - error = ddi_copyout(hist_buf, - (void *)(uintptr_t)zc->zc_history, - zc->zc_history_len, zc->zc_iflags); - } - - spa_close(spa, FTAG); - kmem_free(hist_buf, size); - return (error); -} - -static int -zfs_ioc_pool_reguid(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error == 0) { - error 
= spa_change_guid(spa); - spa_close(spa, FTAG); - } - return (error); -} - -static int -zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) -{ - return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_obj object to find - * - * outputs: - * zc_value name of object - */ -static int -zfs_ioc_obj_to_path(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - - /* XXX reading from objset not owned */ - if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) - return (error); - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(EINVAL)); - } - error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, - sizeof (zc->zc_value)); - dmu_objset_rele(os, FTAG); - - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_obj object to find - * - * outputs: - * zc_stat stats on object - * zc_value path to object - */ -static int -zfs_ioc_obj_to_stats(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - - /* XXX reading from objset not owned */ - if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) - return (error); - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(EINVAL)); - } - error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, - sizeof (zc->zc_value)); - dmu_objset_rele(os, FTAG); - - return (error); -} - -static int -zfs_ioc_vdev_add(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - nvlist_t *config, **l2cache, **spares; - uint_t nl2cache = 0, nspares = 0; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &config); - (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache); - - (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, - &spares, &nspares); - -#ifdef illumos - /* - * A root pool with concatenated devices is not supported. - * Thus, can not add a device to a root pool. - * - * Intent log device can not be added to a rootpool because - * during mountroot, zil is replayed, a seperated log device - * can not be accessed during the mountroot time. - * - * l2cache and spare devices are ok to be added to a rootpool. 
- */ - if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { - nvlist_free(config); - spa_close(spa, FTAG); - return (SET_ERROR(EDOM)); - } -#endif /* illumos */ - - if (error == 0) { - error = spa_vdev_add(spa, config); - nvlist_free(config); - } - spa_close(spa, FTAG); - return (error); -} - -/* - * inputs: - * zc_name name of the pool - * zc_guid guid of vdev to remove - * zc_cookie cancel removal - */ -static int -zfs_ioc_vdev_remove(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - if (zc->zc_cookie != 0) { - error = spa_vdev_remove_cancel(spa); - } else { - error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); - } - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_set_state(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - vdev_state_t newstate = VDEV_STATE_UNKNOWN; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - switch (zc->zc_cookie) { - case VDEV_STATE_ONLINE: - error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); - break; - - case VDEV_STATE_OFFLINE: - error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); - break; - - case VDEV_STATE_FAULTED: - if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && - zc->zc_obj != VDEV_AUX_EXTERNAL) - zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; - - error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); - break; - - case VDEV_STATE_DEGRADED: - if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && - zc->zc_obj != VDEV_AUX_EXTERNAL) - zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; - - error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); - break; - - default: - error = SET_ERROR(EINVAL); - } - zc->zc_cookie = newstate; - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_attach(zfs_cmd_t *zc) -{ - spa_t *spa; - int replacing = zc->zc_cookie; - nvlist_t *config; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &config)) == 0) { - error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); - nvlist_free(config); - } - - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_detach(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); - - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_vdev_split(zfs_cmd_t *zc) -{ - spa_t *spa; - nvlist_t *config, *props = NULL; - int error; - boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, &config)) { - spa_close(spa, FTAG); - return (error); - } - - if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props))) { - spa_close(spa, FTAG); - nvlist_free(config); - return (error); - } - - error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); - - spa_close(spa, FTAG); - - nvlist_free(config); - nvlist_free(props); - - return (error); -} - -static int -zfs_ioc_vdev_setpath(zfs_cmd_t *zc) -{ - spa_t *spa; - char *path = zc->zc_value; - uint64_t guid = zc->zc_guid; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - error = spa_vdev_setpath(spa, guid, path); - spa_close(spa, FTAG); - return (error); -} - -static int 
-zfs_ioc_vdev_setfru(zfs_cmd_t *zc) -{ - spa_t *spa; - char *fru = zc->zc_value; - uint64_t guid = zc->zc_guid; - int error; - - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) - return (error); - - error = spa_vdev_setfru(spa, guid, fru); - spa_close(spa, FTAG); - return (error); -} - -static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) -{ - int error = 0; - nvlist_t *nv; - - dmu_objset_fast_stat(os, &zc->zc_objset_stats); - - if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_all(os, &nv)) == 0) { - dmu_objset_stats(os, nv); - /* - * NB: zvol_get_stats() will read the objset contents, - * which we aren't supposed to do with a - * DS_MODE_USER hold, because it could be - * inconsistent. So this is a bit of a workaround... - * XXX reading with out owning - */ - if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { - error = zvol_get_stats(os, nv); - if (error == EIO) - return (error); - VERIFY0(error); - } - error = put_nvlist(zc, nv); - nvlist_free(nv); - } - - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - */ -static int -zfs_ioc_objset_stats(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error == 0) { - error = zfs_ioc_objset_stats_impl(zc, os); - dmu_objset_rele(os, FTAG); - } - - if (error == ENOMEM) - error = 0; - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: - * zc_nvlist_dst received property nvlist - * zc_nvlist_dst_size size of received property nvlist - * - * Gets received properties (distinct from local properties on or after - * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from - * local property values. - */ -static int -zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) -{ - int error = 0; - nvlist_t *nv; - - /* - * Without this check, we would return local property values if the - * caller has not already received properties on or after - * SPA_VERSION_RECVD_PROPS. - */ - if (!dsl_prop_get_hasrecvd(zc->zc_name)) - return (SET_ERROR(ENOTSUP)); - - if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { - error = put_nvlist(zc, nv); - nvlist_free(nv); - } - - return (error); -} - -static int -nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) -{ - uint64_t value; - int error; - - /* - * zfs_get_zplprop() will either find a value or give us - * the default value (if there is one). - */ - if ((error = zfs_get_zplprop(os, prop, &value)) != 0) - return (error); - VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); - return (0); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for zpl property nvlist - * - * outputs: - * zc_nvlist_dst zpl property nvlist - * zc_nvlist_dst_size size of zpl property nvlist - */ -static int -zfs_ioc_objset_zplprops(zfs_cmd_t *zc) -{ - objset_t *os; - int err; - - /* XXX reading without owning */ - if (err = dmu_objset_hold(zc->zc_name, FTAG, &os)) - return (err); - - dmu_objset_fast_stat(os, &zc->zc_objset_stats); - - /* - * NB: nvl_add_zplprop() will read the objset contents, - * which we aren't supposed to do with a DS_MODE_USER - * hold, because it could be inconsistent. 
- */ - if (zc->zc_nvlist_dst != 0 && - !zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZFS) { - nvlist_t *nv; - - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && - (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && - (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && - (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) - err = put_nvlist(zc, nv); - nvlist_free(nv); - } else { - err = SET_ERROR(ENOENT); - } - dmu_objset_rele(os, FTAG); - return (err); -} - -boolean_t -dataset_name_hidden(const char *name) -{ - /* - * Skip over datasets that are not visible in this zone, - * internal datasets (which have a $ in their name), and - * temporary datasets (which have a % in their name). - */ - if (strchr(name, '$') != NULL) - return (B_TRUE); - if (strchr(name, '%') != NULL) - return (B_TRUE); - if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL)) - return (B_TRUE); - return (B_FALSE); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_cookie zap cursor - * zc_nvlist_src iteration range nvlist - * zc_nvlist_src_size size of iteration range nvlist - * - * outputs: - * zc_name name of next filesystem - * zc_cookie zap cursor - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - */ -static int -zfs_ioc_dataset_list_next(zfs_cmd_t *zc) -{ - objset_t *os; - int error; - char *p; - size_t orig_len = strlen(zc->zc_name); - -top: - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { - if (error == ENOENT) - error = SET_ERROR(ESRCH); - return (error); - } - - p = strrchr(zc->zc_name, '/'); - if (p == NULL || p[1] != '\0') - (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); - p = zc->zc_name + strlen(zc->zc_name); - - do { - error = dmu_dir_list_next(os, - sizeof (zc->zc_name) - (p - zc->zc_name), p, - NULL, &zc->zc_cookie); - if (error == ENOENT) - error = SET_ERROR(ESRCH); - } while (error == 0 && dataset_name_hidden(zc->zc_name)); - dmu_objset_rele(os, FTAG); - - /* - * If it's an internal dataset (ie. with a '$' in its name), - * don't try to get stats for it, otherwise we'll return ENOENT. - */ - if (error == 0 && strchr(zc->zc_name, '$') == NULL) { - error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - if (error == ENOENT) { - /* We lost a race with destroy, get the next one. */ - zc->zc_name[orig_len] = '\0'; - goto top; - } - } - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_cookie zap cursor - * zc_nvlist_dst_size size of buffer for property nvlist - * zc_simple when set, only name is requested - * - * outputs: - * zc_name name of next snapshot - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - */ -static int -zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) -{ - int error; - objset_t *os, *ossnap; - dsl_dataset_t *ds; - uint64_t min_txg = 0, max_txg = 0; - - if (zc->zc_nvlist_src_size != 0) { - nvlist_t *props = NULL; - error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props); - if (error != 0) - return (error); - (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, - &min_txg); - (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, - &max_txg); - nvlist_free(props); - } - - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error != 0) { - return (error == ENOENT ? 
ESRCH : error); - } - - /* - * A dataset name of maximum length cannot have any snapshots, - * so exit immediately. - */ - if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= - ZFS_MAX_DATASET_NAME_LEN) { - dmu_objset_rele(os, FTAG); - return (SET_ERROR(ESRCH)); - } - - while (error == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - error = SET_ERROR(EINTR); - break; - } - - error = dmu_snapshot_list_next(os, - sizeof (zc->zc_name) - strlen(zc->zc_name), - zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, - &zc->zc_cookie, NULL); - if (error == ENOENT) { - error = SET_ERROR(ESRCH); - break; - } else if (error != 0) { - break; - } - - error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, - FTAG, &ds); - if (error != 0) - break; - - if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || - (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { - dsl_dataset_rele(ds, FTAG); - /* undo snapshot name append */ - *(strchr(zc->zc_name, '@') + 1) = '\0'; - /* skip snapshot */ - continue; - } - - if (zc->zc_simple) { - dsl_dataset_rele(ds, FTAG); - break; - } - - if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { - dsl_dataset_rele(ds, FTAG); - break; - } - if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { - dsl_dataset_rele(ds, FTAG); - break; - } - dsl_dataset_rele(ds, FTAG); - break; - } - - dmu_objset_rele(os, FTAG); - /* if we failed, undo the @ that we tacked on to zc_name */ - if (error != 0) - *strchr(zc->zc_name, '@') = '\0'; - return (error); -} - -static int -zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) -{ - const char *propname = nvpair_name(pair); - uint64_t *valary; - unsigned int vallen; - const char *domain; - char *dash; - zfs_userquota_prop_t type; - uint64_t rid; - uint64_t quota; - zfsvfs_t *zfsvfs; - int err; - - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) != 0) - return (SET_ERROR(EINVAL)); - } - - /* - * A correctly constructed propname is encoded as - * userquota@-. - */ - if ((dash = strchr(propname, '-')) == NULL || - nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || - vallen != 3) - return (SET_ERROR(EINVAL)); - - domain = dash + 1; - type = valary[0]; - rid = valary[1]; - quota = valary[2]; - - err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); - if (err == 0) { - err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); - zfsvfs_rele(zfsvfs, FTAG); - } - - return (err); -} - -/* - * If the named property is one that has a special function to set its value, - * return 0 on success and a positive error code on failure; otherwise if it is - * not one of the special properties handled by this function, return -1. - * - * XXX: It would be better for callers of the property interface if we handled - * these special cases in dsl_prop.c (in the dsl layer). 
- */ -static int -zfs_prop_set_special(const char *dsname, zprop_source_t source, - nvpair_t *pair) -{ - const char *propname = nvpair_name(pair); - zfs_prop_t prop = zfs_name_to_prop(propname); - uint64_t intval; - int err = -1; - - if (prop == ZPROP_INVAL) { - if (zfs_prop_userquota(propname)) - return (zfs_prop_set_userquota(dsname, pair)); - return (-1); - } - - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); - } - - if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) - return (-1); - - VERIFY(0 == nvpair_value_uint64(pair, &intval)); - - switch (prop) { - case ZFS_PROP_QUOTA: - err = dsl_dir_set_quota(dsname, source, intval); - break; - case ZFS_PROP_REFQUOTA: - err = dsl_dataset_set_refquota(dsname, source, intval); - break; - case ZFS_PROP_FILESYSTEM_LIMIT: - case ZFS_PROP_SNAPSHOT_LIMIT: - if (intval == UINT64_MAX) { - /* clearing the limit, just do it */ - err = 0; - } else { - err = dsl_dir_activate_fs_ss_limit(dsname); - } - /* - * Set err to -1 to force the zfs_set_prop_nvlist code down the - * default path to set the value in the nvlist. - */ - if (err == 0) - err = -1; - break; - case ZFS_PROP_RESERVATION: - err = dsl_dir_set_reservation(dsname, source, intval); - break; - case ZFS_PROP_REFRESERVATION: - err = dsl_dataset_set_refreservation(dsname, source, intval); - break; - case ZFS_PROP_VOLSIZE: - err = zvol_set_volsize(dsname, intval); - break; - case ZFS_PROP_VERSION: - { - zfsvfs_t *zfsvfs; - - if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) - break; - - err = zfs_set_version(zfsvfs, intval); - zfsvfs_rele(zfsvfs, FTAG); - - if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { - zfs_cmd_t *zc; - - zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); - (void) strcpy(zc->zc_name, dsname); - (void) zfs_ioc_userspace_upgrade(zc); - kmem_free(zc, sizeof (zfs_cmd_t)); - } - break; - } - default: - err = -1; - } - - return (err); -} - -/* - * This function is best effort. If it fails to set any of the given properties, - * it continues to set as many as it can and returns the last error - * encountered. If the caller provides a non-NULL errlist, it will be filled in - * with the list of names of all the properties that failed along with the - * corresponding error numbers. - * - * If every property is set successfully, zero is returned and errlist is not - * modified. 
- */ -int -zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, - nvlist_t *errlist) -{ - nvpair_t *pair; - nvpair_t *propval; - int rv = 0; - uint64_t intval; - char *strval; - nvlist_t *genericnvl = fnvlist_alloc(); - nvlist_t *retrynvl = fnvlist_alloc(); - -retry: - pair = NULL; - while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { - const char *propname = nvpair_name(pair); - zfs_prop_t prop = zfs_name_to_prop(propname); - int err = 0; - - /* decode the property value */ - propval = pair; - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - attrs = fnvpair_value_nvlist(pair); - if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &propval) != 0) - err = SET_ERROR(EINVAL); - } - - /* Validate value type */ - if (err == 0 && prop == ZPROP_INVAL) { - if (zfs_prop_user(propname)) { - if (nvpair_type(propval) != DATA_TYPE_STRING) - err = SET_ERROR(EINVAL); - } else if (zfs_prop_userquota(propname)) { - if (nvpair_type(propval) != - DATA_TYPE_UINT64_ARRAY) - err = SET_ERROR(EINVAL); - } else { - err = SET_ERROR(EINVAL); - } - } else if (err == 0) { - if (nvpair_type(propval) == DATA_TYPE_STRING) { - if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) - err = SET_ERROR(EINVAL); - } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { - const char *unused; - - intval = fnvpair_value_uint64(propval); - - switch (zfs_prop_get_type(prop)) { - case PROP_TYPE_NUMBER: - break; - case PROP_TYPE_STRING: - err = SET_ERROR(EINVAL); - break; - case PROP_TYPE_INDEX: - if (zfs_prop_index_to_string(prop, - intval, &unused) != 0) - err = SET_ERROR(EINVAL); - break; - default: - cmn_err(CE_PANIC, - "unknown property type"); - } - } else { - err = SET_ERROR(EINVAL); - } - } - - /* Validate permissions */ - if (err == 0) - err = zfs_check_settable(dsname, pair, CRED()); - - if (err == 0) { - err = zfs_prop_set_special(dsname, source, pair); - if (err == -1) { - /* - * For better performance we build up a list of - * properties to set in a single transaction. - */ - err = nvlist_add_nvpair(genericnvl, pair); - } else if (err != 0 && nvl != retrynvl) { - /* - * This may be a spurious error caused by - * receiving quota and reservation out of order. - * Try again in a second pass. - */ - err = nvlist_add_nvpair(retrynvl, pair); - } - } - - if (err != 0) { - if (errlist != NULL) - fnvlist_add_int32(errlist, propname, err); - rv = err; - } - } - - if (nvl != retrynvl && !nvlist_empty(retrynvl)) { - nvl = retrynvl; - goto retry; - } - - if (!nvlist_empty(genericnvl) && - dsl_props_set(dsname, source, genericnvl) != 0) { - /* - * If this fails, we still want to set as many properties as we - * can, so try setting them individually. - */ - pair = NULL; - while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { - const char *propname = nvpair_name(pair); - int err = 0; - - propval = pair; - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - attrs = fnvpair_value_nvlist(pair); - propval = fnvlist_lookup_nvpair(attrs, - ZPROP_VALUE); - } - - if (nvpair_type(propval) == DATA_TYPE_STRING) { - strval = fnvpair_value_string(propval); - err = dsl_prop_set_string(dsname, propname, - source, strval); - } else { - intval = fnvpair_value_uint64(propval); - err = dsl_prop_set_int(dsname, propname, source, - intval); - } - - if (err != 0) { - if (errlist != NULL) { - fnvlist_add_int32(errlist, propname, - err); - } - rv = err; - } - } - } - nvlist_free(genericnvl); - nvlist_free(retrynvl); - - return (rv); -} - -/* - * Check that all the properties are valid user properties. 
- */ -static int -zfs_check_userprops(nvlist_t *nvl) -{ - nvpair_t *pair = NULL; - - while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { - const char *propname = nvpair_name(pair); - - if (!zfs_prop_user(propname) || - nvpair_type(pair) != DATA_TYPE_STRING) - return (SET_ERROR(EINVAL)); - - if (strlen(propname) >= ZAP_MAXNAMELEN) - return (SET_ERROR(ENAMETOOLONG)); - - if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) - return (E2BIG); - } - return (0); -} - -static void -props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) -{ - nvpair_t *pair; - - VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - pair = NULL; - while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { - if (nvlist_exists(skipped, nvpair_name(pair))) - continue; - - VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); - } -} - -static int -clear_received_props(const char *dsname, nvlist_t *props, - nvlist_t *skipped) -{ - int err = 0; - nvlist_t *cleared_props = NULL; - props_skip(props, skipped, &cleared_props); - if (!nvlist_empty(cleared_props)) { - /* - * Acts on local properties until the dataset has received - * properties at least once on or after SPA_VERSION_RECVD_PROPS. - */ - zprop_source_t flags = (ZPROP_SRC_NONE | - (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); - err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); - } - nvlist_free(cleared_props); - return (err); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value name of property to set - * zc_nvlist_src{_size} nvlist of properties to apply - * zc_cookie received properties flag - * - * outputs: - * zc_nvlist_dst{_size} error for each unapplied received property - */ -static int -zfs_ioc_set_prop(zfs_cmd_t *zc) -{ - nvlist_t *nvl; - boolean_t received = zc->zc_cookie; - zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : - ZPROP_SRC_LOCAL); - nvlist_t *errors; - int error; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &nvl)) != 0) - return (error); - - if (received) { - nvlist_t *origprops; - - if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { - (void) clear_received_props(zc->zc_name, - origprops, nvl); - nvlist_free(origprops); - } - - error = dsl_prop_set_hasrecvd(zc->zc_name); - } - - errors = fnvlist_alloc(); - if (error == 0) - error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); - - if (zc->zc_nvlist_dst != 0 && errors != NULL) { - (void) put_nvlist(zc, errors); - } - - nvlist_free(errors); - nvlist_free(nvl); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value name of property to inherit - * zc_cookie revert to received value if TRUE - * - * outputs: none - */ -static int -zfs_ioc_inherit_prop(zfs_cmd_t *zc) -{ - const char *propname = zc->zc_value; - zfs_prop_t prop = zfs_name_to_prop(propname); - boolean_t received = zc->zc_cookie; - zprop_source_t source = (received - ? ZPROP_SRC_NONE /* revert to received value, if any */ - : ZPROP_SRC_INHERITED); /* explicitly inherit */ - - if (received) { - nvlist_t *dummy; - nvpair_t *pair; - zprop_type_t type; - int err; - - /* - * zfs_prop_set_special() expects properties in the form of an - * nvpair with type info. 
- */ - if (prop == ZPROP_INVAL) { - if (!zfs_prop_user(propname)) - return (SET_ERROR(EINVAL)); - - type = PROP_TYPE_STRING; - } else if (prop == ZFS_PROP_VOLSIZE || - prop == ZFS_PROP_VERSION) { - return (SET_ERROR(EINVAL)); - } else { - type = zfs_prop_get_type(prop); - } - - VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - switch (type) { - case PROP_TYPE_STRING: - VERIFY(0 == nvlist_add_string(dummy, propname, "")); - break; - case PROP_TYPE_NUMBER: - case PROP_TYPE_INDEX: - VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); - break; - default: - nvlist_free(dummy); - return (SET_ERROR(EINVAL)); - } - - pair = nvlist_next_nvpair(dummy, NULL); - err = zfs_prop_set_special(zc->zc_name, source, pair); - nvlist_free(dummy); - if (err != -1) - return (err); /* special property already handled */ - } else { - /* - * Only check this in the non-received case. We want to allow - * 'inherit -S' to revert non-inheritable properties like quota - * and reservation to the received or default values even though - * they are not considered inheritable. - */ - if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) - return (SET_ERROR(EINVAL)); - } - - /* property name has been validated by zfs_secpolicy_inherit_prop() */ - return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); -} - -static int -zfs_ioc_pool_set_props(zfs_cmd_t *zc) -{ - nvlist_t *props; - spa_t *spa; - int error; - nvpair_t *pair; - - if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props)) - return (error); - - /* - * If the only property is the configfile, then just do a spa_lookup() - * to handle the faulted case. - */ - pair = nvlist_next_nvpair(props, NULL); - if (pair != NULL && strcmp(nvpair_name(pair), - zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && - nvlist_next_nvpair(props, pair) == NULL) { - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(zc->zc_name)) != NULL) { - spa_configfile_set(spa, props, B_FALSE); - spa_write_cachefile(spa, B_FALSE, B_TRUE); - } - mutex_exit(&spa_namespace_lock); - if (spa != NULL) { - nvlist_free(props); - return (0); - } - } - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { - nvlist_free(props); - return (error); - } - - error = spa_prop_set(spa, props); - - nvlist_free(props); - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_pool_get_props(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - nvlist_t *nvp = NULL; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { - /* - * If the pool is faulted, there may be properties we can still - * get (such as altroot and cachefile), so attempt to get them - * anyway. 
- */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(zc->zc_name)) != NULL) - error = spa_prop_get(spa, &nvp); - mutex_exit(&spa_namespace_lock); - } else { - error = spa_prop_get(spa, &nvp); - spa_close(spa, FTAG); - } - - if (error == 0 && zc->zc_nvlist_dst != 0) - error = put_nvlist(zc, nvp); - else - error = SET_ERROR(EFAULT); - - nvlist_free(nvp); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_src{_size} nvlist of delegated permissions - * zc_perm_action allow/unallow flag - * - * outputs: none - */ -static int -zfs_ioc_set_fsacl(zfs_cmd_t *zc) -{ - int error; - nvlist_t *fsaclnv = NULL; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &fsaclnv)) != 0) - return (error); - - /* - * Verify nvlist is constructed correctly - */ - if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { - nvlist_free(fsaclnv); - return (SET_ERROR(EINVAL)); - } - - /* - * If we don't have PRIV_SYS_MOUNT, then validate - * that user is allowed to hand out each permission in - * the nvlist(s) - */ - - error = secpolicy_zfs(CRED()); - if (error != 0) { - if (zc->zc_perm_action == B_FALSE) { - error = dsl_deleg_can_allow(zc->zc_name, - fsaclnv, CRED()); - } else { - error = dsl_deleg_can_unallow(zc->zc_name, - fsaclnv, CRED()); - } - } - - if (error == 0) - error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); - - nvlist_free(fsaclnv); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * - * outputs: - * zc_nvlist_src{_size} nvlist of delegated permissions - */ -static int -zfs_ioc_get_fsacl(zfs_cmd_t *zc) -{ - nvlist_t *nvp; - int error; - - if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { - error = put_nvlist(zc, nvp); - nvlist_free(nvp); - } - - return (error); -} - -/* ARGSUSED */ -static void -zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) -{ - zfs_creat_t *zct = arg; - - zfs_create_fs(os, cr, zct->zct_zplprops, tx); -} - -#define ZFS_PROP_UNDEFINED ((uint64_t)-1) - -/* - * inputs: - * os parent objset pointer (NULL if root fs) - * fuids_ok fuids allowed in this version of the spa? - * sa_ok SAs allowed in this version of the spa? - * createprops list of properties requested by creator - * - * outputs: - * zplprops values for the zplprops we attach to the master node object - * is_ci true if requested file system will be purely case-insensitive - * - * Determine the settings for utf8only, normalization and - * casesensitivity. Specific values may have been requested by the - * creator and/or we can inherit values from the parent dataset. If - * the file system is of too early a vintage, a creator can not - * request settings for these properties, even if the requested - * setting is the default value. We don't actually want to create dsl - * properties for these, so remove them from the source nvlist after - * processing. - */ -static int -zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, - boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, - nvlist_t *zplprops, boolean_t *is_ci) -{ - uint64_t sense = ZFS_PROP_UNDEFINED; - uint64_t norm = ZFS_PROP_UNDEFINED; - uint64_t u8 = ZFS_PROP_UNDEFINED; - - ASSERT(zplprops != NULL); - - /* parent dataset must be a filesystem */ - if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) - return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); - - /* - * Pull out creator prop choices, if any. 
- */ - if (createprops) { - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); - (void) nvlist_remove_all(createprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE)); - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); - (void) nvlist_remove_all(createprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); - (void) nvlist_lookup_uint64(createprops, - zfs_prop_to_name(ZFS_PROP_CASE), &sense); - (void) nvlist_remove_all(createprops, - zfs_prop_to_name(ZFS_PROP_CASE)); - } - - /* - * If the zpl version requested is whacky or the file system - * or pool is version is too "young" to support normalization - * and the creator tried to set a value for one of the props, - * error out. - */ - if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || - (zplver >= ZPL_VERSION_FUID && !fuids_ok) || - (zplver >= ZPL_VERSION_SA && !sa_ok) || - (zplver < ZPL_VERSION_NORMALIZATION && - (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || - sense != ZFS_PROP_UNDEFINED))) - return (SET_ERROR(ENOTSUP)); - - /* - * Put the version in the zplprops - */ - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); - - /* - * If we're normalizing, names must always be valid UTF-8 strings. - */ - if (norm) - u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); - - if (is_ci) - *is_ci = (sense == ZFS_CASE_INSENSITIVE); - - return (0); -} - -static int -zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, - nvlist_t *zplprops, boolean_t *is_ci) -{ - boolean_t fuids_ok, sa_ok; - uint64_t zplver = ZPL_VERSION; - objset_t *os = NULL; - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - spa_t *spa; - uint64_t spa_vers; - int error; - - zfs_get_parent(dataset, parentname, sizeof (parentname)); - - if ((error = spa_open(dataset, &spa, FTAG)) != 0) - return (error); - - spa_vers = spa_version(spa); - spa_close(spa, FTAG); - - zplver = zfs_zpl_version_map(spa_vers); - fuids_ok = (zplver >= ZPL_VERSION_FUID); - sa_ok = (zplver >= ZPL_VERSION_SA); - - /* - * Open parent object set so we can inherit zplprop values. 
- */ - if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) - return (error); - - error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, - zplprops, is_ci); - dmu_objset_rele(os, FTAG); - return (error); -} - -static int -zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, - nvlist_t *zplprops, boolean_t *is_ci) -{ - boolean_t fuids_ok; - boolean_t sa_ok; - uint64_t zplver = ZPL_VERSION; - int error; - - zplver = zfs_zpl_version_map(spa_vers); - fuids_ok = (zplver >= ZPL_VERSION_FUID); - sa_ok = (zplver >= ZPL_VERSION_SA); - - error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, - createprops, zplprops, is_ci); - return (error); -} - -/* - * innvl: { - * "type" -> dmu_objset_type_t (int32) - * (optional) "props" -> { prop -> value } - * } - * - * outnvl: propname -> error code (int32) - */ - -static const zfs_ioc_key_t zfs_keys_create[] = { - {"type", DATA_TYPE_INT32, 0}, - {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, - {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - int error = 0; - zfs_creat_t zct = { 0 }; - nvlist_t *nvprops = NULL; - void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); - dmu_objset_type_t type; - boolean_t is_insensitive = B_FALSE; - - type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); - (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); - - switch (type) { - case DMU_OST_ZFS: - cbfunc = zfs_create_cb; - break; - - case DMU_OST_ZVOL: - cbfunc = zvol_create_cb; - break; - - default: - cbfunc = NULL; - break; - } - if (strchr(fsname, '@') || - strchr(fsname, '%')) - return (SET_ERROR(EINVAL)); - - zct.zct_props = nvprops; - - if (cbfunc == NULL) - return (SET_ERROR(EINVAL)); - - if (type == DMU_OST_ZVOL) { - uint64_t volsize, volblocksize; - - if (nvprops == NULL) - return (SET_ERROR(EINVAL)); - if (nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) - return (SET_ERROR(EINVAL)); - - if ((error = nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &volblocksize)) != 0 && error != ENOENT) - return (SET_ERROR(EINVAL)); - - if (error != 0) - volblocksize = zfs_prop_default_numeric( - ZFS_PROP_VOLBLOCKSIZE); - - if ((error = zvol_check_volblocksize( - volblocksize)) != 0 || - (error = zvol_check_volsize(volsize, - volblocksize)) != 0) - return (error); - } else if (type == DMU_OST_ZFS) { - int error; - - /* - * We have to have normalization and - * case-folding flags correct when we do the - * file system creation, so go figure them out - * now. - */ - VERIFY(nvlist_alloc(&zct.zct_zplprops, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - error = zfs_fill_zplprops(fsname, nvprops, - zct.zct_zplprops, &is_insensitive); - if (error != 0) { - nvlist_free(zct.zct_zplprops); - return (error); - } - } - - error = dmu_objset_create(fsname, type, - is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); - nvlist_free(zct.zct_zplprops); - - /* - * It would be nice to do this atomically. - */ - if (error == 0) { - error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, - nvprops, outnvl); -#if defined(__FreeBSD__) && defined(_KERNEL) - /* - * Wait for ZVOL operations to settle down before destroying. 
- */ - if (error != 0) { - spa_t *spa; - - if (spa_open(fsname, &spa, FTAG) == 0) { - taskqueue_drain_all( - spa->spa_zvol_taskq->tq_queue); - spa_close(spa, FTAG); - } - } -#endif - if (error != 0) - (void) dsl_destroy_head(fsname); - } - return (error); -} - -/* - * innvl: { - * "origin" -> name of origin snapshot - * (optional) "props" -> { prop -> value } - * } - * - * outnvl: propname -> error code (int32) - */ -static const zfs_ioc_key_t zfs_keys_clone[] = { - {"origin", DATA_TYPE_STRING, 0}, - {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, - {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - int error = 0; - nvlist_t *nvprops = NULL; - char *origin_name; - - origin_name = fnvlist_lookup_string(innvl, "origin"); - (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); - - if (strchr(fsname, '@') || - strchr(fsname, '%')) - return (SET_ERROR(EINVAL)); - - if (dataset_namecheck(origin_name, NULL, NULL) != 0) - return (SET_ERROR(EINVAL)); - error = dmu_objset_clone(fsname, origin_name); - if (error != 0) - return (error); - - /* - * It would be nice to do this atomically. - */ - if (error == 0) { - error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, - nvprops, outnvl); - if (error != 0) - (void) dsl_destroy_head(fsname); - } - return (error); -} - -static const zfs_ioc_key_t zfs_keys_remap[] = { - /* no nvl keys */ -}; - -/* ARGSUSED */ -static int -zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - if (strchr(fsname, '@') || - strchr(fsname, '%')) - return (SET_ERROR(EINVAL)); - - return (dmu_objset_remap_indirects(fsname)); -} - -/* - * innvl: { - * "snaps" -> { snapshot1, snapshot2 } - * (optional) "props" -> { prop -> value (string) } - * } - * - * outnvl: snapshot -> error code (int32) - */ -static const zfs_ioc_key_t zfs_keys_snapshot[] = { - {"snaps", DATA_TYPE_NVLIST, 0}, - {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - nvlist_t *snaps; - nvlist_t *props = NULL; - int error, poollen; - nvpair_t *pair; - - (void) nvlist_lookup_nvlist(innvl, "props", &props); - if (!nvlist_empty(props) && - zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) - return (SET_ERROR(ENOTSUP)); - if ((error = zfs_check_userprops(props)) != 0) - return (error); - - snaps = fnvlist_lookup_nvlist(innvl, "snaps"); - poollen = strlen(poolname); - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - const char *name = nvpair_name(pair); - char *cp = strchr(name, '@'); - - /* - * The snap name must contain an @, and the part after it must - * contain only valid characters. - */ - if (cp == NULL || - zfs_component_namecheck(cp + 1, NULL, NULL) != 0) - return (SET_ERROR(EINVAL)); - - /* - * The snap must be in the specified pool. - */ - if (strncmp(name, poolname, poollen) != 0 || - (name[poollen] != '/' && name[poollen] != '@')) - return (SET_ERROR(EXDEV)); - - /* - * Check for permission to set the properties on the fs. - */ - if (!nvlist_empty(props)) { - *cp = '\0'; - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_USERPROP, CRED()); - *cp = '@'; - if (error != 0) - return (error); - } - - /* This must be the only snap of this fs. 
*/ - for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); - pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { - if (strncmp(name, nvpair_name(pair2), cp - name + 1) - == 0) { - return (SET_ERROR(EXDEV)); - } - } - } - - error = dsl_dataset_snapshot(snaps, props, outnvl); - return (error); -} - -/* - * innvl: "message" -> string - */ -static const zfs_ioc_key_t zfs_keys_log_history[] = { - {"message", DATA_TYPE_STRING, 0}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) -{ - char *message; - spa_t *spa; - int error; - char *poolname; - - /* - * The poolname in the ioctl is not set, we get it from the TSD, - * which was set at the end of the last successful ioctl that allows - * logging. The secpolicy func already checked that it is set. - * Only one log ioctl is allowed after each successful ioctl, so - * we clear the TSD here. - */ - poolname = tsd_get(zfs_allow_log_key); - (void) tsd_set(zfs_allow_log_key, NULL); - error = spa_open(poolname, &spa, FTAG); - strfree(poolname); - if (error != 0) - return (error); - - message = fnvlist_lookup_string(innvl, "message"); - - if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - - error = spa_history_log(spa, message); - spa_close(spa, FTAG); - return (error); -} - -#ifdef __FreeBSD__ -static const zfs_ioc_key_t zfs_keys_nextboot[] = { - {"command", DATA_TYPE_STRING, 0}, -}; - -static int -zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) -{ - char name[MAXNAMELEN]; - spa_t *spa; - vdev_t *vd; - char *command; - uint64_t pool_guid; - uint64_t vdev_guid; - int error; - - if (nvlist_lookup_uint64(innvl, - ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) - return (EINVAL); - if (nvlist_lookup_uint64(innvl, - ZPOOL_CONFIG_GUID, &vdev_guid) != 0) - return (EINVAL); - command = fnvlist_lookup_string(innvl, "command"); - - mutex_enter(&spa_namespace_lock); - spa = spa_by_guid(pool_guid, vdev_guid); - if (spa != NULL) - strcpy(name, spa_name(spa)); - mutex_exit(&spa_namespace_lock); - if (spa == NULL) - return (ENOENT); - - if ((error = spa_open(name, &spa, FTAG)) != 0) - return (error); - spa_vdev_state_enter(spa, SCL_ALL); - vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); - if (vd == NULL) { - (void) spa_vdev_state_exit(spa, NULL, ENXIO); - spa_close(spa, FTAG); - return (ENODEV); - } - error = vdev_label_write_pad2(vd, command, strlen(command)); - (void) spa_vdev_state_exit(spa, NULL, 0); - txg_wait_synced(spa->spa_dsl_pool, 0); - spa_close(spa, FTAG); - return (error); -} -#endif - -/* - * The dp_config_rwlock must not be held when calling this, because the - * unmount may need to write out data. - * - * This function is best-effort. Callers must deal gracefully if it - * remains mounted (or is remounted after this call). - * - * Returns 0 if the argument is not a snapshot, or it is not currently a - * filesystem, or we were able to unmount it. Returns error code otherwise. - */ -void -zfs_unmount_snap(const char *snapname) -{ - vfs_t *vfsp = NULL; - zfsvfs_t *zfsvfs = NULL; - - if (strchr(snapname, '@') == NULL) - return; - - int err = getzfsvfs(snapname, &zfsvfs); - if (err != 0) { - ASSERT3P(zfsvfs, ==, NULL); - return; - } - vfsp = zfsvfs->z_vfs; - - ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); - -#ifdef illumos - err = vn_vfswlock(vfsp->vfs_vnodecovered); - VFS_RELE(vfsp); - if (err != 0) - return; -#endif - - /* - * Always force the unmount for snapshots. 
- */ -#ifdef illumos - (void) dounmount(vfsp, MS_FORCE, kcred); -#else - vfs_ref(vfsp); - vfs_unbusy(vfsp); - (void) dounmount(vfsp, MS_FORCE, curthread); -#endif -} - -/* ARGSUSED */ -static int -zfs_unmount_snap_cb(const char *snapname, void *arg) -{ - zfs_unmount_snap(snapname); - return (0); -} - -/* - * When a clone is destroyed, its origin may also need to be destroyed, - * in which case it must be unmounted. This routine will do that unmount - * if necessary. - */ -void -zfs_destroy_unmount_origin(const char *fsname) -{ - int error; - objset_t *os; - dsl_dataset_t *ds; - - error = dmu_objset_hold(fsname, FTAG, &os); - if (error != 0) - return; - ds = dmu_objset_ds(os); - if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { - char originname[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(ds->ds_prev, originname); - dmu_objset_rele(os, FTAG); - zfs_unmount_snap(originname); - } else { - dmu_objset_rele(os, FTAG); - } -} - -/* - * innvl: { - * "snaps" -> { snapshot1, snapshot2 } - * (optional boolean) "defer" - * } - * - * outnvl: snapshot -> error code (int32) - * - */ -static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { - {"snaps", DATA_TYPE_NVLIST, 0}, - {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - int error, poollen; - nvlist_t *snaps; - nvpair_t *pair; - boolean_t defer; - - snaps = fnvlist_lookup_nvlist(innvl, "snaps"); - defer = nvlist_exists(innvl, "defer"); - - poollen = strlen(poolname); - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - const char *name = nvpair_name(pair); - - /* - * The snap must be in the specified pool to prevent the - * invalid removal of zvol minors below. - */ - if (strncmp(name, poolname, poollen) != 0 || - (name[poollen] != '/' && name[poollen] != '@')) - return (SET_ERROR(EXDEV)); - - zfs_unmount_snap(nvpair_name(pair)); - } - - return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); -} - -/* - * Create bookmarks. Bookmark names are of the form #. - * All bookmarks must be in the same pool. - * - * innvl: { - * bookmark1 -> snapshot1, bookmark2 -> snapshot2 - * } - * - * outnvl: bookmark -> error code (int32) - * - */ -static const zfs_ioc_key_t zfs_keys_bookmark[] = { - {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *snap_name; - - /* - * Verify the snapshot argument. - */ - if (nvpair_value_string(pair, &snap_name) != 0) - return (SET_ERROR(EINVAL)); - - - /* Verify that the keys (bookmarks) are unique */ - for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair); - pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) { - if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) - return (SET_ERROR(EINVAL)); - } - } - - return (dsl_bookmark_create(innvl, outnvl)); -} - -/* - * innvl: { - * property 1, property 2, ... - * } - * - * outnvl: { - * bookmark name 1 -> { property 1, property 2, ... }, - * bookmark name 2 -> { property 1, property 2, ... 
} - * } - * - */ -static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { - {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, -}; - -static int -zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - return (dsl_get_bookmarks(fsname, innvl, outnvl)); -} - -/* - * innvl: { - * bookmark name 1, bookmark name 2 - * } - * - * outnvl: bookmark -> error code (int32) - * - */ -static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { - {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, -}; - -static int -zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, - nvlist_t *outnvl) -{ - int error, poollen; - - poollen = strlen(poolname); - for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - const char *name = nvpair_name(pair); - const char *cp = strchr(name, '#'); - - /* - * The bookmark name must contain an #, and the part after it - * must contain only valid characters. - */ - if (cp == NULL || - zfs_component_namecheck(cp + 1, NULL, NULL) != 0) - return (SET_ERROR(EINVAL)); - - /* - * The bookmark must be in the specified pool. - */ - if (strncmp(name, poolname, poollen) != 0 || - (name[poollen] != '/' && name[poollen] != '#')) - return (SET_ERROR(EXDEV)); - } - - error = dsl_bookmark_destroy(innvl, outnvl); - return (error); -} - -static const zfs_ioc_key_t zfs_keys_channel_program[] = { - {"program", DATA_TYPE_STRING, 0}, - {"arg", DATA_TYPE_ANY, 0}, - {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, - {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, - {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, - nvlist_t *outnvl) -{ - char *program; - uint64_t instrlimit, memlimit; - boolean_t sync_flag; - nvpair_t *nvarg = NULL; - - program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); - if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { - sync_flag = B_TRUE; - } - if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { - instrlimit = ZCP_DEFAULT_INSTRLIMIT; - } - if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { - memlimit = ZCP_DEFAULT_MEMLIMIT; - } - nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); - - if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) - return (EINVAL); - if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) - return (EINVAL); - - return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, - nvarg, outnvl)); -} - -/* - * innvl: unused - * outnvl: empty - */ -static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { - /* no nvl keys */ -}; - -/* ARGSUSED */ -static int -zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - return (spa_checkpoint(poolname)); -} - -/* - * innvl: unused - * outnvl: empty - */ -static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { - /* no nvl keys */ -}; - -/* ARGSUSED */ -static int -zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, - nvlist_t *outnvl) -{ - return (spa_checkpoint_discard(poolname)); -} - -/* - * inputs: - * zc_name name of dataset to destroy - * zc_defer_destroy mark for deferred destroy - * - * outputs: none - */ -static int -zfs_ioc_destroy(zfs_cmd_t *zc) -{ - objset_t *os; - dmu_objset_type_t ost; - int err; - - err = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (err != 0) - return (err); - ost = dmu_objset_type(os); - dmu_objset_rele(os, FTAG); - - if (ost == DMU_OST_ZFS) - zfs_unmount_snap(zc->zc_name); - - 
if (strchr(zc->zc_name, '@')) - err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); - else - err = dsl_destroy_head(zc->zc_name); -#ifndef __FreeBSD__ - if (ost == DMU_OST_ZVOL && err == 0) - (void) zvol_remove_minor(zc->zc_name); -#endif - return (err); -} - -/* - * innvl: { - * vdevs: { - * guid 1, guid 2, ... - * }, - * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND} - * } - * - * outnvl: { - * [func: EINVAL (if provided command type didn't make sense)], - * [vdevs: { - * guid1: errno, (see function body for possible errnos) - * ... - * }] - * } - * - */ -static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { - {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, - {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} -}; - -static int -zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) -{ - spa_t *spa; - int error; - - error = spa_open(poolname, &spa, FTAG); - if (error != 0) - return (error); - - uint64_t cmd_type; - if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, - &cmd_type) != 0) { - spa_close(spa, FTAG); - return (SET_ERROR(EINVAL)); - } - if (!(cmd_type == POOL_INITIALIZE_CANCEL || - cmd_type == POOL_INITIALIZE_DO || - cmd_type == POOL_INITIALIZE_SUSPEND)) { - spa_close(spa, FTAG); - return (SET_ERROR(EINVAL)); - } - - nvlist_t *vdev_guids; - if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, - &vdev_guids) != 0) { - spa_close(spa, FTAG); - return (SET_ERROR(EINVAL)); - } - - nvlist_t *vdev_errlist = fnvlist_alloc(); - int total_errors = 0; - - for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); - pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { - uint64_t vdev_guid = fnvpair_value_uint64(pair); - - error = spa_vdev_initialize(spa, vdev_guid, cmd_type); - if (error != 0) { - char guid_as_str[MAXNAMELEN]; - - (void) snprintf(guid_as_str, sizeof (guid_as_str), - "%llu", (unsigned long long)vdev_guid); - fnvlist_add_int64(vdev_errlist, guid_as_str, error); - total_errors++; - } - } - if (fnvlist_size(vdev_errlist) > 0) { - fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, - vdev_errlist); - } - fnvlist_free(vdev_errlist); - - spa_close(spa, FTAG); - return (total_errors > 0 ? EINVAL : 0); -} - -/* - * fsname is name of dataset to rollback (to most recent snapshot) - * - * innvl may contain name of expected target snapshot - * - * outnvl: "target" -> name of most recent snapshot - * } - */ -static const zfs_ioc_key_t zfs_keys_rollback[] = { - {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -{ - zfsvfs_t *zfsvfs; - char *target = NULL; - int error; - - (void) nvlist_lookup_string(innvl, "target", &target); - if (target != NULL) { - const char *cp = strchr(target, '@'); - - /* - * The snap name must contain an @, and the part after it must - * contain only valid characters. - */ - if (cp == NULL || - zfs_component_namecheck(cp + 1, NULL, NULL) != 0) - return (SET_ERROR(EINVAL)); - } - - if (getzfsvfs(fsname, &zfsvfs) == 0) { - dsl_dataset_t *ds; - - ds = dmu_objset_ds(zfsvfs->z_os); - error = zfs_suspend_fs(zfsvfs); - if (error == 0) { - int resume_err; - - error = dsl_dataset_rollback(fsname, target, zfsvfs, - outnvl); - resume_err = zfs_resume_fs(zfsvfs, ds); - error = error ? 
error : resume_err; - } -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#else - vfs_unbusy(zfsvfs->z_vfs); -#endif - } else { - error = dsl_dataset_rollback(fsname, target, NULL, outnvl); - } - return (error); -} - -static int -recursive_unmount(const char *fsname, void *arg) -{ - const char *snapname = arg; - char fullname[ZFS_MAX_DATASET_NAME_LEN]; - - (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); - zfs_unmount_snap(fullname); - - return (0); -} - -/* - * inputs: - * zc_name old name of dataset or bookmark - * zc_value new name of dataset or bookmark - * zc_cookie recursive flag (only valid for snapshots) - * - * outputs: none - */ -static int -zfs_ioc_rename(zfs_cmd_t *zc) -{ - objset_t *os; - dmu_objset_type_t ost; - boolean_t recursive = zc->zc_cookie & 1; - char *pos, *pos2; - boolean_t allow_mounted = B_TRUE; - int err; - -#ifdef __FreeBSD__ - allow_mounted = (zc->zc_cookie & 2) != 0; -#endif - - zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; - zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - - pos = strchr(zc->zc_name, '#'); - if (pos != NULL) { - /* Bookmarks must be in same fs. */ - pos2 = strchr(zc->zc_value, '#'); - if (pos2 == NULL) - return (SET_ERROR(EINVAL)); - - /* Recursive flag is not supported yet. */ - if (recursive) - return (SET_ERROR(ENOTSUP)); - - *pos = '\0'; - *pos2 = '\0'; - if (strcmp(zc->zc_name, zc->zc_value) == 0) { - err = dsl_bookmark_rename(zc->zc_name, - pos + 1, pos2 + 1); - } else { - err = SET_ERROR(EXDEV); - } - *pos = '#'; - *pos2 = '#'; - return (err); - } - - /* "zfs rename" from and to ...%recv datasets should both fail */ - if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || - dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) - return (SET_ERROR(EINVAL)); - - err = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (err != 0) - return (err); - ost = dmu_objset_type(os); - dmu_objset_rele(os, FTAG); - - pos = strchr(zc->zc_name, '@'); - if (pos != NULL) { - /* Snapshots must be in same fs. */ - pos2 = strchr(zc->zc_value, '@'); - if (pos2 == NULL) - return (SET_ERROR(EINVAL)); - *pos = '\0'; - *pos2 = '\0'; - if (strcmp(zc->zc_name, zc->zc_value) != 0) { - err = SET_ERROR(EXDEV); - } else { - if (ost == DMU_OST_ZFS && !allow_mounted) { - err = dmu_objset_find(zc->zc_name, - recursive_unmount, pos + 1, - recursive ? 
DS_FIND_CHILDREN : 0); - } - if (err == 0) { - err = dsl_dataset_rename_snapshot(zc->zc_name, - pos + 1, pos2 + 1, recursive); - } - } - *pos = '@'; - *pos2 = '@'; - return (err); - } else { -#ifdef illumos - if (ost == DMU_OST_ZVOL) - (void) zvol_remove_minor(zc->zc_name); -#endif - return (dsl_dir_rename(zc->zc_name, zc->zc_value)); - } -} - -static int -zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) -{ - const char *propname = nvpair_name(pair); - boolean_t issnap = (strchr(dsname, '@') != NULL); - zfs_prop_t prop = zfs_name_to_prop(propname); - uint64_t intval; - int err; - - if (prop == ZPROP_INVAL) { - if (zfs_prop_user(propname)) { - if (err = zfs_secpolicy_write_perms(dsname, - ZFS_DELEG_PERM_USERPROP, cr)) - return (err); - return (0); - } - - if (!issnap && zfs_prop_userquota(propname)) { - const char *perm = NULL; - const char *uq_prefix = - zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; - const char *gq_prefix = - zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; - - if (strncmp(propname, uq_prefix, - strlen(uq_prefix)) == 0) { - perm = ZFS_DELEG_PERM_USERQUOTA; - } else if (strncmp(propname, gq_prefix, - strlen(gq_prefix)) == 0) { - perm = ZFS_DELEG_PERM_GROUPQUOTA; - } else { - /* USERUSED and GROUPUSED are read-only */ - return (SET_ERROR(EINVAL)); - } - - if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) - return (err); - return (0); - } - - return (SET_ERROR(EINVAL)); - } - - if (issnap) - return (SET_ERROR(EINVAL)); - - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - /* - * dsl_prop_get_all_impl() returns properties in this - * format. - */ - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); - } - - /* - * Check that this value is valid for this pool version - */ - switch (prop) { - case ZFS_PROP_COMPRESSION: - /* - * If the user specified gzip compression, make sure - * the SPA supports it. We ignore any errors here since - * we'll catch them later. - */ - if (nvpair_value_uint64(pair, &intval) == 0) { - if (intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9 && - zfs_earlier_version(dsname, - SPA_VERSION_GZIP_COMPRESSION)) { - return (SET_ERROR(ENOTSUP)); - } - - if (intval == ZIO_COMPRESS_ZLE && - zfs_earlier_version(dsname, - SPA_VERSION_ZLE_COMPRESSION)) - return (SET_ERROR(ENOTSUP)); - - if (intval == ZIO_COMPRESS_LZ4) { - spa_t *spa; - - if ((err = spa_open(dsname, &spa, FTAG)) != 0) - return (err); - - if (!spa_feature_is_enabled(spa, - SPA_FEATURE_LZ4_COMPRESS)) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - spa_close(spa, FTAG); - } - - /* - * If this is a bootable dataset then - * verify that the compression algorithm - * is supported for booting. We must return - * something other than ENOTSUP since it - * implies a downrev pool version. - */ - if (zfs_is_bootfs(dsname) && - !BOOTFS_COMPRESS_VALID(intval)) { - return (SET_ERROR(ERANGE)); - } - } - break; - - case ZFS_PROP_COPIES: - if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - break; - - case ZFS_PROP_RECORDSIZE: - /* Record sizes above 128k need the feature to be enabled */ - if (nvpair_value_uint64(pair, &intval) == 0 && - intval > SPA_OLD_MAXBLOCKSIZE) { - spa_t *spa; - - /* - * We don't allow setting the property above 1MB, - * unless the tunable has been changed. 
- */ - if (intval > zfs_max_recordsize || - intval > SPA_MAXBLOCKSIZE) - return (SET_ERROR(ERANGE)); - - if ((err = spa_open(dsname, &spa, FTAG)) != 0) - return (err); - - if (!spa_feature_is_enabled(spa, - SPA_FEATURE_LARGE_BLOCKS)) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - spa_close(spa, FTAG); - } - break; - - case ZFS_PROP_DNODESIZE: - /* Dnode sizes above 512 need the feature to be enabled */ - if (nvpair_value_uint64(pair, &intval) == 0 && - intval != ZFS_DNSIZE_LEGACY) { - spa_t *spa; - - if ((err = spa_open(dsname, &spa, FTAG)) != 0) - return (err); - - if (!spa_feature_is_enabled(spa, - SPA_FEATURE_LARGE_DNODE)) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - spa_close(spa, FTAG); - } - break; - - case ZFS_PROP_SPECIAL_SMALL_BLOCKS: - /* - * This property could require the allocation classes - * feature to be active for setting, however we allow - * it so that tests of settable properties succeed. - * The CLI will issue a warning in this case. - */ - break; - - case ZFS_PROP_SHARESMB: - if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) - return (SET_ERROR(ENOTSUP)); - break; - - case ZFS_PROP_ACLINHERIT: - if (nvpair_type(pair) == DATA_TYPE_UINT64 && - nvpair_value_uint64(pair, &intval) == 0) { - if (intval == ZFS_ACL_PASSTHROUGH_X && - zfs_earlier_version(dsname, - SPA_VERSION_PASSTHROUGH_X)) - return (SET_ERROR(ENOTSUP)); - } - break; - - case ZFS_PROP_CHECKSUM: - case ZFS_PROP_DEDUP: - { - spa_feature_t feature; - spa_t *spa; - - /* dedup feature version checks */ - if (prop == ZFS_PROP_DEDUP && - zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) - return (SET_ERROR(ENOTSUP)); - - if (nvpair_value_uint64(pair, &intval) != 0) - return (SET_ERROR(EINVAL)); - - /* check prop value is enabled in features */ - feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK); - if (feature == SPA_FEATURE_NONE) - break; - - if ((err = spa_open(dsname, &spa, FTAG)) != 0) - return (err); - - if (!spa_feature_is_enabled(spa, feature)) { - spa_close(spa, FTAG); - return (SET_ERROR(ENOTSUP)); - } - spa_close(spa, FTAG); - break; - } - } - - return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); -} - -/* - * Checks for a race condition to make sure we don't increment a feature flag - * multiple times. - */ -static int -zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - spa_feature_t *featurep = arg; - - if (!spa_feature_is_active(spa, *featurep)) - return (0); - else - return (SET_ERROR(EBUSY)); -} - -/* - * The callback invoked on feature activation in the sync task caused by - * zfs_prop_activate_feature. - */ -static void -zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - spa_feature_t *featurep = arg; - - spa_feature_incr(spa, *featurep, tx); -} - -/* - * Activates a feature on a pool in response to a property setting. This - * creates a new sync task which modifies the pool to reflect the feature - * as being active. - */ -static int -zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature) -{ - int err; - - /* EBUSY here indicates that the feature is already active */ - err = dsl_sync_task(spa_name(spa), - zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, - &feature, 2, ZFS_SPACE_CHECK_RESERVED); - - if (err != 0 && err != EBUSY) - return (err); - else - return (0); -} - -/* - * Removes properties from the given props list that fail permission checks - * needed to clear them and to restore them in case of a receive error. 
For each - * property, make sure we have both set and inherit permissions. - * - * Returns the first error encountered if any permission checks fail. If the - * caller provides a non-NULL errlist, it also gives the complete list of names - * of all the properties that failed a permission check along with the - * corresponding error numbers. The caller is responsible for freeing the - * returned errlist. - * - * If every property checks out successfully, zero is returned and the list - * pointed at by errlist is NULL. - */ -static int -zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) -{ - zfs_cmd_t *zc; - nvpair_t *pair, *next_pair; - nvlist_t *errors; - int err, rv = 0; - - if (props == NULL) - return (0); - - VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); - (void) strcpy(zc->zc_name, dataset); - pair = nvlist_next_nvpair(props, NULL); - while (pair != NULL) { - next_pair = nvlist_next_nvpair(props, pair); - - (void) strcpy(zc->zc_value, nvpair_name(pair)); - if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || - (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { - VERIFY(nvlist_remove_nvpair(props, pair) == 0); - VERIFY(nvlist_add_int32(errors, - zc->zc_value, err) == 0); - } - pair = next_pair; - } - kmem_free(zc, sizeof (zfs_cmd_t)); - - if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { - nvlist_free(errors); - errors = NULL; - } else { - VERIFY(nvpair_value_int32(pair, &rv) == 0); - } - - if (errlist == NULL) - nvlist_free(errors); - else - *errlist = errors; - - return (rv); -} - -static boolean_t -propval_equals(nvpair_t *p1, nvpair_t *p2) -{ - if (nvpair_type(p1) == DATA_TYPE_NVLIST) { - /* dsl_prop_get_all_impl() format */ - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &p1) == 0); - } - - if (nvpair_type(p2) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &p2) == 0); - } - - if (nvpair_type(p1) != nvpair_type(p2)) - return (B_FALSE); - - if (nvpair_type(p1) == DATA_TYPE_STRING) { - char *valstr1, *valstr2; - - VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); - VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); - return (strcmp(valstr1, valstr2) == 0); - } else { - uint64_t intval1, intval2; - - VERIFY(nvpair_value_uint64(p1, &intval1) == 0); - VERIFY(nvpair_value_uint64(p2, &intval2) == 0); - return (intval1 == intval2); - } -} - -/* - * Remove properties from props if they are not going to change (as determined - * by comparison with origprops). Remove them from origprops as well, since we - * do not need to clear or restore properties that won't change. 
- */ -static void -props_reduce(nvlist_t *props, nvlist_t *origprops) -{ - nvpair_t *pair, *next_pair; - - if (origprops == NULL) - return; /* all props need to be received */ - - pair = nvlist_next_nvpair(props, NULL); - while (pair != NULL) { - const char *propname = nvpair_name(pair); - nvpair_t *match; - - next_pair = nvlist_next_nvpair(props, pair); - - if ((nvlist_lookup_nvpair(origprops, propname, - &match) != 0) || !propval_equals(pair, match)) - goto next; /* need to set received value */ - - /* don't clear the existing received value */ - (void) nvlist_remove_nvpair(origprops, match); - /* don't bother receiving the property */ - (void) nvlist_remove_nvpair(props, pair); -next: - pair = next_pair; - } -} - -/* - * Extract properties that cannot be set PRIOR to the receipt of a dataset. - * For example, refquota cannot be set until after the receipt of a dataset, - * because in replication streams, an older/earlier snapshot may exceed the - * refquota. We want to receive the older/earlier snapshot, but setting - * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent - * the older/earlier snapshot from being received (with EDQUOT). - * - * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. - * - * libzfs will need to be judicious handling errors encountered by props - * extracted by this function. - */ -static nvlist_t * -extract_delay_props(nvlist_t *props) -{ - nvlist_t *delayprops; - nvpair_t *nvp, *tmp; - static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 }; - int i; - - VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; - nvp = nvlist_next_nvpair(props, nvp)) { - /* - * strcmp() is safe because zfs_prop_to_name() always returns - * a bounded string. 
- */ - for (i = 0; delayable[i] != 0; i++) { - if (strcmp(zfs_prop_to_name(delayable[i]), - nvpair_name(nvp)) == 0) { - break; - } - } - if (delayable[i] != 0) { - tmp = nvlist_prev_nvpair(props, nvp); - VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); - VERIFY(nvlist_remove_nvpair(props, nvp) == 0); - nvp = tmp; - } - } - - if (nvlist_empty(delayprops)) { - nvlist_free(delayprops); - delayprops = NULL; - } - return (delayprops); -} - -#ifdef DEBUG -static boolean_t zfs_ioc_recv_inject_err; -#endif - -/* - * inputs: - * zc_name name of containing filesystem - * zc_nvlist_src{_size} nvlist of properties to apply - * zc_value name of snapshot to create - * zc_string name of clone origin (if DRR_FLAG_CLONE) - * zc_cookie file descriptor to recv from - * zc_begin_record the BEGIN record of the stream (not byteswapped) - * zc_guid force flag - * zc_cleanup_fd cleanup-on-exit file descriptor - * zc_action_handle handle for this guid/ds mapping (or zero on first call) - * zc_resumable if data is incomplete assume sender will resume - * - * outputs: - * zc_cookie number of bytes read - * zc_nvlist_dst{_size} error for each unapplied received property - * zc_obj zprop_errflags_t - * zc_action_handle handle for this guid/ds mapping - */ -static int -zfs_ioc_recv(zfs_cmd_t *zc) -{ - file_t *fp; - dmu_recv_cookie_t drc; - boolean_t force = (boolean_t)zc->zc_guid; - int fd; - int error = 0; - int props_error = 0; - nvlist_t *errors; - offset_t off; - nvlist_t *props = NULL; /* sent properties */ - nvlist_t *origprops = NULL; /* existing properties */ - nvlist_t *delayprops = NULL; /* sent properties applied post-receive */ - char *origin = NULL; - char *tosnap; - char tofs[ZFS_MAX_DATASET_NAME_LEN]; - boolean_t first_recvd_props = B_FALSE; - - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '@') == NULL || - strchr(zc->zc_value, '%')) - return (SET_ERROR(EINVAL)); - - (void) strcpy(tofs, zc->zc_value); - tosnap = strchr(tofs, '@'); - *tosnap++ = '\0'; - - if (zc->zc_nvlist_src != 0 && - (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &props)) != 0) - return (error); - - fd = zc->zc_cookie; -#ifdef illumos - fp = getf(fd); -#else - fget_read(curthread, fd, &cap_pread_rights, &fp); -#endif - if (fp == NULL) { - nvlist_free(props); - return (SET_ERROR(EBADF)); - } - - errors = fnvlist_alloc(); - - if (zc->zc_string[0]) - origin = zc->zc_string; - - error = dmu_recv_begin(tofs, tosnap, - &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc); - if (error != 0) - goto out; - - /* - * Set properties before we receive the stream so that they are applied - * to the new data. Note that we must call dmu_recv_stream() if - * dmu_recv_begin() succeeds. - */ - if (props != NULL && !drc.drc_newfs) { - if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= - SPA_VERSION_RECVD_PROPS && - !dsl_prop_get_hasrecvd(tofs)) - first_recvd_props = B_TRUE; - - /* - * If new received properties are supplied, they are to - * completely replace the existing received properties, so stash - * away the existing ones. - */ - if (dsl_prop_get_received(tofs, &origprops) == 0) { - nvlist_t *errlist = NULL; - /* - * Don't bother writing a property if its value won't - * change (and avoid the unnecessary security checks). - * - * The first receive after SPA_VERSION_RECVD_PROPS is a - * special case where we blow away all local properties - * regardless. 
- */ - if (!first_recvd_props) - props_reduce(props, origprops); - if (zfs_check_clearable(tofs, origprops, &errlist) != 0) - (void) nvlist_merge(errors, errlist, 0); - nvlist_free(errlist); - - if (clear_received_props(tofs, origprops, - first_recvd_props ? NULL : props) != 0) - zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } else { - zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } - } - - if (props != NULL) { - props_error = dsl_prop_set_hasrecvd(tofs); - - if (props_error == 0) { - delayprops = extract_delay_props(props); - (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, - props, errors); - } - } - - off = fp->f_offset; - error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd, - &zc->zc_action_handle); - - if (error == 0) { - zfsvfs_t *zfsvfs = NULL; - - if (getzfsvfs(tofs, &zfsvfs) == 0) { - /* online recv */ - dsl_dataset_t *ds; - int end_err; - - ds = dmu_objset_ds(zfsvfs->z_os); - error = zfs_suspend_fs(zfsvfs); - /* - * If the suspend fails, then the recv_end will - * likely also fail, and clean up after itself. - */ - end_err = dmu_recv_end(&drc, zfsvfs); - if (error == 0) - error = zfs_resume_fs(zfsvfs, ds); - error = error ? error : end_err; -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#else - vfs_unbusy(zfsvfs->z_vfs); -#endif - } else { - error = dmu_recv_end(&drc, NULL); - } - - /* Set delayed properties now, after we're done receiving. */ - if (delayprops != NULL && error == 0) { - (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, - delayprops, errors); - } - } - - if (delayprops != NULL) { - /* - * Merge delayed props back in with initial props, in case - * we're DEBUG and zfs_ioc_recv_inject_err is set (which means - * we have to make sure clear_received_props() includes - * the delayed properties). - * - * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, - * using ASSERT() will be just like a VERIFY. - */ - ASSERT(nvlist_merge(props, delayprops, 0) == 0); - nvlist_free(delayprops); - } - - /* - * Now that all props, initial and delayed, are set, report the prop - * errors to the caller. - */ - if (zc->zc_nvlist_dst_size != 0 && - (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || - put_nvlist(zc, errors) != 0)) { - /* - * Caller made zc->zc_nvlist_dst less than the minimum expected - * size or supplied an invalid address. - */ - props_error = SET_ERROR(EINVAL); - } - - zc->zc_cookie = off - fp->f_offset; - if (off >= 0 && off <= MAXOFFSET_T) - fp->f_offset = off; - -#ifdef DEBUG - if (zfs_ioc_recv_inject_err) { - zfs_ioc_recv_inject_err = B_FALSE; - error = 1; - } -#endif - - /* - * On error, restore the original props. - */ - if (error != 0 && props != NULL && !drc.drc_newfs) { - if (clear_received_props(tofs, props, NULL) != 0) { - /* - * We failed to clear the received properties. - * Since we may have left a $recvd value on the - * system, we can't clear the $hasrecvd flag. - */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; - } else if (first_recvd_props) { - dsl_prop_unset_hasrecvd(tofs); - } - - if (origprops == NULL && !drc.drc_newfs) { - /* We failed to stash the original properties. */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; - } - - /* - * dsl_props_set() will not convert RECEIVED to LOCAL on or - * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL - * explictly if we're restoring local properties cleared in the - * first new-style receive. - */ - if (origprops != NULL && - zfs_set_prop_nvlist(tofs, (first_recvd_props ? - ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), - origprops, NULL) != 0) { - /* - * We stashed the original properties but failed to - * restore them. 
- */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; - } - } -out: - nvlist_free(props); - nvlist_free(origprops); - nvlist_free(errors); - releasef(fd); - - if (error == 0) - error = props_error; - - return (error); -} - -/* - * inputs: - * zc_name name of snapshot to send - * zc_cookie file descriptor to send stream to - * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) - * zc_sendobj objsetid of snapshot to send - * zc_fromobj objsetid of incremental fromsnap (may be zero) - * zc_guid if set, estimate size of stream only. zc_cookie is ignored. - * output size in zc_objset_type. - * zc_flags lzc_send_flags - * - * outputs: - * zc_objset_type estimated size, if zc_guid is set - * - * NOTE: This is no longer the preferred interface, any new functionality - * should be added to zfs_ioc_send_new() instead. - */ -static int -zfs_ioc_send(zfs_cmd_t *zc) -{ - int error; - offset_t off; - boolean_t estimate = (zc->zc_guid != 0); - boolean_t embedok = (zc->zc_flags & 0x1); - boolean_t large_block_ok = (zc->zc_flags & 0x2); - boolean_t compressok = (zc->zc_flags & 0x4); - - if (zc->zc_obj != 0) { - dsl_pool_t *dp; - dsl_dataset_t *tosnap; - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - if (dsl_dir_is_clone(tosnap->ds_dir)) - zc->zc_fromobj = - dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - } - - if (estimate) { - dsl_pool_t *dp; - dsl_dataset_t *tosnap; - dsl_dataset_t *fromsnap = NULL; - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - if (zc->zc_fromobj != 0) { - error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, - FTAG, &fromsnap); - if (error != 0) { - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - } - - error = dmu_send_estimate(tosnap, fromsnap, compressok, - &zc->zc_objset_type); - - if (fromsnap != NULL) - dsl_dataset_rele(fromsnap, FTAG); - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - } else { - file_t *fp; - -#ifdef illumos - fp = getf(zc->zc_cookie); -#else - fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp); -#endif - if (fp == NULL) - return (SET_ERROR(EBADF)); - - off = fp->f_offset; - error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, - zc->zc_fromobj, embedok, large_block_ok, compressok, -#ifdef illumos - zc->zc_cookie, fp->f_vnode, &off); -#else - zc->zc_cookie, fp, &off); -#endif - - if (off >= 0 && off <= MAXOFFSET_T) - fp->f_offset = off; - releasef(zc->zc_cookie); - } - return (error); -} - -/* - * inputs: - * zc_name name of snapshot on which to report progress - * zc_cookie file descriptor of send stream - * - * outputs: - * zc_cookie number of bytes written in send stream thus far - */ -static int -zfs_ioc_send_progress(zfs_cmd_t *zc) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds; - dmu_sendarg_t *dsp = NULL; - int error; - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - mutex_enter(&ds->ds_sendstream_lock); - - /* - * Iterate over all the send streams currently active on this dataset. 
- * If there's one which matches the specified file descriptor _and_ the - * stream was started by the current process, return the progress of - * that stream. - */ - for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; - dsp = list_next(&ds->ds_sendstreams, dsp)) { - if (dsp->dsa_outfd == zc->zc_cookie && - dsp->dsa_proc == curproc) - break; - } - - if (dsp != NULL) - zc->zc_cookie = *(dsp->dsa_off); - else - error = SET_ERROR(ENOENT); - - mutex_exit(&ds->ds_sendstream_lock); - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); -} - -static int -zfs_ioc_inject_fault(zfs_cmd_t *zc) -{ - int id, error; - - error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, - &zc->zc_inject_record); - - if (error == 0) - zc->zc_guid = (uint64_t)id; - - return (error); -} - -static int -zfs_ioc_clear_fault(zfs_cmd_t *zc) -{ - return (zio_clear_fault((int)zc->zc_guid)); -} - -static int -zfs_ioc_inject_list_next(zfs_cmd_t *zc) -{ - int id = (int)zc->zc_guid; - int error; - - error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), - &zc->zc_inject_record); - - zc->zc_guid = id; - - return (error); -} - -static int -zfs_ioc_error_log(zfs_cmd_t *zc) -{ - spa_t *spa; - int error; - size_t count = (size_t)zc->zc_nvlist_dst_size; - - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, - &count); - if (error == 0) - zc->zc_nvlist_dst_size = count; - else - zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); - - spa_close(spa, FTAG); - - return (error); -} - -static int -zfs_ioc_clear(zfs_cmd_t *zc) -{ - spa_t *spa; - vdev_t *vd; - int error; - - /* - * On zpool clear we also fix up missing slogs - */ - mutex_enter(&spa_namespace_lock); - spa = spa_lookup(zc->zc_name); - if (spa == NULL) { - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EIO)); - } - if (spa_get_log_state(spa) == SPA_LOG_MISSING) { - /* we need to let spa_open/spa_load clear the chains */ - spa_set_log_state(spa, SPA_LOG_CLEAR); - } - spa->spa_last_open_failed = 0; - mutex_exit(&spa_namespace_lock); - - if (zc->zc_cookie & ZPOOL_NO_REWIND) { - error = spa_open(zc->zc_name, &spa, FTAG); - } else { - nvlist_t *policy; - nvlist_t *config = NULL; - - if (zc->zc_nvlist_src == 0) - return (SET_ERROR(EINVAL)); - - if ((error = get_nvlist(zc->zc_nvlist_src, - zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { - error = spa_open_rewind(zc->zc_name, &spa, FTAG, - policy, &config); - if (config != NULL) { - int err; - - if ((err = put_nvlist(zc, config)) != 0) - error = err; - nvlist_free(config); - } - nvlist_free(policy); - } - } - - if (error != 0) - return (error); - - /* - * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. - */ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); - - spa_vdev_state_enter(spa, SCL_NONE); - - if (zc->zc_guid == 0) { - vd = NULL; - } else { - vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); - if (vd == NULL) { - (void) spa_vdev_state_exit(spa, NULL, ENODEV); - spa_close(spa, FTAG); - return (SET_ERROR(ENODEV)); - } - } - - vdev_clear(spa, vd); - - (void) spa_vdev_state_exit(spa, NULL, 0); - - /* - * Resume any suspended I/Os. - */ - if (zio_resume(spa) != 0) - error = SET_ERROR(EIO); - - spa_close(spa, FTAG); - - return (error); -} - -/* - * Reopen all the vdevs associated with the pool. 
- * - * innvl: { - * "scrub_restart" -> when true and scrub is running, allow to restart - * scrub as the side effect of the reopen (boolean). - * } - * - * outnvl is unused - */ -static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { - {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) -{ - spa_t *spa; - int error; - boolean_t scrub_restart = B_TRUE; - - if (innvl) { - scrub_restart = fnvlist_lookup_boolean_value(innvl, - "scrub_restart"); - } - - error = spa_open(pool, &spa, FTAG); - if (error != 0) - return (error); - - spa_vdev_state_enter(spa, SCL_NONE); - - /* - * If a resilver is already in progress then set the - * spa_scrub_reopen flag to B_TRUE so that we don't restart - * the scan as a side effect of the reopen. Otherwise, let - * vdev_open() decided if a resilver is required. - */ - spa->spa_scrub_reopen = (!scrub_restart && - dsl_scan_resilvering(spa->spa_dsl_pool)); - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - - (void) spa_vdev_state_exit(spa, NULL, 0); - spa_close(spa, FTAG); - return (0); -} - -/* - * inputs: - * zc_name name of filesystem - * - * outputs: - * zc_string name of conflicting snapshot, if there is one - */ -static int -zfs_ioc_promote(zfs_cmd_t *zc) -{ - dsl_pool_t *dp; - dsl_dataset_t *ds, *ods; - char origin[ZFS_MAX_DATASET_NAME_LEN]; - char *cp; - int error; - - zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; - if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || - strchr(zc->zc_name, '%')) - return (SET_ERROR(EINVAL)); - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - if (!dsl_dir_is_clone(ds->ds_dir)) { - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (SET_ERROR(EINVAL)); - } - - error = dsl_dataset_hold_obj(dp, - dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - dsl_dataset_name(ods, origin); - dsl_dataset_rele(ods, FTAG); - dsl_dataset_rele(ds, FTAG); - dsl_pool_rele(dp, FTAG); - - /* - * We don't need to unmount *all* the origin fs's snapshots, but - * it's easier. - */ - cp = strchr(origin, '@'); - if (cp) - *cp = '\0'; - (void) dmu_objset_find(origin, - zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); - return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); -} - -/* - * Retrieve a single {user|group}{used|quota}@... property. - * - * inputs: - * zc_name name of filesystem - * zc_objset_type zfs_userquota_prop_t - * zc_value domain name (eg. 
"S-1-234-567-89") - * zc_guid RID/UID/GID - * - * outputs: - * zc_cookie property value - */ -static int -zfs_ioc_userspace_one(zfs_cmd_t *zc) -{ - zfsvfs_t *zfsvfs; - int error; - - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) - return (SET_ERROR(EINVAL)); - - error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); - if (error != 0) - return (error); - - error = zfs_userspace_one(zfsvfs, - zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); - zfsvfs_rele(zfsvfs, FTAG); - - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_cookie zap cursor - * zc_objset_type zfs_userquota_prop_t - * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) - * - * outputs: - * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) - * zc_cookie zap cursor - */ -static int -zfs_ioc_userspace_many(zfs_cmd_t *zc) -{ - zfsvfs_t *zfsvfs; - int bufsize = zc->zc_nvlist_dst_size; - - if (bufsize <= 0) - return (SET_ERROR(ENOMEM)); - - int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); - if (error != 0) - return (error); - - void *buf = kmem_alloc(bufsize, KM_SLEEP); - - error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, - buf, &zc->zc_nvlist_dst_size); - - if (error == 0) { - error = ddi_copyout(buf, - (void *)(uintptr_t)zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size, zc->zc_iflags); - } - kmem_free(buf, bufsize); - zfsvfs_rele(zfsvfs, FTAG); - - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * - * outputs: - * none - */ -static int -zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) -{ - objset_t *os; - int error = 0; - zfsvfs_t *zfsvfs; - - if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { - if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { - /* - * If userused is not enabled, it may be because the - * objset needs to be closed & reopened (to grow the - * objset_phys_t). Suspend/resume the fs will do that. - */ - dsl_dataset_t *ds, *newds; - - ds = dmu_objset_ds(zfsvfs->z_os); - error = zfs_suspend_fs(zfsvfs); - if (error == 0) { - dmu_objset_refresh_ownership(ds, &newds, - zfsvfs); - error = zfs_resume_fs(zfsvfs, newds); - } - } - if (error == 0) - error = dmu_objset_userspace_upgrade(zfsvfs->z_os); -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#else - vfs_unbusy(zfsvfs->z_vfs); -#endif - } else { - /* XXX kind of reading contents without owning */ - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error != 0) - return (error); - - error = dmu_objset_userspace_upgrade(os); - dmu_objset_rele(os, FTAG); - } - - return (error); -} - -#ifdef illumos -/* - * We don't want to have a hard dependency - * against some special symbols in sharefs - * nfs, and smbsrv. Determine them if needed when - * the first file system is shared. - * Neither sharefs, nfs or smbsrv are unloadable modules. - */ -int (*znfsexport_fs)(void *arg); -int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); -int (*zsmbexport_fs)(void *arg, boolean_t add_share); - -int zfs_nfsshare_inited; -int zfs_smbshare_inited; - -ddi_modhandle_t nfs_mod; -ddi_modhandle_t sharefs_mod; -ddi_modhandle_t smbsrv_mod; -#endif /* illumos */ -kmutex_t zfs_share_lock; - -#ifdef illumos -static int -zfs_init_sharefs() -{ - int error; - - ASSERT(MUTEX_HELD(&zfs_share_lock)); - /* Both NFS and SMB shares also require sharetab support. 
*/ - if (sharefs_mod == NULL && ((sharefs_mod = - ddi_modopen("fs/sharefs", - KRTLD_MODE_FIRST, &error)) == NULL)) { - return (SET_ERROR(ENOSYS)); - } - if (zshare_fs == NULL && ((zshare_fs = - (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) - ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { - return (SET_ERROR(ENOSYS)); - } - return (0); -} -#endif /* illumos */ - -static int -zfs_ioc_share(zfs_cmd_t *zc) -{ -#ifdef illumos - int error; - int opcode; - - switch (zc->zc_share.z_sharetype) { - case ZFS_SHARE_NFS: - case ZFS_UNSHARE_NFS: - if (zfs_nfsshare_inited == 0) { - mutex_enter(&zfs_share_lock); - if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", - KRTLD_MODE_FIRST, &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - if (znfsexport_fs == NULL && - ((znfsexport_fs = (int (*)(void *)) - ddi_modsym(nfs_mod, - "nfs_export", &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - error = zfs_init_sharefs(); - if (error != 0) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - zfs_nfsshare_inited = 1; - mutex_exit(&zfs_share_lock); - } - break; - case ZFS_SHARE_SMB: - case ZFS_UNSHARE_SMB: - if (zfs_smbshare_inited == 0) { - mutex_enter(&zfs_share_lock); - if (smbsrv_mod == NULL && ((smbsrv_mod = - ddi_modopen("drv/smbsrv", - KRTLD_MODE_FIRST, &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - if (zsmbexport_fs == NULL && ((zsmbexport_fs = - (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, - "smb_server_share", &error)) == NULL)) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - error = zfs_init_sharefs(); - if (error != 0) { - mutex_exit(&zfs_share_lock); - return (SET_ERROR(ENOSYS)); - } - zfs_smbshare_inited = 1; - mutex_exit(&zfs_share_lock); - } - break; - default: - return (SET_ERROR(EINVAL)); - } - - switch (zc->zc_share.z_sharetype) { - case ZFS_SHARE_NFS: - case ZFS_UNSHARE_NFS: - if (error = - znfsexport_fs((void *) - (uintptr_t)zc->zc_share.z_exportdata)) - return (error); - break; - case ZFS_SHARE_SMB: - case ZFS_UNSHARE_SMB: - if (error = zsmbexport_fs((void *) - (uintptr_t)zc->zc_share.z_exportdata, - zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? - B_TRUE: B_FALSE)) { - return (error); - } - break; - } - - opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || - zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? 
- SHAREFS_ADD : SHAREFS_REMOVE; - - /* - * Add or remove share from sharetab - */ - error = zshare_fs(opcode, - (void *)(uintptr_t)zc->zc_share.z_sharedata, - zc->zc_share.z_sharemax); - - return (error); - -#else /* !illumos */ - return (ENOSYS); -#endif /* illumos */ -} - -ace_t full_access[] = { - {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} -}; - -/* - * inputs: - * zc_name name of containing filesystem - * zc_obj object # beyond which we want next in-use object # - * - * outputs: - * zc_obj next in-use object # - */ -static int -zfs_ioc_next_obj(zfs_cmd_t *zc) -{ - objset_t *os = NULL; - int error; - - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error != 0) - return (error); - - error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); - - dmu_objset_rele(os, FTAG); - return (error); -} - -/* - * inputs: - * zc_name name of filesystem - * zc_value prefix name for snapshot - * zc_cleanup_fd cleanup-on-exit file descriptor for calling process - * - * outputs: - * zc_value short name of new snapshot - */ -static int -zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) -{ - char *snap_name; - char *hold_name; - int error; - minor_t minor; - - error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); - if (error != 0) - return (error); - - snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, - (u_longlong_t)ddi_get_lbolt64()); - hold_name = kmem_asprintf("%%%s", zc->zc_value); - - error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, - hold_name); - if (error == 0) - (void) strcpy(zc->zc_value, snap_name); - strfree(snap_name); - strfree(hold_name); - zfs_onexit_fd_rele(zc->zc_cleanup_fd); - return (error); -} - -/* - * inputs: - * zc_name name of "to" snapshot - * zc_value name of "from" snapshot - * zc_cookie file descriptor to write diff data on - * - * outputs: - * dmu_diff_record_t's to the file descriptor - */ -static int -zfs_ioc_diff(zfs_cmd_t *zc) -{ - file_t *fp; - offset_t off; - int error; - -#ifdef illumos - fp = getf(zc->zc_cookie); -#else - fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp); -#endif - if (fp == NULL) - return (SET_ERROR(EBADF)); - - off = fp->f_offset; - -#ifdef illumos - error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); -#else - error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); -#endif - - if (off >= 0 && off <= MAXOFFSET_T) - fp->f_offset = off; - releasef(zc->zc_cookie); - - return (error); -} - -#ifdef illumos -/* - * Remove all ACL files in shares dir - */ -static int -zfs_smb_acl_purge(znode_t *dzp) -{ - zap_cursor_t zc; - zap_attribute_t zap; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - int error; - - for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); - (error = zap_cursor_retrieve(&zc, &zap)) == 0; - zap_cursor_advance(&zc)) { - if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, - NULL, 0)) != 0) - break; - } - zap_cursor_fini(&zc); - return (error); -} -#endif /* illumos */ - -static int -zfs_ioc_smb_acl(zfs_cmd_t *zc) -{ -#ifdef illumos - vnode_t *vp; - znode_t *dzp; - vnode_t *resourcevp = NULL; - znode_t *sharedir; - zfsvfs_t *zfsvfs; - nvlist_t *nvlist; - char *src, *target; - vattr_t vattr; - vsecattr_t vsec; - int error = 0; - - if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NO_FOLLOW, NULL, &vp)) != 0) - return (error); - - /* Now make sure mntpnt and dataset are ZFS */ - - if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || - (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), - zc->zc_name) != 0)) { - VN_RELE(vp); - return (SET_ERROR(EINVAL)); - } - - dzp = VTOZ(vp); - zfsvfs = dzp->z_zfsvfs; - 
ZFS_ENTER(zfsvfs); - - /* - * Create share dir if its missing. - */ - mutex_enter(&zfsvfs->z_lock); - if (zfsvfs->z_shares_dir == 0) { - dmu_tx_t *tx; - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, - ZFS_SHARES_DIR); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - } else { - error = zfs_create_share_dir(zfsvfs, tx); - dmu_tx_commit(tx); - } - if (error != 0) { - mutex_exit(&zfsvfs->z_lock); - VN_RELE(vp); - ZFS_EXIT(zfsvfs); - return (error); - } - } - mutex_exit(&zfsvfs->z_lock); - - ASSERT(zfsvfs->z_shares_dir); - if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { - VN_RELE(vp); - ZFS_EXIT(zfsvfs); - return (error); - } - - switch (zc->zc_cookie) { - case ZFS_SMB_ACL_ADD: - vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; - vattr.va_type = VREG; - vattr.va_mode = S_IFREG|0777; - vattr.va_uid = 0; - vattr.va_gid = 0; - - vsec.vsa_mask = VSA_ACE; - vsec.vsa_aclentp = &full_access; - vsec.vsa_aclentsz = sizeof (full_access); - vsec.vsa_aclcnt = 1; - - error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, - &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); - if (resourcevp) - VN_RELE(resourcevp); - break; - - case ZFS_SMB_ACL_REMOVE: - error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, - NULL, 0); - break; - - case ZFS_SMB_ACL_RENAME: - if ((error = get_nvlist(zc->zc_nvlist_src, - zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { - VN_RELE(vp); - VN_RELE(ZTOV(sharedir)); - ZFS_EXIT(zfsvfs); - return (error); - } - if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || - nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, - &target)) { - VN_RELE(vp); - VN_RELE(ZTOV(sharedir)); - ZFS_EXIT(zfsvfs); - nvlist_free(nvlist); - return (error); - } - error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, - kcred, NULL, 0); - nvlist_free(nvlist); - break; - - case ZFS_SMB_ACL_PURGE: - error = zfs_smb_acl_purge(sharedir); - break; - - default: - error = SET_ERROR(EINVAL); - break; - } - - VN_RELE(vp); - VN_RELE(ZTOV(sharedir)); - - ZFS_EXIT(zfsvfs); - - return (error); -#else /* !illumos */ - return (EOPNOTSUPP); -#endif /* illumos */ -} - -/* - * innvl: { - * "holds" -> { snapname -> holdname (string), ... } - * (optional) "cleanup_fd" -> fd (int32) - * } - * - * outnvl: { - * snapname -> error value (int32) - * ... - * } - */ -static const zfs_ioc_key_t zfs_keys_hold[] = { - {"holds", DATA_TYPE_NVLIST, 0}, - {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) -{ - nvpair_t *pair; - nvlist_t *holds; - int cleanup_fd = -1; - int error; - minor_t minor = 0; - - holds = fnvlist_lookup_nvlist(args, "holds"); - - /* make sure the user didn't pass us any invalid (empty) tags */ - for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) { - char *htag; - - error = nvpair_value_string(pair, &htag); - if (error != 0) - return (SET_ERROR(error)); - - if (strlen(htag) == 0) - return (SET_ERROR(EINVAL)); - } - - if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { - error = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (error != 0) - return (error); - } - - error = dsl_dataset_user_hold(holds, minor, errlist); - if (minor != 0) - zfs_onexit_fd_rele(cleanup_fd); - return (error); -} - -/* - * innvl is not used. - * - * outnvl: { - * holdname -> time added (uint64 seconds since epoch) - * ... 
- * } - */ -static const zfs_ioc_key_t zfs_keys_get_holds[] = { - /* no nvl keys */ -}; - -/* ARGSUSED */ -static int -zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) -{ - return (dsl_dataset_get_holds(snapname, outnvl)); -} - -/* - * innvl: { - * snapname -> { holdname, ... } - * ... - * } - * - * outnvl: { - * snapname -> error value (int32) - * ... - * } - */ -static const zfs_ioc_key_t zfs_keys_release[] = { - {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) -{ - return (dsl_dataset_user_release(holds, errlist)); -} - -/* - * inputs: - * zc_name name of new filesystem or snapshot - * zc_value full name of old snapshot - * - * outputs: - * zc_cookie space in bytes - * zc_objset_type compressed space in bytes - * zc_perm_action uncompressed space in bytes - */ -static int -zfs_ioc_space_written(zfs_cmd_t *zc) -{ - int error; - dsl_pool_t *dp; - dsl_dataset_t *new, *old; - - error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); - error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = dsl_dataset_space_written(old, new, &zc->zc_cookie, - &zc->zc_objset_type, &zc->zc_perm_action); - dsl_dataset_rele(old, FTAG); - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); -} - -/* - * innvl: { - * "firstsnap" -> snapshot name - * } - * - * outnvl: { - * "used" -> space in bytes - * "compressed" -> compressed space in bytes - * "uncompressed" -> uncompressed space in bytes - * } - */ -static const zfs_ioc_key_t zfs_keys_space_snaps[] = { - {"firstsnap", DATA_TYPE_STRING, 0}, -}; - -static int -zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) -{ - int error; - dsl_pool_t *dp; - dsl_dataset_t *new, *old; - char *firstsnap; - uint64_t used, comp, uncomp; - - firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); - - error = dsl_pool_hold(lastsnap, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); - if (error == 0 && !new->ds_is_snapshot) { - dsl_dataset_rele(new, FTAG); - error = SET_ERROR(EINVAL); - } - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); - if (error == 0 && !old->ds_is_snapshot) { - dsl_dataset_rele(old, FTAG); - error = SET_ERROR(EINVAL); - } - if (error != 0) { - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); - } - - error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); - dsl_dataset_rele(old, FTAG); - dsl_dataset_rele(new, FTAG); - dsl_pool_rele(dp, FTAG); - fnvlist_add_uint64(outnvl, "used", used); - fnvlist_add_uint64(outnvl, "compressed", comp); - fnvlist_add_uint64(outnvl, "uncompressed", uncomp); - return (error); -} - -static int -zfs_ioc_jail(zfs_cmd_t *zc) -{ - - return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, - (int)zc->zc_jailid)); -} - -static int -zfs_ioc_unjail(zfs_cmd_t *zc) -{ - - return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, - (int)zc->zc_jailid)); -} - -/* - * innvl: { - * "fd" -> file descriptor to write stream to (int32) - * (optional) "fromsnap" -> full snap name to send an incremental from - * (optional) "largeblockok" -> (value 
ignored) - * indicates that blocks > 128KB are permitted - * (optional) "embedok" -> (value ignored) - * presence indicates DRR_WRITE_EMBEDDED records are permitted - * (optional) "compressok" -> (value ignored) - * presence indicates compressed DRR_WRITE records are permitted - * (optional) "resume_object" and "resume_offset" -> (uint64) - * if present, resume send stream from specified object and offset. - * } - * - * outnvl is unused - */ -static const zfs_ioc_key_t zfs_keys_send_new[] = { - {"fd", DATA_TYPE_INT32, 0}, - {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, - {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, - {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) -{ - file_t *fp; - int error; - offset_t off; - char *fromname = NULL; - int fd; - boolean_t largeblockok; - boolean_t embedok; - boolean_t compressok; - uint64_t resumeobj = 0; - uint64_t resumeoff = 0; - - fd = fnvlist_lookup_int32(innvl, "fd"); - - (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); - - largeblockok = nvlist_exists(innvl, "largeblockok"); - embedok = nvlist_exists(innvl, "embedok"); - compressok = nvlist_exists(innvl, "compressok"); - - (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); - (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); - -#ifdef illumos - file_t *fp = getf(fd); -#else - fget_write(curthread, fd, &cap_write_rights, &fp); -#endif - if (fp == NULL) - return (SET_ERROR(EBADF)); - - off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, -#ifdef illumos - fd, resumeobj, resumeoff, fp->f_vnode, &off); -#else - fd, resumeobj, resumeoff, fp, &off); -#endif - -#ifdef illumos - if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; -#else - fp->f_offset = off; -#endif - - releasef(fd); - return (error); -} - -/* - * Determine approximately how large a zfs send stream will be -- the number - * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). 
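For reference, the control nvlist that ZFS_IOC_SEND_NEW consumes above can be assembled from userland with libnvpair. This sketch only shows the shape of the innvl (a required "fd" plus optional presence flags, matching zfs_keys_send_new); the fd and snapshot name are placeholders and no ioctl is issued:

#include <libnvpair.h>
#include <stdio.h>

/* Build an innvl shaped like zfs_keys_send_new; link with -lnvpair. */
int
main(void)
{
	nvlist_t *innvl;

	if (nvlist_alloc(&innvl, NV_UNIQUE_NAME, 0) != 0)
		return (1);

	/* "fd" is required; the remaining keys are optional flags. */
	(void) nvlist_add_int32(innvl, "fd", 3);	/* placeholder */
	(void) nvlist_add_string(innvl, "fromsnap", "pool/fs@snap1");
	(void) nvlist_add_boolean(innvl, "largeblockok");
	(void) nvlist_add_boolean(innvl, "compressok");

	nvlist_print(stdout, innvl);
	nvlist_free(innvl);
	return (0);
}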
- * - * innvl: { - * (optional) "from" -> full snap or bookmark name to send an incremental - * from - * (optional) "largeblockok" -> (value ignored) - * indicates that blocks > 128KB are permitted - * (optional) "embedok" -> (value ignored) - * presence indicates DRR_WRITE_EMBEDDED records are permitted - * (optional) "compressok" -> (value ignored) - * presence indicates compressed DRR_WRITE records are permitted - * } - * - * outnvl: { - * "space" -> bytes of space (uint64) - * } - */ -static const zfs_ioc_key_t zfs_keys_send_space[] = { - {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, - {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, - {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, - {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, -}; - -static int -zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) -{ - dsl_pool_t *dp; - dsl_dataset_t *tosnap; - int error; - char *fromname; - boolean_t compressok; - uint64_t space; - - error = dsl_pool_hold(snapname, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - compressok = nvlist_exists(innvl, "compressok"); - - error = nvlist_lookup_string(innvl, "from", &fromname); - if (error == 0) { - if (strchr(fromname, '@') != NULL) { - /* - * If from is a snapshot, hold it and use the more - * efficient dmu_send_estimate to estimate send space - * size using deadlists. - */ - dsl_dataset_t *fromsnap; - error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); - if (error != 0) - goto out; - error = dmu_send_estimate(tosnap, fromsnap, compressok, - &space); - dsl_dataset_rele(fromsnap, FTAG); - } else if (strchr(fromname, '#') != NULL) { - /* - * If from is a bookmark, fetch the creation TXG of the - * snapshot it was created from and use that to find - * blocks that were born after it. - */ - zfs_bookmark_phys_t frombm; - - error = dsl_bookmark_lookup(dp, fromname, tosnap, - &frombm); - if (error != 0) - goto out; - error = dmu_send_estimate_from_txg(tosnap, - frombm.zbm_creation_txg, compressok, &space); - } else { - /* - * from is not properly formatted as a snapshot or - * bookmark - */ - error = SET_ERROR(EINVAL); - goto out; - } - } else { - /* - * If estimating the size of a full send, use dmu_send_estimate. - */ - error = dmu_send_estimate(tosnap, NULL, compressok, &space); - } - - fnvlist_add_uint64(outnvl, "space", space); - -out: - dsl_dataset_rele(tosnap, FTAG); - dsl_pool_rele(dp, FTAG); - return (error); -} - -/* - * Sync the currently open TXG to disk for the specified pool. - * This is somewhat similar to 'zfs_sync()'. - * For cases that do not result in error this ioctl will wait for - * the currently open TXG to commit before returning back to the caller. - * - * innvl: { - * "force" -> when true, force uberblock update even if there is no dirty data. - * In addition this will cause the vdev configuration to be written - * out including updating the zpool cache file. 
(boolean_t) - * } - * - * onvl is unused - */ -static const zfs_ioc_key_t zfs_keys_pool_sync[] = { - {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, -}; - -/* ARGSUSED */ -static int -zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) -{ - int err; - boolean_t force; - spa_t *spa; - - if ((err = spa_open(pool, &spa, FTAG)) != 0) - return (err); - - force = fnvlist_lookup_boolean_value(innvl, "force"); - if (force) { - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); - vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa, SCL_CONFIG, FTAG); - } - txg_wait_synced(spa_get_dsl(spa), 0); - - spa_close(spa, FTAG); - - return (err); -} - -static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; - -static void -zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, - boolean_t log_history, zfs_ioc_poolcheck_t pool_check) -{ - zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; - - ASSERT3U(ioc, >=, ZFS_IOC_FIRST); - ASSERT3U(ioc, <, ZFS_IOC_LAST); - ASSERT3P(vec->zvec_legacy_func, ==, NULL); - ASSERT3P(vec->zvec_func, ==, NULL); - - vec->zvec_legacy_func = func; - vec->zvec_secpolicy = secpolicy; - vec->zvec_namecheck = namecheck; - vec->zvec_allow_log = log_history; - vec->zvec_pool_check = pool_check; -} - -/* - * See the block comment at the beginning of this file for details on - * each argument to this function. - */ -static void -zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, - zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, - zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, - boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) -{ - zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; - - ASSERT3U(ioc, >=, ZFS_IOC_FIRST); - ASSERT3U(ioc, <, ZFS_IOC_LAST); - ASSERT3P(vec->zvec_legacy_func, ==, NULL); - ASSERT3P(vec->zvec_func, ==, NULL); - - /* if we are logging, the name must be valid */ - ASSERT(!allow_log || namecheck != NO_NAME); - - vec->zvec_name = name; - vec->zvec_func = func; - vec->zvec_secpolicy = secpolicy; - vec->zvec_namecheck = namecheck; - vec->zvec_pool_check = pool_check; - vec->zvec_smush_outnvlist = smush_outnvlist; - vec->zvec_allow_log = allow_log; - vec->zvec_nvl_keys = nvl_keys; - vec->zvec_nvl_key_count = num_keys; -} - -static void -zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy, boolean_t log_history, - zfs_ioc_poolcheck_t pool_check) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - POOL_NAME, log_history, pool_check); -} - -static void -zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - DATASET_NAME, B_FALSE, pool_check); -} - -static void -zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) -{ - zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, - POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -} - -static void -zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - NO_NAME, B_FALSE, POOL_CHECK_NONE); -} - -static void -zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, - zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - DATASET_NAME, B_FALSE, 
POOL_CHECK_SUSPENDED); -} - -static void -zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) -{ - zfs_ioctl_register_dataset_read_secpolicy(ioc, func, - zfs_secpolicy_read); -} - -static void -zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy) -{ - zfs_ioctl_register_legacy(ioc, func, secpolicy, - DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -} - -static void -zfs_ioctl_init(void) -{ - zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, - zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); - - zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, - zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, - zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); - - zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, - zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); - - zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, - zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); - - zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, - zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); - - zfs_ioctl_register("create", ZFS_IOC_CREATE, - zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); - - zfs_ioctl_register("clone", ZFS_IOC_CLONE, - zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); - - zfs_ioctl_register("remap", ZFS_IOC_REMAP, - zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, - zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); - - zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, - zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); - - zfs_ioctl_register("hold", ZFS_IOC_HOLD, - zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); - zfs_ioctl_register("release", ZFS_IOC_RELEASE, - zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); - - zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, - zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); - - zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, - zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, - zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); - - zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, - zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_bookmark, 
ARRAY_SIZE(zfs_keys_bookmark)); - - zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, - zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, - zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); - - zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, - zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, - POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_destroy_bookmarks, - ARRAY_SIZE(zfs_keys_destroy_bookmarks)); - - zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, - zfs_ioc_channel_program, zfs_secpolicy_config, - POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, - B_TRUE, zfs_keys_channel_program, - ARRAY_SIZE(zfs_keys_channel_program)); - - zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, - zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); - - zfs_ioctl_register("zpool_discard_checkpoint", - ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, - zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_pool_discard_checkpoint, - ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); - - zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, - zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, - zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); - - zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, - zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, - zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); - zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, - zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, - B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); - - /* IOCTLS that use the legacy function signature */ - - zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, - zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); - - zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, - zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, - zfs_ioc_pool_scan); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, - zfs_ioc_pool_upgrade); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, - zfs_ioc_vdev_add); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, - zfs_ioc_vdev_remove); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, - zfs_ioc_vdev_set_state); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, - zfs_ioc_vdev_attach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, - zfs_ioc_vdev_detach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, - zfs_ioc_vdev_setpath); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, - zfs_ioc_vdev_setfru); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, - zfs_ioc_pool_set_props); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, - zfs_ioc_vdev_split); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, - zfs_ioc_pool_reguid); - - zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, - zfs_ioc_pool_configs, zfs_secpolicy_none); - zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, - zfs_ioc_pool_tryimport, zfs_secpolicy_config); - zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, - 
zfs_ioc_inject_fault, zfs_secpolicy_inject); - zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, - zfs_ioc_clear_fault, zfs_secpolicy_inject); - zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, - zfs_ioc_inject_list_next, zfs_secpolicy_inject); - - /* - * pool destroy, and export don't log the history as part of - * zfsdev_ioctl, but rather zfs_ioc_pool_export - * does the logging of those commands. - */ - zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); - - zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, - zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, - zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); - - zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, - zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, - zfs_ioc_dsobj_to_dsname, - zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, - zfs_ioc_pool_get_history, - zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); - - zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, - zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); - - zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, - zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); - - zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, - zfs_ioc_space_written); - zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, - zfs_ioc_objset_recvd_props); - zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, - zfs_ioc_next_obj); - zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, - zfs_ioc_get_fsacl); - zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, - zfs_ioc_objset_stats); - zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, - zfs_ioc_objset_zplprops); - zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, - zfs_ioc_dataset_list_next); - zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, - zfs_ioc_snapshot_list_next); - zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, - zfs_ioc_send_progress); - - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, - zfs_ioc_diff, zfs_secpolicy_diff); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, - zfs_ioc_obj_to_stats, zfs_secpolicy_diff); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, - zfs_ioc_obj_to_path, zfs_secpolicy_diff); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, - zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, - zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); - zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, - zfs_ioc_send, zfs_secpolicy_send); - - zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, - zfs_secpolicy_none); - zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, - zfs_secpolicy_destroy); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, - zfs_secpolicy_recv); - zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, - zfs_secpolicy_promote); - zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, - zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); - zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, - zfs_secpolicy_set_fsacl); - - /* - * Not using 
zfs_ioctl_register_dataset_modify as DATASET_NAME check - * won't allow a bookmark name. - */ - zfs_ioctl_register_legacy(ZFS_IOC_RENAME, zfs_ioc_rename, - zfs_secpolicy_rename, ENTITY_NAME, B_TRUE, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); - - zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, - zfs_secpolicy_share, POOL_CHECK_NONE); - zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, - zfs_secpolicy_smb_acl, POOL_CHECK_NONE); - zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, - zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); - zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, - zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); - -#ifdef __FreeBSD__ - zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, - zfs_secpolicy_config, POOL_CHECK_NONE); - zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, - zfs_secpolicy_config, POOL_CHECK_NONE); - zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, - zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, - POOL_CHECK_NONE, B_FALSE, B_FALSE, - zfs_keys_nextboot, ARRAY_SIZE(zfs_keys_nextboot)); -#endif -} - -/* - * Verify that for non-legacy ioctls the input nvlist - * pairs match against the expected input. - * - * Possible errors are: - * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered - * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing - * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair - */ -static int -zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) -{ - const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; - boolean_t required_keys_found = B_FALSE; - - /* - * examine each input pair - */ - for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); - pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *name = nvpair_name(pair); - data_type_t type = nvpair_type(pair); - boolean_t identified = B_FALSE; - - /* - * check pair against the documented names and type - */ - for (int k = 0; k < vec->zvec_nvl_key_count; k++) { - /* if not a wild card name, check for an exact match */ - if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && - strcmp(nvl_keys[k].zkey_name, name) != 0) - continue; - - identified = B_TRUE; - - if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && - nvl_keys[k].zkey_type != type) { - return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); - } - - if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) - continue; - - required_keys_found = B_TRUE; - break; - } - - /* allow an 'optional' key, everything else is invalid */ - if (!identified && - (strcmp(name, "optional") != 0 || - type != DATA_TYPE_NVLIST)) { - return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); - } - } - - /* verify that all required keys were found */ - for (int k = 0; k < vec->zvec_nvl_key_count; k++) { - if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) - continue; - - if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { - /* at least one non-optionial key is expected here */ - if (!required_keys_found) - return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); - continue; - } - - if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) - return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); - } - - return (0); -} - -int -pool_status_check(const char *name, zfs_ioc_namecheck_t type, - zfs_ioc_poolcheck_t check) -{ - spa_t *spa; - int error; - - ASSERT(type == POOL_NAME || type == DATASET_NAME || - type == ENTITY_NAME); - - if (check & POOL_CHECK_NONE) - return (0); - - error = spa_open(name, &spa, FTAG); - if (error == 
0) { - if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) - error = SET_ERROR(EAGAIN); - else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) - error = SET_ERROR(EROFS); - spa_close(spa, FTAG); - } - return (error); -} - -/* - * Find a free minor number. - */ -minor_t -zfsdev_minor_alloc(void) -{ - static minor_t last_minor; - minor_t m; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - for (m = last_minor + 1; m != last_minor; m++) { - if (m > ZFSDEV_MAX_MINOR) - m = 1; - if (ddi_get_soft_state(zfsdev_state, m) == NULL) { - last_minor = m; - return (m); - } - } - - return (0); -} - -static int -zfs_ctldev_init(struct cdev *devp) -{ - minor_t minor; - zfs_soft_state_t *zs; - - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - minor = zfsdev_minor_alloc(); - if (minor == 0) - return (SET_ERROR(ENXIO)); - - if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) - return (SET_ERROR(EAGAIN)); - - devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close); - - zs = ddi_get_soft_state(zfsdev_state, minor); - zs->zss_type = ZSST_CTLDEV; - zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); - - return (0); -} - -static void -zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) -{ - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - zfs_onexit_destroy(zo); - ddi_soft_state_free(zfsdev_state, minor); -} - -void * -zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) -{ - zfs_soft_state_t *zp; - - zp = ddi_get_soft_state(zfsdev_state, minor); - if (zp == NULL || zp->zss_type != which) - return (NULL); - - return (zp->zss_data); -} - -static int -zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) -{ - int error = 0; - -#ifdef illumos - if (getminor(*devp) != 0) - return (zvol_open(devp, flag, otyp, cr)); -#endif - - /* This is the control device. Allocate a new minor if requested. 
*/ - if (flag & FEXCL) { - mutex_enter(&spa_namespace_lock); - error = zfs_ctldev_init(devp); - mutex_exit(&spa_namespace_lock); - } - - return (error); -} - -static void -zfsdev_close(void *data) -{ - zfs_onexit_t *zo; - minor_t minor = (minor_t)(uintptr_t)data; - - if (minor == 0) - return; - - mutex_enter(&spa_namespace_lock); - zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); - if (zo == NULL) { - mutex_exit(&spa_namespace_lock); - return; - } - zfs_ctldev_destroy(zo, minor); - mutex_exit(&spa_namespace_lock); -} - -static int -zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, - struct thread *td) -{ - zfs_cmd_t *zc; - uint_t vecnum; - int error, rc, len; -#ifdef illumos - minor_t minor = getminor(dev); -#else - zfs_iocparm_t *zc_iocparm; - int cflag, cmd, oldvecnum; - boolean_t newioc, compat; - void *compat_zc = NULL; - cred_t *cr = td->td_ucred; -#endif - const zfs_ioc_vec_t *vec; - char *saved_poolname = NULL; - nvlist_t *innvl = NULL; - - cflag = ZFS_CMD_COMPAT_NONE; - compat = B_FALSE; - newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */ - - len = IOCPARM_LEN(zcmd); - vecnum = cmd = zcmd & 0xff; - - /* - * Check if we are talking to supported older binaries - * and translate zfs_cmd if necessary - */ - if (len != sizeof(zfs_iocparm_t)) { - newioc = B_FALSE; - compat = B_TRUE; - - vecnum = cmd; - - switch (len) { - case sizeof(zfs_cmd_zcmd_t): - cflag = ZFS_CMD_COMPAT_LZC; - break; - case sizeof(zfs_cmd_deadman_t): - cflag = ZFS_CMD_COMPAT_DEADMAN; - break; - case sizeof(zfs_cmd_v28_t): - cflag = ZFS_CMD_COMPAT_V28; - break; - case sizeof(zfs_cmd_v15_t): - if (cmd >= sizeof(zfs_ioctl_v15_to_v28) / - sizeof(zfs_ioctl_v15_to_v28[0])) - return (EINVAL); - - cflag = ZFS_CMD_COMPAT_V15; - vecnum = zfs_ioctl_v15_to_v28[cmd]; - - /* - * Return without further handling - * if the command is blacklisted. 
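The compatibility handling above reduces to one idea: use the struct size the caller reported to pick a decoder, so binaries built against older zfs_cmd_t layouts keep working. A toy standalone illustration of that dispatch -- the request structs are invented and bear no relation to the real zfs_cmd layouts:

#include <stddef.h>
#include <stdio.h>

/* Invented request layouts standing in for zfs_cmd_t revisions. */
struct req_v1 { int op; };
struct req_v2 { int op; int flags; };

enum compat { COMPAT_NONE, COMPAT_V1, COMPAT_UNSUPPORTED };

static enum compat
classify(size_t len)
{
	switch (len) {
	case sizeof (struct req_v2):
		return (COMPAT_NONE);		/* current layout */
	case sizeof (struct req_v1):
		return (COMPAT_V1);		/* translate before use */
	default:
		return (COMPAT_UNSUPPORTED);	/* reject, like EINVAL above */
	}
}

int
main(void)
{
	printf("v2-sized request -> %d\n", classify(sizeof (struct req_v2)));
	printf("v1-sized request -> %d\n", classify(sizeof (struct req_v1)));
	printf("unknown size     -> %d\n", classify(3));
	return (0);
}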
- */ - if (vecnum == ZFS_IOC_COMPAT_PASS) - return (0); - else if (vecnum == ZFS_IOC_COMPAT_FAIL) - return (ENOTSUP); - break; - default: - return (EINVAL); - } - } - -#ifdef illumos - vecnum = cmd - ZFS_IOC_FIRST; - ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); -#endif - - if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) - return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); - vec = &zfs_ioc_vec[vecnum]; - - zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); - -#ifdef illumos - error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); - if (error != 0) { - error = SET_ERROR(EFAULT); - goto out; - } -#else /* !illumos */ - bzero(zc, sizeof(zfs_cmd_t)); - - if (newioc) { - zc_iocparm = (void *)arg; - - switch (zc_iocparm->zfs_ioctl_version) { - case ZFS_IOCVER_CURRENT: - if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) { - error = SET_ERROR(EINVAL); - goto out; - } - break; - case ZFS_IOCVER_INLANES: - if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) { - error = SET_ERROR(EFAULT); - goto out; - } - compat = B_TRUE; - cflag = ZFS_CMD_COMPAT_INLANES; - break; - case ZFS_IOCVER_RESUME: - if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) { - error = SET_ERROR(EFAULT); - goto out; - } - compat = B_TRUE; - cflag = ZFS_CMD_COMPAT_RESUME; - break; - case ZFS_IOCVER_EDBP: - if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) { - error = SET_ERROR(EFAULT); - goto out; - } - compat = B_TRUE; - cflag = ZFS_CMD_COMPAT_EDBP; - break; - case ZFS_IOCVER_ZCMD: - if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) || - zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) { - error = SET_ERROR(EFAULT); - goto out; - } - compat = B_TRUE; - cflag = ZFS_CMD_COMPAT_ZCMD; - break; - default: - error = SET_ERROR(EINVAL); - goto out; - /* NOTREACHED */ - } - - if (compat) { - ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); - compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); - bzero(compat_zc, sizeof(zfs_cmd_t)); - - error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, - compat_zc, zc_iocparm->zfs_cmd_size, flag); - if (error != 0) { - error = SET_ERROR(EFAULT); - goto out; - } - } else { - error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, - zc, zc_iocparm->zfs_cmd_size, flag); - if (error != 0) { - error = SET_ERROR(EFAULT); - goto out; - } - } - } - - if (compat) { - if (newioc) { - ASSERT(compat_zc != NULL); - zfs_cmd_compat_get(zc, compat_zc, cflag); - } else { - ASSERT(compat_zc == NULL); - zfs_cmd_compat_get(zc, arg, cflag); - } - oldvecnum = vecnum; - error = zfs_ioctl_compat_pre(zc, &vecnum, cflag); - if (error != 0) - goto out; - if (oldvecnum != vecnum) - vec = &zfs_ioc_vec[vecnum]; - } -#endif /* !illumos */ - - zc->zc_iflags = flag & FKIOCTL; - if (zc->zc_nvlist_src_size != 0) { - error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &innvl); - if (error != 0) - goto out; - } - - /* rewrite innvl for backwards compatibility */ - if (compat) - innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag); - - /* - * Ensure that all pool/dataset names are valid before we pass down to - * the lower layers. 
- */ - zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; - switch (vec->zvec_namecheck) { - case POOL_NAME: - if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) - error = SET_ERROR(EINVAL); - else - error = pool_status_check(zc->zc_name, - vec->zvec_namecheck, vec->zvec_pool_check); - break; - - case DATASET_NAME: - if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) - error = SET_ERROR(EINVAL); - else - error = pool_status_check(zc->zc_name, - vec->zvec_namecheck, vec->zvec_pool_check); - break; - - case ENTITY_NAME: - if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { - error = SET_ERROR(EINVAL); - } else { - error = pool_status_check(zc->zc_name, - vec->zvec_namecheck, vec->zvec_pool_check); - } - break; - - case NO_NAME: - break; - } - - /* - * Ensure that all input pairs are valid before we pass them down - * to the lower layers. - * - * The vectored functions can use fnvlist_lookup_{type} for any - * required pairs since zfs_check_input_nvpairs() confirmed that - * they exist and are of the correct type. - */ - if (error == 0 && vec->zvec_func != NULL) { - error = zfs_check_input_nvpairs(innvl, vec); - if (error != 0) - goto out; - } - - if (error == 0) - error = vec->zvec_secpolicy(zc, innvl, cr); - - if (error != 0) - goto out; - - /* legacy ioctls can modify zc_name */ - len = strcspn(zc->zc_name, "/@#") + 1; - saved_poolname = kmem_alloc(len, KM_SLEEP); - (void) strlcpy(saved_poolname, zc->zc_name, len); - - if (vec->zvec_func != NULL) { - nvlist_t *outnvl; - int puterror = 0; - spa_t *spa; - nvlist_t *lognv = NULL; - - ASSERT(vec->zvec_legacy_func == NULL); - - /* - * Add the innvl to the lognv before calling the func, - * in case the func changes the innvl. - */ - if (vec->zvec_allow_log) { - lognv = fnvlist_alloc(); - fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, - vec->zvec_name); - if (!nvlist_empty(innvl)) { - fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, - innvl); - } - } - - outnvl = fnvlist_alloc(); - error = vec->zvec_func(zc->zc_name, innvl, outnvl); - - /* - * Some commands can partially execute, modify state, and still - * return an error. In these cases, attempt to record what - * was modified. - */ - if ((error == 0 || - (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && - vec->zvec_allow_log && - spa_open(zc->zc_name, &spa, FTAG) == 0) { - if (!nvlist_empty(outnvl)) { - fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, - outnvl); - } - if (error != 0) { - fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, - error); - } - (void) spa_history_log_nvl(spa, lognv); - spa_close(spa, FTAG); - } - fnvlist_free(lognv); - - /* rewrite outnvl for backwards compatibility */ - if (compat) - outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum, - cflag); - - if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { - int smusherror = 0; - if (vec->zvec_smush_outnvlist) { - smusherror = nvlist_smush(outnvl, - zc->zc_nvlist_dst_size); - } - if (smusherror == 0) - puterror = put_nvlist(zc, outnvl); - } - - if (puterror != 0) - error = puterror; - - nvlist_free(outnvl); - } else { - error = vec->zvec_legacy_func(zc); - } - -out: - nvlist_free(innvl); - -#if defined(__FreeBSD__) && defined(_KERNEL) - /* - * Wait for ZVOL changes to get applied. - * NB: taskqueue_drain_all() does less than taskq_wait(), - * but enough for what we want. - * And there is no equivalent illumos API. 
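The saved_poolname logic above leans on strcspn() to isolate the pool component of whatever name the ioctl carried, since '/', '@' and '#' all terminate the pool part. The same trick as a self-contained snippet:

#include <stdio.h>
#include <string.h>

/* Print the pool component of a dataset, snapshot or bookmark name. */
static void
print_pool(const char *name)
{
	size_t len = strcspn(name, "/@#");

	printf("%-24s -> %.*s\n", name, (int)len, name);
}

int
main(void)
{
	print_pool("tank");
	print_pool("tank/home/user");
	print_pool("tank/home@yesterday");
	print_pool("tank#bm1");
	return (0);
}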
- */ - if (error == 0) { - spa_t *spa; - - if (spa_open(saved_poolname, &spa, FTAG) == 0) { - taskqueue_drain_all( - spa->spa_zvol_taskq->tq_queue); - spa_close(spa, FTAG); - } - } -#endif - -#ifdef illumos - rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); - if (error == 0 && rc != 0) - error = SET_ERROR(EFAULT); -#else - if (compat) { - zfs_ioctl_compat_post(zc, cmd, cflag); - if (newioc) { - ASSERT(compat_zc != NULL); - ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); - - zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag); - rc = ddi_copyout(compat_zc, - (void *)(uintptr_t)zc_iocparm->zfs_cmd, - zc_iocparm->zfs_cmd_size, flag); - if (error == 0 && rc != 0) - error = SET_ERROR(EFAULT); - kmem_free(compat_zc, sizeof (zfs_cmd_t)); - } else { - zfs_cmd_compat_put(zc, arg, vecnum, cflag); - } - } else { - ASSERT(newioc); - - rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd, - sizeof (zfs_cmd_t), flag); - if (error == 0 && rc != 0) - error = SET_ERROR(EFAULT); - } -#endif - if (error == 0 && vec->zvec_allow_log) { - char *s = tsd_get(zfs_allow_log_key); - if (s != NULL) - strfree(s); - (void) tsd_set(zfs_allow_log_key, saved_poolname); - } else { - if (saved_poolname != NULL) - strfree(saved_poolname); - } - - kmem_free(zc, sizeof (zfs_cmd_t)); - return (error); -} - -#ifdef illumos -static int -zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) - return (DDI_FAILURE); - - if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, - DDI_PSEUDO, 0) == DDI_FAILURE) - return (DDI_FAILURE); - - zfs_dip = dip; - - ddi_report_dev(dip); - - return (DDI_SUCCESS); -} - -static int -zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - if (spa_busy() || zfs_busy() || zvol_busy()) - return (DDI_FAILURE); - - if (cmd != DDI_DETACH) - return (DDI_FAILURE); - - zfs_dip = NULL; - - ddi_prop_remove_all(dip); - ddi_remove_minor_node(dip, NULL); - - return (DDI_SUCCESS); -} - -/*ARGSUSED*/ -static int -zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) -{ - switch (infocmd) { - case DDI_INFO_DEVT2DEVINFO: - *result = zfs_dip; - return (DDI_SUCCESS); - - case DDI_INFO_DEVT2INSTANCE: - *result = (void *)0; - return (DDI_SUCCESS); - } - - return (DDI_FAILURE); -} -#endif /* illumos */ - -/* - * OK, so this is a little weird. - * - * /dev/zfs is the control node, i.e. minor 0. - * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. - * - * /dev/zfs has basically nothing to do except serve up ioctls, - * so most of the standard driver entry points are in zvol.c. 
- */ -#ifdef illumos -static struct cb_ops zfs_cb_ops = { - zfsdev_open, /* open */ - zfsdev_close, /* close */ - zvol_strategy, /* strategy */ - nodev, /* print */ - zvol_dump, /* dump */ - zvol_read, /* read */ - zvol_write, /* write */ - zfsdev_ioctl, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, /* prop_op */ - NULL, /* streamtab */ - D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ - CB_REV, /* version */ - nodev, /* async read */ - nodev, /* async write */ -}; - -static struct dev_ops zfs_dev_ops = { - DEVO_REV, /* version */ - 0, /* refcnt */ - zfs_info, /* info */ - nulldev, /* identify */ - nulldev, /* probe */ - zfs_attach, /* attach */ - zfs_detach, /* detach */ - nodev, /* reset */ - &zfs_cb_ops, /* driver operations */ - NULL, /* no bus operations */ - NULL, /* power */ - ddi_quiesce_not_needed, /* quiesce */ -}; - -static struct modldrv zfs_modldrv = { - &mod_driverops, - "ZFS storage pool", - &zfs_dev_ops -}; - -static struct modlinkage modlinkage = { - MODREV_1, - (void *)&zfs_modlfs, - (void *)&zfs_modldrv, - NULL -}; -#endif /* illumos */ - -static struct cdevsw zfs_cdevsw = { - .d_version = D_VERSION, - .d_open = zfsdev_open, - .d_ioctl = zfsdev_ioctl, - .d_name = ZFS_DEV_NAME -}; - -static void -zfs_allow_log_destroy(void *arg) -{ - char *poolname = arg; - strfree(poolname); -} - -static void -zfsdev_init(void) -{ - zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, - ZFS_DEV_NAME); -} - -static void -zfsdev_fini(void) -{ - if (zfsdev != NULL) - destroy_dev(zfsdev); -} - -static struct root_hold_token *zfs_root_token; - -#ifdef illumos -int -_init(void) -{ - int error; - - spa_init(FREAD | FWRITE); - zfs_init(); - zvol_init(); - zfs_ioctl_init(); - - if ((error = mod_install(&modlinkage)) != 0) { - zvol_fini(); - zfs_fini(); - spa_fini(); - return (error); - } - - tsd_create(&zfs_fsyncer_key, NULL); - tsd_create(&rrw_tsd_key, rrw_tsd_destroy); - tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); - - error = ldi_ident_from_mod(&modlinkage, &zfs_li); - ASSERT(error == 0); - mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); - - return (0); -} - -int -_fini(void) -{ - int error; - - if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) - return (SET_ERROR(EBUSY)); - - if ((error = mod_remove(&modlinkage)) != 0) - return (error); - - zvol_fini(); - zfs_fini(); - spa_fini(); - if (zfs_nfsshare_inited) - (void) ddi_modclose(nfs_mod); - if (zfs_smbshare_inited) - (void) ddi_modclose(smbsrv_mod); - if (zfs_nfsshare_inited || zfs_smbshare_inited) - (void) ddi_modclose(sharefs_mod); - - tsd_destroy(&zfs_fsyncer_key); - ldi_ident_release(zfs_li); - zfs_li = NULL; - mutex_destroy(&zfs_share_lock); - - return (error); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} -#endif /* illumos */ - -static int zfs__init(void); -static int zfs__fini(void); -static void zfs_shutdown(void *, int); - -static eventhandler_tag zfs_shutdown_event_tag; - -#ifdef __FreeBSD__ -#define ZFS_MIN_KSTACK_PAGES 4 -#endif - -int -zfs__init(void) -{ - -#ifdef __FreeBSD__ -#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES - printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " - "overflow panic!\nPlease consider adding " - "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, - ZFS_MIN_KSTACK_PAGES); -#endif -#endif - zfs_root_token = root_mount_hold("ZFS"); - - mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); - - spa_init(FREAD 
| FWRITE); - zfs_init(); - zvol_init(); - zfs_ioctl_init(); - - tsd_create(&zfs_fsyncer_key, NULL); - tsd_create(&rrw_tsd_key, rrw_tsd_destroy); - tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); - tsd_create(&zfs_geom_probe_vdev_key, NULL); - - printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); - root_mount_rel(zfs_root_token); - - zfsdev_init(); - - return (0); -} - -int -zfs__fini(void) -{ - if (spa_busy() || zfs_busy() || zvol_busy() || - zio_injection_enabled) { - return (EBUSY); - } - - zfsdev_fini(); - zvol_fini(); - zfs_fini(); - spa_fini(); - - tsd_destroy(&zfs_fsyncer_key); - tsd_destroy(&rrw_tsd_key); - tsd_destroy(&zfs_allow_log_key); - - mutex_destroy(&zfs_share_lock); - - return (0); -} - -static void -zfs_shutdown(void *arg __unused, int howto __unused) -{ - - /* - * ZFS fini routines can not properly work in a panic-ed system. - */ - if (!KERNEL_PANICKED()) - (void)zfs__fini(); -} - - -static int -zfs_modevent(module_t mod, int type, void *unused __unused) -{ - int err; - - switch (type) { - case MOD_LOAD: - err = zfs__init(); - if (err == 0) - zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( - shutdown_post_sync, zfs_shutdown, NULL, - SHUTDOWN_PRI_FIRST); - return (err); - case MOD_UNLOAD: - err = zfs__fini(); - if (err == 0 && zfs_shutdown_event_tag != NULL) - EVENTHANDLER_DEREGISTER(shutdown_post_sync, - zfs_shutdown_event_tag); - return (err); - case MOD_SHUTDOWN: - return (0); - default: - break; - } - return (EOPNOTSUPP); -} - -static moduledata_t zfs_mod = { - "zfsctrl", - zfs_modevent, - 0 -}; -DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY); -MODULE_VERSION(zfsctrl, 1); -MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1); -MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1); -MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); -MODULE_DEPEND(zfsctrl, zlib, 1, 1, 1); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, 2018 by Delphix. All rights reserved. 
- * Copyright (c) 2014 Integros [integros.com] - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * These zfs_log_* functions must be called within a dmu tx, in one - * of 2 contexts depending on zilog->z_replay: - * - * Non replay mode - * --------------- - * We need to record the transaction so that if it is committed to - * the Intent Log then it can be replayed. An intent log transaction - * structure (itx_t) is allocated and all the information necessary to - * possibly replay the transaction is saved in it. The itx is then assigned - * a sequence number and inserted in the in-memory list anchored in the zilog. - * - * Replay mode - * ----------- - * We need to mark the intent log record as replayed in the log header. - * This is done in the same transaction as the replay so that they - * commit atomically. - */ - -int -zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) -{ - int isxvattr = (vap->va_mask & AT_XVATTR); - switch (type) { - case Z_FILE: - if (vsecp == NULL && !isxvattr) - return (TX_CREATE); - if (vsecp && isxvattr) -#ifdef TODO - return (TX_CREATE_ACL_ATTR); -#else - panic("%s:%u: unsupported condition", __func__, __LINE__); -#endif - if (vsecp) - return (TX_CREATE_ACL); - else - return (TX_CREATE_ATTR); - /*NOTREACHED*/ - case Z_DIR: - if (vsecp == NULL && !isxvattr) - return (TX_MKDIR); - if (vsecp && isxvattr) -#ifdef TODO - return (TX_MKDIR_ACL_ATTR); -#else - panic("%s:%u: unsupported condition", __func__, __LINE__); -#endif - if (vsecp) - return (TX_MKDIR_ACL); - else - return (TX_MKDIR_ATTR); - case Z_XATTRDIR: - return (TX_MKXATTR); - } - ASSERT(0); - return (TX_MAX_TYPE); -} - -/* - * build up the log data necessary for logging xvattr_t - * First lr_attr_t is initialized. following the lr_attr_t - * is the mapsize and attribute bitmap copied from the xvattr_t. - * Following the bitmap and bitmapsize two 64 bit words are reserved - * for the create time which may be set. Following the create time - * records a single 64 bit integer which has the bits to set on - * replay for the xvattr. - */ -static void -zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) -{ - uint32_t *bitmap; - uint64_t *attrs; - uint64_t *crtime; - xoptattr_t *xoap; - void *scanstamp; - int i; - - xoap = xva_getxoptattr(xvap); - ASSERT(xoap); - - lrattr->lr_attr_masksize = xvap->xva_mapsize; - bitmap = &lrattr->lr_attr_bitmap; - for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { - *bitmap = xvap->xva_reqattrmap[i]; - } - - /* Now pack the attributes up in a single uint64_t */ - attrs = (uint64_t *)bitmap; - crtime = attrs + 1; - scanstamp = (caddr_t)(crtime + 2); - *attrs = 0; - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) - *attrs |= (xoap->xoa_readonly == 0) ? 0 : - XAT0_READONLY; - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) - *attrs |= (xoap->xoa_hidden == 0) ? 0 : - XAT0_HIDDEN; - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) - *attrs |= (xoap->xoa_system == 0) ? 0 : - XAT0_SYSTEM; - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) - *attrs |= (xoap->xoa_archive == 0) ? 0 : - XAT0_ARCHIVE; - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) - *attrs |= (xoap->xoa_immutable == 0) ? 0 : - XAT0_IMMUTABLE; - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) - *attrs |= (xoap->xoa_nounlink == 0) ? 0 : - XAT0_NOUNLINK; - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) - *attrs |= (xoap->xoa_appendonly == 0) ? 
0 : - XAT0_APPENDONLY; - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) - *attrs |= (xoap->xoa_opaque == 0) ? 0 : - XAT0_APPENDONLY; - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) - *attrs |= (xoap->xoa_nodump == 0) ? 0 : - XAT0_NODUMP; - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) - *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : - XAT0_AV_QUARANTINED; - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) - *attrs |= (xoap->xoa_av_modified == 0) ? 0 : - XAT0_AV_MODIFIED; - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) - ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) - *attrs |= (xoap->xoa_reparse == 0) ? 0 : - XAT0_REPARSE; - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) - *attrs |= (xoap->xoa_offline == 0) ? 0 : - XAT0_OFFLINE; - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) - *attrs |= (xoap->xoa_sparse == 0) ? 0 : - XAT0_SPARSE; -} - -static void * -zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) -{ - zfs_fuid_t *zfuid; - uint64_t *fuidloc = start; - - /* First copy in the ACE FUIDs */ - for (zfuid = list_head(&fuidp->z_fuids); zfuid; - zfuid = list_next(&fuidp->z_fuids, zfuid)) { - *fuidloc++ = zfuid->z_logfuid; - } - return (fuidloc); -} - - -static void * -zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) -{ - zfs_fuid_domain_t *zdomain; - - /* now copy in the domain info, if any */ - if (fuidp->z_domain_str_sz != 0) { - for (zdomain = list_head(&fuidp->z_domains); zdomain; - zdomain = list_next(&fuidp->z_domains, zdomain)) { - bcopy((void *)zdomain->z_domain, start, - strlen(zdomain->z_domain) + 1); - start = (caddr_t)start + - strlen(zdomain->z_domain) + 1; - } - } - return (start); -} - -/* - * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and - * TK_MKXATTR transactions. - * - * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID - * domain information appended prior to the name. In this case the - * uid/gid in the log record will be a log centric FUID. - * - * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that - * may contain attributes, ACL and optional fuid information. - * - * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify - * and ACL and normal users/groups in the ACEs. - * - * There may be an optional xvattr attribute information similar - * to zfs_log_setattr. - * - * Also, after the file name "domain" strings may be appended. - */ -void -zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, - zfs_fuid_info_t *fuidp, vattr_t *vap) -{ - itx_t *itx; - lr_create_t *lr; - lr_acl_create_t *lracl; - size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0; - size_t xvatsize = 0; - size_t txsize; - xvattr_t *xvap = (xvattr_t *)vap; - void *end; - size_t lrsize; - size_t namesize = strlen(name) + 1; - size_t fuidsz = 0; - - if (zil_replaying(zilog, tx)) - return; - - /* - * If we have FUIDs present then add in space for - * domains and ACE fuid's if any. 
- */ - if (fuidp) { - fuidsz += fuidp->z_domain_str_sz; - fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); - } - - if (vap->va_mask & AT_XVATTR) - xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); - - if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || - (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || - (int)txtype == TX_MKXATTR) { - txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; - lrsize = sizeof (*lr); - } else { - txsize = - sizeof (lr_acl_create_t) + namesize + fuidsz + - ZIL_ACE_LENGTH(aclsize) + xvatsize; - lrsize = sizeof (lr_acl_create_t); - } - - itx = zil_itx_create(txtype, txsize); - - lr = (lr_create_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_foid = zp->z_id; - /* Store dnode slot count in 8 bits above object id. */ - LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT); - lr->lr_mode = zp->z_mode; - if (!IS_EPHEMERAL(zp->z_uid)) { - lr->lr_uid = (uint64_t)zp->z_uid; - } else { - lr->lr_uid = fuidp->z_fuid_owner; - } - if (!IS_EPHEMERAL(zp->z_gid)) { - lr->lr_gid = (uint64_t)zp->z_gid; - } else { - lr->lr_gid = fuidp->z_fuid_group; - } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, - sizeof (uint64_t)); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), - lr->lr_crtime, sizeof (uint64_t) * 2); - - if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev, - sizeof (lr->lr_rdev)) != 0) - lr->lr_rdev = 0; - - /* - * Fill in xvattr info if any - */ - if (vap->va_mask & AT_XVATTR) { - zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); - end = (caddr_t)lr + lrsize + xvatsize; - } else { - end = (caddr_t)lr + lrsize; - } - - /* Now fill in any ACL info */ - - if (vsecp) { - lracl = (lr_acl_create_t *)&itx->itx_lr; - lracl->lr_aclcnt = vsecp->vsa_aclcnt; - lracl->lr_acl_bytes = aclsize; - lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; - lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; - if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) - lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; - else - lracl->lr_acl_flags = 0; - - bcopy(vsecp->vsa_aclentp, end, aclsize); - end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); - } - - /* drop in FUID info */ - if (fuidp) { - end = zfs_log_fuid_ids(fuidp, end); - end = zfs_log_fuid_domains(fuidp, end); - } - /* - * Now place file name in log record - */ - bcopy(name, end, namesize); - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles both TX_REMOVE and TX_RMDIR transactions. - */ -void -zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid) -{ - itx_t *itx; - lr_remove_t *lr; - size_t namesize = strlen(name) + 1; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); - lr = (lr_remove_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - bcopy(name, (char *)(lr + 1), namesize); - - itx->itx_oid = foid; - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_LINK transactions. - */ -void -zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name) -{ - itx_t *itx; - lr_link_t *lr; - size_t namesize = strlen(name) + 1; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize); - lr = (lr_link_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_link_obj = zp->z_id; - bcopy(name, (char *)(lr + 1), namesize); - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_SYMLINK transactions. 
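
Aside: the zfs_log_* routines in this removed file all follow one shape, visible in zfs_log_remove() and zfs_log_link() above: bail out if the record is being replayed, allocate an itx sized for the fixed record plus any variable data, fill it in, and queue it on the ZIL inside the same dmu transaction. A minimal sketch of that shape, modeled directly on the removed zfs_log_remove() (the function name below is hypothetical, and the fragment assumes the ZFS kernel headers used by the removed file, so it is illustrative rather than buildable on its own):

    /* Sketch only: mirrors the removed zfs_log_remove(). */
    void
    zfs_log_name_only(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
        znode_t *dzp, char *name, uint64_t foid)
    {
        itx_t *itx;
        lr_remove_t *lr;
        size_t namesize = strlen(name) + 1;

        /* Replay mode: the record is already in the log, do not re-log it. */
        if (zil_replaying(zilog, tx))
            return;

        /* Allocate an in-memory intent log transaction with room for the name. */
        itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
        lr = (lr_remove_t *)&itx->itx_lr;
        lr->lr_doid = dzp->z_id;
        bcopy(name, (char *)(lr + 1), namesize);
        itx->itx_oid = foid;

        /* Hand the itx to the ZIL within the same dmu tx. */
        zil_itx_assign(zilog, itx, tx);
    }
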
- */ -void -zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name, char *link) -{ - itx_t *itx; - lr_create_t *lr; - size_t namesize = strlen(name) + 1; - size_t linksize = strlen(link) + 1; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); - lr = (lr_create_t *)&itx->itx_lr; - lr->lr_doid = dzp->z_id; - lr->lr_foid = zp->z_id; - lr->lr_uid = zp->z_uid; - lr->lr_gid = zp->z_gid; - lr->lr_mode = zp->z_mode; - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, - sizeof (uint64_t)); - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), - lr->lr_crtime, sizeof (uint64_t) * 2); - bcopy(name, (char *)(lr + 1), namesize); - bcopy(link, (char *)(lr + 1) + namesize, linksize); - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_RENAME transactions. - */ -void -zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) -{ - itx_t *itx; - lr_rename_t *lr; - size_t snamesize = strlen(sname) + 1; - size_t dnamesize = strlen(dname) + 1; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); - lr = (lr_rename_t *)&itx->itx_lr; - lr->lr_sdoid = sdzp->z_id; - lr->lr_tdoid = tdzp->z_id; - bcopy(sname, (char *)(lr + 1), snamesize); - bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); - itx->itx_oid = szp->z_id; - - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_WRITE transactions. - */ -ssize_t zfs_immediate_write_sz = 32768; -#ifdef _KERNEL -SYSCTL_DECL(_vfs_zfs); -SYSCTL_LONG(_vfs_zfs, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN, - &zfs_immediate_write_sz, 0, "Minimal size for indirect log write"); -#endif - -void -zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t resid, int ioflag) -{ - uint32_t blocksize = zp->z_blksz; - itx_wr_state_t write_state; - uintptr_t fsync_cnt; - - if (zil_replaying(zilog, tx) || zp->z_unlinked) - return; - - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - resid >= zfs_immediate_write_sz) - write_state = WR_INDIRECT; - else if (ioflag & (FSYNC | FDSYNC)) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; - - if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { - (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); - } - - while (resid) { - itx_t *itx; - lr_write_t *lr; - itx_wr_state_t wr_state = write_state; - ssize_t len = resid; - - /* - * A WR_COPIED record must fit entirely in one log block. - * Large writes can use WR_NEED_COPY, which the ZIL will - * split into multiple records across several log blocks - * if necessary. - */ - if (wr_state == WR_COPIED && - resid > zil_max_copied_data(zilog)) - wr_state = WR_NEED_COPY; - else if (wr_state == WR_INDIRECT) - len = MIN(blocksize - P2PHASE(off, blocksize), resid); - - itx = zil_itx_create(txtype, sizeof (*lr) + - (wr_state == WR_COPIED ? 
len : 0)); - lr = (lr_write_t *)&itx->itx_lr; - if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - wr_state = WR_NEED_COPY; - } - - itx->itx_wr_state = wr_state; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); - - itx->itx_private = zp->z_zfsvfs; - - if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && - (fsync_cnt == 0)) - itx->itx_sync = B_FALSE; - - zil_itx_assign(zilog, itx, tx); - - off += len; - resid -= len; - } -} - -/* - * Handles TX_TRUNCATE transactions. - */ -void -zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len) -{ - itx_t *itx; - lr_truncate_t *lr; - - if (zil_replaying(zilog, tx) || zp->z_unlinked) - return; - - itx = zil_itx_create(txtype, sizeof (*lr)); - lr = (lr_truncate_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - lr->lr_offset = off; - lr->lr_length = len; - - itx->itx_sync = (zp->z_sync_cnt != 0); - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_SETATTR transactions. - */ -void -zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) -{ - itx_t *itx; - lr_setattr_t *lr; - xvattr_t *xvap = (xvattr_t *)vap; - size_t recsize = sizeof (lr_setattr_t); - void *start; - - if (zil_replaying(zilog, tx) || zp->z_unlinked) - return; - - /* - * If XVATTR set, then log record size needs to allow - * for lr_attr_t + xvattr mask, mapsize and create time - * plus actual attribute values - */ - if (vap->va_mask & AT_XVATTR) - recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); - - if (fuidp) - recsize += fuidp->z_domain_str_sz; - - itx = zil_itx_create(txtype, recsize); - lr = (lr_setattr_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - lr->lr_mask = (uint64_t)mask_applied; - lr->lr_mode = (uint64_t)vap->va_mode; - if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) - lr->lr_uid = fuidp->z_fuid_owner; - else - lr->lr_uid = (uint64_t)vap->va_uid; - - if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) - lr->lr_gid = fuidp->z_fuid_group; - else - lr->lr_gid = (uint64_t)vap->va_gid; - - lr->lr_size = (uint64_t)vap->va_size; - ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); - ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); - start = (lr_setattr_t *)(lr + 1); - if (vap->va_mask & AT_XVATTR) { - zfs_log_xvattr((lr_attr_t *)start, xvap); - start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); - } - - /* - * Now stick on domain information if any on end - */ - - if (fuidp) - (void) zfs_log_fuid_domains(fuidp, start); - - itx->itx_sync = (zp->z_sync_cnt != 0); - zil_itx_assign(zilog, itx, tx); -} - -/* - * Handles TX_ACL transactions. - */ -void -zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, - vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) -{ - itx_t *itx; - lr_acl_v0_t *lrv0; - lr_acl_t *lr; - int txtype; - int lrsize; - size_t txsize; - size_t aclbytes = vsecp->vsa_aclentsz; - - if (zil_replaying(zilog, tx) || zp->z_unlinked) - return; - - txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? - TX_ACL_V0 : TX_ACL; - - if (txtype == TX_ACL) - lrsize = sizeof (*lr); - else - lrsize = sizeof (*lrv0); - - txsize = lrsize + - ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + - (fuidp ? fuidp->z_domain_str_sz : 0) + - sizeof (uint64_t) * (fuidp ? 
fuidp->z_fuid_cnt : 0); - - itx = zil_itx_create(txtype, txsize); - - lr = (lr_acl_t *)&itx->itx_lr; - lr->lr_foid = zp->z_id; - if (txtype == TX_ACL) { - lr->lr_acl_bytes = aclbytes; - lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; - lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; - if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) - lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; - else - lr->lr_acl_flags = 0; - } - lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; - - if (txtype == TX_ACL_V0) { - lrv0 = (lr_acl_v0_t *)lr; - bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); - } else { - void *start = (ace_t *)(lr + 1); - - bcopy(vsecp->vsa_aclentp, start, aclbytes); - - start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); - - if (fuidp) { - start = zfs_log_fuid_ids(fuidp, start); - (void) zfs_log_fuid_domains(fuidp, start); - } - } - - itx->itx_sync = (zp->z_sync_cnt != 0); - zil_itx_assign(zilog, itx, tx); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c +++ /dev/null @@ -1,254 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * ZFS kernel routines may add/delete callback routines to be invoked - * upon process exit (triggered via the close operation from the /dev/zfs - * driver). - * - * These cleanup callbacks are intended to allow for the accumulation - * of kernel state across multiple ioctls. User processes participate - * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a - * clone-open, generating a unique minor number. The process then passes - * along that file descriptor to each ioctl that might have a cleanup operation. - * - * Consumers of the onexit routines should call zfs_onexit_fd_hold() early - * on to validate the given fd and add a reference to its file table entry. - * This allows the consumer to do its work and then add a callback, knowing - * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers - * should call zfs_onexit_fd_rele(). - * - * A simple example is zfs_ioc_recv(), where we might create an AVL tree - * with dataset/GUID mappings and then reuse that tree on subsequent - * zfs_ioc_recv() calls. - * - * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() - * the AVL tree and pass it along with a callback function to - * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the - * callback and return an action handle. 
- * - * The action handle is then passed from user space to subsequent - * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree - * by calling zfs_onexit_cb_data() with the device minor number and - * action handle. - * - * If the user process exits abnormally, the callback is invoked implicitly - * as part of the driver close operation. Once the user space process is - * finished with the accumulated kernel state, it can also just call close(2) - * on the cleanup fd to trigger the cleanup callback. - */ - -void -zfs_onexit_init(zfs_onexit_t **zop) -{ - zfs_onexit_t *zo; - - zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP); - mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t), - offsetof(zfs_onexit_action_node_t, za_link)); -} - -void -zfs_onexit_destroy(zfs_onexit_t *zo) -{ - zfs_onexit_action_node_t *ap; - - mutex_enter(&zo->zo_lock); - while ((ap = list_head(&zo->zo_actions)) != NULL) { - list_remove(&zo->zo_actions, ap); - mutex_exit(&zo->zo_lock); - ap->za_func(ap->za_data); - kmem_free(ap, sizeof (zfs_onexit_action_node_t)); - mutex_enter(&zo->zo_lock); - } - mutex_exit(&zo->zo_lock); - - list_destroy(&zo->zo_actions); - mutex_destroy(&zo->zo_lock); - kmem_free(zo, sizeof (zfs_onexit_t)); -} - -static int -zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) -{ - *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); - if (*zo == NULL) - return (SET_ERROR(EBADF)); - - return (0); -} - -/* - * Consumers might need to operate by minor number instead of fd, since - * they might be running in another thread (e.g. txg_sync_thread). Callers - * of this function must call zfs_onexit_fd_rele() when they're finished - * using the minor number. - */ -int -zfs_onexit_fd_hold(int fd, minor_t *minorp) -{ - file_t *fp, *tmpfp; - zfs_onexit_t *zo; - cap_rights_t rights; - void *data; - int error; - - fp = getf(fd, &cap_no_rights); - if (fp == NULL) - return (SET_ERROR(EBADF)); - - tmpfp = curthread->td_fpop; - curthread->td_fpop = fp; - error = devfs_get_cdevpriv(&data); - if (error == 0) - *minorp = (minor_t)(uintptr_t)data; - curthread->td_fpop = tmpfp; - if (error != 0) - return (SET_ERROR(EBADF)); - return (zfs_onexit_minor_to_state(*minorp, &zo)); -} - -void -zfs_onexit_fd_rele(int fd) -{ - releasef(fd); -} - -/* - * Add a callback to be invoked when the calling process exits. - */ -int -zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - uint64_t *action_handle) -{ - zfs_onexit_t *zo; - zfs_onexit_action_node_t *ap; - int error; - - error = zfs_onexit_minor_to_state(minor, &zo); - if (error) - return (error); - - ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); - list_link_init(&ap->za_link); - ap->za_func = func; - ap->za_data = data; - - mutex_enter(&zo->zo_lock); - list_insert_tail(&zo->zo_actions, ap); - mutex_exit(&zo->zo_lock); - if (action_handle) - *action_handle = (uint64_t)(uintptr_t)ap; - - return (0); -} - -static zfs_onexit_action_node_t * -zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle) -{ - zfs_onexit_action_node_t *match; - zfs_onexit_action_node_t *ap; - list_t *l; - - ASSERT(MUTEX_HELD(&zo->zo_lock)); - - match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle; - l = &zo->zo_actions; - for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) { - if (match == ap) - break; - } - return (ap); -} - -/* - * Delete the callback, triggering it first if 'fire' is set. 
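
The consumer-side protocol spelled out in the comment above reduces to hold, register, release. A short sketch of that pattern (the callback and caller names are hypothetical; the zfs_onexit_* signatures are the ones declared in this removed file, and the fragment assumes the ZFS kernel environment):

    /* Hypothetical cleanup callback, freeing state built up across ioctls. */
    static void
    my_cleanup_cb(void *arg)
    {
        my_state_free(arg);             /* my_state_free() is hypothetical */
    }

    /* Hypothetical ioctl path registering that state against a cleanup fd. */
    static int
    my_ioctl(int cleanup_fd, void *state)
    {
        minor_t minor;
        uint64_t handle;
        int error;

        /* Validate the fd and hold its file table entry. */
        error = zfs_onexit_fd_hold(cleanup_fd, &minor);
        if (error != 0)
            return (error);

        /* Invoked automatically when the process closes /dev/zfs. */
        error = zfs_onexit_add_cb(minor, my_cleanup_cb, state, &handle);

        zfs_onexit_fd_rele(cleanup_fd);
        return (error);
    }
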
- */ -int -zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) -{ - zfs_onexit_t *zo; - zfs_onexit_action_node_t *ap; - int error; - - error = zfs_onexit_minor_to_state(minor, &zo); - if (error) - return (error); - - mutex_enter(&zo->zo_lock); - ap = zfs_onexit_find_cb(zo, action_handle); - if (ap != NULL) { - list_remove(&zo->zo_actions, ap); - mutex_exit(&zo->zo_lock); - if (fire) - ap->za_func(ap->za_data); - kmem_free(ap, sizeof (zfs_onexit_action_node_t)); - } else { - mutex_exit(&zo->zo_lock); - error = SET_ERROR(ENOENT); - } - - return (error); -} - -/* - * Return the data associated with this callback. This allows consumers - * of the cleanup-on-exit interfaces to stash kernel data across system - * calls, knowing that it will be cleaned up if the calling process exits. - */ -int -zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) -{ - zfs_onexit_t *zo; - zfs_onexit_action_node_t *ap; - int error; - - *data = NULL; - - error = zfs_onexit_minor_to_state(minor, &zo); - if (error) - return (error); - - mutex_enter(&zo->zo_lock); - ap = zfs_onexit_find_cb(zo, action_handle); - if (ap != NULL) - *data = ap->za_data; - else - error = SET_ERROR(ENOENT); - mutex_exit(&zo->zo_lock); - - return (error); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c +++ /dev/null @@ -1,1069 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Functions to replay ZFS intent log (ZIL) records - * The functions are called through a function vector (zfs_replay_vector) - * which is indexed by the transaction type. - */ - -static void -zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, - uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) -{ - VATTR_NULL(vap); - vap->va_mask = (uint_t)mask; - if (mask & AT_TYPE) - vap->va_type = IFTOVT(mode); - if (mask & AT_MODE) - vap->va_mode = mode & MODEMASK; - if (mask & AT_UID) - vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; - if (mask & AT_GID) - vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? 
-1 : gid; - vap->va_rdev = zfs_cmpldev(rdev); - vap->va_nodeid = nodeid; -} - -/* ARGSUSED */ -static int -zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) -{ - return (SET_ERROR(ENOTSUP)); -} - -static void -zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) -{ - xoptattr_t *xoap = NULL; - uint64_t *attrs; - uint64_t *crtime; - uint32_t *bitmap; - void *scanstamp; - int i; - - xvap->xva_vattr.va_mask |= AT_XVATTR; - if ((xoap = xva_getxoptattr(xvap)) == NULL) { - xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */ - return; - } - - ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); - - bitmap = &lrattr->lr_attr_bitmap; - for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) - xvap->xva_reqattrmap[i] = *bitmap; - - attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); - crtime = attrs + 1; - scanstamp = (caddr_t)(crtime + 2); - - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) - xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) - xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) - xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) - xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) - xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) - xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) - xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) - xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) - xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) - xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) - xoap->xoa_av_quarantined = - ((*attrs & XAT0_AV_QUARANTINED) != 0); - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) - ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) - xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) - xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) - xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); -} - -static int -zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) -{ - uint64_t uid_idx; - uint64_t gid_idx; - int domcnt = 0; - - uid_idx = FUID_INDEX(uid); - gid_idx = FUID_INDEX(gid); - if (uid_idx) - domcnt++; - if (gid_idx > 0 && gid_idx != uid_idx) - domcnt++; - - return (domcnt); -} - -static void * -zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, - int domcnt) -{ - int i; - - for (i = 0; i != domcnt; i++) { - fuid_infop->z_domain_table[i] = start; - start = (caddr_t)start + strlen(start) + 1; - } - - return (start); -} - -/* - * Set the uid/gid in the fuid_info structure. 
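
A worked example may help with zfs_replay_domain_cnt() above: the count is the number of distinct, non-zero domain-table indices among the record's uid and gid, which is also the number of domain strings that follow the record. The stand-alone model below reproduces that logic; the FUID_INDEX layout (domain index in the upper 32 bits) is an assumption made for illustration and is not taken from this diff.

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed FUID layout: upper 32 bits = domain table index, lower 32 = rid. */
    #define FUID_INDEX(x)   ((uint64_t)(x) >> 32)

    static int
    domain_cnt(uint64_t uid, uint64_t gid)
    {
        int domcnt = 0;

        if (FUID_INDEX(uid) != 0)
            domcnt++;
        if (FUID_INDEX(gid) != 0 && FUID_INDEX(gid) != FUID_INDEX(uid))
            domcnt++;
        return (domcnt);
    }

    int
    main(void)
    {
        /* Both ids reference domain-table entry 3: one domain string follows. */
        printf("%d\n", domain_cnt((3ULL << 32) | 1001, (3ULL << 32) | 1001));
        /* Plain POSIX ids (index 0): no domain strings follow. */
        printf("%d\n", domain_cnt(1001, 1001));
        return (0);
    }
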
- */ -static void -zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) -{ - /* - * If owner or group are log specific FUIDs then slurp up - * domain information and build zfs_fuid_info_t - */ - if (IS_EPHEMERAL(uid)) - fuid_infop->z_fuid_owner = uid; - - if (IS_EPHEMERAL(gid)) - fuid_infop->z_fuid_group = gid; -} - -/* - * Load fuid domains into fuid_info_t - */ -static zfs_fuid_info_t * -zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) -{ - int domcnt; - - zfs_fuid_info_t *fuid_infop; - - fuid_infop = zfs_fuid_info_alloc(); - - domcnt = zfs_replay_domain_cnt(uid, gid); - - if (domcnt == 0) - return (fuid_infop); - - fuid_infop->z_domain_table = - kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); - - zfs_replay_fuid_ugid(fuid_infop, uid, gid); - - fuid_infop->z_domain_cnt = domcnt; - *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); - return (fuid_infop); -} - -/* - * load zfs_fuid_t's and fuid_domains into fuid_info_t - */ -static zfs_fuid_info_t * -zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, - uint64_t gid) -{ - uint64_t *log_fuid = (uint64_t *)start; - zfs_fuid_info_t *fuid_infop; - int i; - - fuid_infop = zfs_fuid_info_alloc(); - fuid_infop->z_domain_cnt = domcnt; - - fuid_infop->z_domain_table = - kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); - - for (i = 0; i != idcnt; i++) { - zfs_fuid_t *zfuid; - - zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); - zfuid->z_logfuid = *log_fuid; - zfuid->z_id = -1; - zfuid->z_domidx = 0; - list_insert_tail(&fuid_infop->z_fuids, zfuid); - log_fuid++; - } - - zfs_replay_fuid_ugid(fuid_infop, uid, gid); - - *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); - return (fuid_infop); -} - -static void -zfs_replay_swap_attrs(lr_attr_t *lrattr) -{ - /* swap the lr_attr structure */ - byteswap_uint32_array(lrattr, sizeof (*lrattr)); - /* swap the bitmap */ - byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * - sizeof (uint32_t)); - /* swap the attributes, create time + 64 bit word for attributes */ - byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * - (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); -} - -/* - * Replay file create with optional ACL, xvattr information as well - * as option FUID information. 
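
Before the per-type handling, every replay handler in this removed file follows the same endianness convention: if the byteswap flag is set (the log was written by a host of the opposite byte order), the fixed portion of the record is swapped first, and any variable portions (attribute bitmaps, ACEs, FUID arrays) are swapped before being read. A minimal sketch of that convention (fragment; the handler name is hypothetical and the ZFS kernel headers used by the removed file are assumed):

    static int
    zfs_replay_example(void *arg1, void *arg2, boolean_t byteswap)
    {
        lr_create_t *lr = arg2;

        /* Swap the fixed record before interpreting any field. */
        if (byteswap)
            byteswap_uint64_array(lr, sizeof (*lr));

        /*
         * Variable data following the record is swapped the same way,
         * then the handler re-creates the operation through the VOP layer.
         */
        return (0);
    }
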
- */ -static int -zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_acl_create_t *lracl = arg2; - char *name = NULL; /* location determined later */ - lr_create_t *lr = (lr_create_t *)lracl; - znode_t *dzp; - vnode_t *vp = NULL; - xvattr_t xva; - int vflg = 0; - vsecattr_t vsec = { 0 }; - lr_attr_t *lrattr; - void *aclstart; - void *fuidstart; - size_t xvatlen = 0; - uint64_t txtype; - uint64_t objid; - uint64_t dnodesize; - int error; - - txtype = (lr->lr_common.lrc_txtype & ~TX_CI); - if (byteswap) { - byteswap_uint64_array(lracl, sizeof (*lracl)); - if (txtype == TX_CREATE_ACL_ATTR || - txtype == TX_MKDIR_ACL_ATTR) { - lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); - zfs_replay_swap_attrs(lrattr); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - } - - aclstart = (caddr_t)(lracl + 1) + xvatlen; - zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); - /* swap fuids */ - if (lracl->lr_fuidcnt) { - byteswap_uint64_array((caddr_t)aclstart + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes), - lracl->lr_fuidcnt * sizeof (uint64_t)); - } - } - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - objid = LR_FOID_GET_OBJ(lr->lr_foid); - dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; - - xva_init(&xva); - zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); - - /* - * All forms of zfs create (create, mkdir, mkxattrdir, symlink) - * eventually end up in zfs_mknode(), which assigns the object's - * creation time, generation number, and dnode size. The generic - * zfs_create() has no concept of these attributes, so we smuggle - * the values inside the vattr's otherwise unused va_ctime, - * va_nblocks, and va_fsid fields. 
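
One detail worth calling out from the create-replay path: lr_foid is not a bare object number. The removed zfs_log_create() packs the dnode slot count "in 8 bits above object id", and the replay side unpacks it with LR_FOID_GET_OBJ()/LR_FOID_GET_SLOTS() and re-derives the dnode size by shifting by DNODE_SHIFT. The stand-alone model below shows the round trip; the exact bit positions (slots in bits 56..63) and the DNODE_SHIFT value are assumptions made for illustration, not taken from this diff.

    #include <stdint.h>
    #include <stdio.h>

    #define DNODE_SHIFT     9       /* 512-byte base dnode; assumption */
    #define FOID_SLOT_SHIFT 56      /* slot count in the top 8 bits; assumption */

    static uint64_t
    foid_pack(uint64_t obj, uint64_t dnodesize)
    {
        return (obj | ((dnodesize >> DNODE_SHIFT) << FOID_SLOT_SHIFT));
    }

    static void
    foid_unpack(uint64_t foid, uint64_t *obj, uint64_t *dnodesize)
    {
        *obj = foid & ((1ULL << FOID_SLOT_SHIFT) - 1);
        *dnodesize = ((foid >> FOID_SLOT_SHIFT) & 0xff) << DNODE_SHIFT;
    }

    int
    main(void)
    {
        uint64_t obj, dnodesize;

        foid_unpack(foid_pack(12345, 1024), &obj, &dnodesize);
        printf("obj=%ju dnodesize=%ju\n", (uintmax_t)obj, (uintmax_t)dnodesize);
        return (0);
    }
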
- */ - ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); - xva.xva_vattr.va_nblocks = lr->lr_gen; - xva.xva_vattr.va_fsid = dnodesize; - - error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL); - if (error != ENOENT) - goto bail; - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - switch (txtype) { - case TX_CREATE_ACL: - aclstart = (caddr_t)(lracl + 1); - fuidstart = (caddr_t)aclstart + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - /*FALLTHROUGH*/ - case TX_CREATE_ACL_ATTR: - if (name == NULL) { - lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - xva.xva_vattr.va_mask |= AT_XVATTR; - zfs_replay_xvattr(lrattr, &xva); - } - vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; - vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; - vsec.vsa_aclcnt = lracl->lr_aclcnt; - vsec.vsa_aclentsz = lracl->lr_acl_bytes; - vsec.vsa_aclflags = lracl->lr_acl_flags; - if (zfsvfs->z_fuid_replay == NULL) { - fuidstart = (caddr_t)(lracl + 1) + xvatlen + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = - zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - } - -#ifdef TODO - error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr, - 0, 0, &vp, kcred, vflg, NULL, &vsec); -#else - panic("%s:%u: unsupported condition", __func__, __LINE__); -#endif - break; - case TX_MKDIR_ACL: - aclstart = (caddr_t)(lracl + 1); - fuidstart = (caddr_t)aclstart + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - /*FALLTHROUGH*/ - case TX_MKDIR_ACL_ATTR: - if (name == NULL) { - lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - zfs_replay_xvattr(lrattr, &xva); - } - vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; - vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; - vsec.vsa_aclcnt = lracl->lr_aclcnt; - vsec.vsa_aclentsz = lracl->lr_acl_bytes; - vsec.vsa_aclflags = lracl->lr_acl_flags; - if (zfsvfs->z_fuid_replay == NULL) { - fuidstart = (caddr_t)(lracl + 1) + xvatlen + - ZIL_ACE_LENGTH(lracl->lr_acl_bytes); - zfsvfs->z_fuid_replay = - zfs_replay_fuids(fuidstart, - (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, - lr->lr_uid, lr->lr_gid); - } -#ifdef TODO - error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr, - &vp, kcred, NULL, vflg, &vsec); -#else - panic("%s:%u: unsupported condition", __func__, __LINE__); -#endif - break; - default: - error = SET_ERROR(ENOTSUP); - } - -bail: - if (error == 0 && vp != NULL) - VN_RELE(vp); - - VN_RELE(ZTOV(dzp)); - - if (zfsvfs->z_fuid_replay) - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - zfsvfs->z_fuid_replay = NULL; - - return (error); -} - -static int -zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_create_t *lr = arg2; - char *name = NULL; /* location determined later */ - char *link; /* symlink content follows name */ - znode_t *dzp; - vnode_t *vp = NULL; - xvattr_t xva; - int vflg = 0; - size_t lrsize = sizeof (lr_create_t); - lr_attr_t *lrattr; - void *start; - size_t xvatlen; - uint64_t txtype; - struct componentname cn; - int error; - - txtype = (lr->lr_common.lrc_txtype & ~TX_CI); - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) - 
zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); - } - - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - uint64_t objid = LR_FOID_GET_OBJ(lr->lr_foid); - int dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; - - xva_init(&xva); - zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID, - lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); - - /* - * All forms of zfs create (create, mkdir, mkxattrdir, symlink) - * eventually end up in zfs_mknode(), which assigns the object's - * creation time, generation number, and dnode slot count. The - * generic zfs_create() has no concept of these attributes, so - * we smuggle the values inside the vattr's otherwise unused - * va_ctime, va_nblocks and va_fsid fields. - */ - ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); - xva.xva_vattr.va_nblocks = lr->lr_gen; - xva.xva_vattr.va_fsid = dnodesize; - - error = dmu_object_info(zfsvfs->z_os, objid, NULL); - if (error != ENOENT) - goto out; - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - - /* - * Symlinks don't have fuid info, and CIFS never creates - * symlinks. - * - * The _ATTR versions will grab the fuid info in their subcases. - */ - if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && - (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && - (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { - start = (lr + 1); - zfsvfs->z_fuid_replay = - zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - } - - cn.cn_cred = kcred; - cn.cn_thread = curthread; - cn.cn_flags = SAVENAME; - - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - switch (txtype) { - case TX_CREATE_ATTR: - lrattr = (lr_attr_t *)(caddr_t)(lr + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); - start = (caddr_t)(lr + 1) + xvatlen; - zfsvfs->z_fuid_replay = - zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - name = (char *)start; - - /*FALLTHROUGH*/ - case TX_CREATE: - if (name == NULL) - name = (char *)start; - - cn.cn_nameptr = name; - error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); - break; - case TX_MKDIR_ATTR: - lrattr = (lr_attr_t *)(caddr_t)(lr + 1); - xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); - zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); - start = (caddr_t)(lr + 1) + xvatlen; - zfsvfs->z_fuid_replay = - zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - name = (char *)start; - - /*FALLTHROUGH*/ - case TX_MKDIR: - if (name == NULL) - name = (char *)(lr + 1); - - cn.cn_nameptr = name; - error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &xva.xva_vattr /*,vflg*/); - break; - case TX_MKXATTR: - error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); - break; - case TX_SYMLINK: - name = (char *)(lr + 1); - link = name + strlen(name) + 1; - cn.cn_nameptr = name; - error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/); - break; - default: - error = SET_ERROR(ENOTSUP); - } - VOP_UNLOCK(ZTOV(dzp)); - -out: - if (error == 0 && vp != NULL) - VN_URELE(vp); - - VN_RELE(ZTOV(dzp)); - - if (zfsvfs->z_fuid_replay) - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - zfsvfs->z_fuid_replay = NULL; - return (error); -} - -static int -zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_remove_t *lr = arg2; - char *name = (char *)(lr + 1); /* name follows lr_remove_t */ - znode_t *dzp; - struct componentname cn; - vnode_t *vp; - int error; - int vflg = 0; - - if (byteswap) - 
byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - cn.cn_nameptr = name; - cn.cn_namelen = strlen(name); - cn.cn_nameiop = DELETE; - cn.cn_flags = ISLASTCN | SAVENAME; - cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; - cn.cn_cred = kcred; - cn.cn_thread = curthread; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn); - if (error != 0) { - VOP_UNLOCK(ZTOV(dzp)); - goto fail; - } - - switch ((int)lr->lr_common.lrc_txtype) { - case TX_REMOVE: - error = VOP_REMOVE(ZTOV(dzp), vp, &cn /*,vflg*/); - break; - case TX_RMDIR: - error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/); - break; - default: - error = SET_ERROR(ENOTSUP); - } - vput(vp); - VOP_UNLOCK(ZTOV(dzp)); - -fail: - VN_RELE(ZTOV(dzp)); - - return (error); -} - -static int -zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_link_t *lr = arg2; - char *name = (char *)(lr + 1); /* name follows lr_link_t */ - znode_t *dzp, *zp; - struct componentname cn; - int error; - int vflg = 0; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) - return (error); - - if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { - VN_RELE(ZTOV(dzp)); - return (error); - } - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - - cn.cn_nameptr = name; - cn.cn_cred = kcred; - cn.cn_thread = curthread; - cn.cn_flags = SAVENAME; - - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/); - VOP_UNLOCK(ZTOV(zp)); - VOP_UNLOCK(ZTOV(dzp)); - - VN_RELE(ZTOV(zp)); - VN_RELE(ZTOV(dzp)); - - return (error); -} - -static int -zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; - znode_t *sdzp, *tdzp; - struct componentname scn, tcn; - vnode_t *svp, *tvp; - kthread_t *td = curthread; - int error; - int vflg = 0; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) - return (error); - - if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { - VN_RELE(ZTOV(sdzp)); - return (error); - } - - if (lr->lr_common.lrc_txtype & TX_CI) - vflg |= FIGNORECASE; - svp = tvp = NULL; - - scn.cn_nameptr = sname; - scn.cn_namelen = strlen(sname); - scn.cn_nameiop = DELETE; - scn.cn_flags = ISLASTCN | SAVENAME; - scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; - scn.cn_cred = kcred; - scn.cn_thread = td; - vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn); - VOP_UNLOCK(ZTOV(sdzp)); - if (error != 0) - goto fail; - VOP_UNLOCK(svp); - - tcn.cn_nameptr = tname; - tcn.cn_namelen = strlen(tname); - tcn.cn_nameiop = RENAME; - tcn.cn_flags = ISLASTCN | SAVENAME; - tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; - tcn.cn_cred = kcred; - tcn.cn_thread = td; - vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY); - error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn); - if (error == EJUSTRETURN) - tvp = NULL; - else if (error != 0) { - VOP_UNLOCK(ZTOV(tdzp)); - goto fail; - } - - error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/); - return (error); -fail: - if (svp != NULL) - vrele(svp); - if (tvp != NULL) - vrele(tvp); - 
VN_RELE(ZTOV(tdzp)); - VN_RELE(ZTOV(sdzp)); - - return (error); -} - -static int -zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_write_t *lr = arg2; - char *data = (char *)(lr + 1); /* data follows lr_write_t */ - znode_t *zp; - int error; - ssize_t resid; - uint64_t eod, offset, length; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log writes out of order, it's possible the - * file has been removed. In this case just drop the write - * and return success. - */ - if (error == ENOENT) - error = 0; - return (error); - } - - offset = lr->lr_offset; - length = lr->lr_length; - eod = offset + length; /* end of data for this write */ - - /* - * This may be a write from a dmu_sync() for a whole block, - * and may extend beyond the current end of the file. - * We can't just replay what was written for this TX_WRITE as - * a future TX_WRITE2 may extend the eof and the data for that - * write needs to be there. So we write the whole block and - * reduce the eof. This needs to be done within the single dmu - * transaction created within vn_rdwr -> zfs_write. So a possible - * new end of file is passed through in zfsvfs->z_replay_eof - */ - - zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ - - /* If it's a dmu_sync() block, write the whole block */ - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { - uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); - if (length < blocksize) { - offset -= offset % blocksize; - length = blocksize; - } - if (zp->z_size < eod) - zfsvfs->z_replay_eof = eod; - } - - error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset, - UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - - VN_RELE(ZTOV(zp)); - zfsvfs->z_replay_eof = 0; /* safety */ - - return (error); -} - -/* - * TX_WRITE2 are only generated when dmu_sync() returns EALREADY - * meaning the pool block is already being synced. So now that we always write - * out full blocks, all we have to do is expand the eof if - * the file is grown. 
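
A small worked example of the dmu_sync() case handled by the removed zfs_replay_write() above: when the record length equals sizeof (lr_write_t), the write came from dmu_sync(), so replay widens a partial-block record to the whole block and passes the true end of data through z_replay_eof so the file is not grown past it. The stand-alone model below only reproduces the arithmetic; the 128K block size is a hypothetical value.

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t blocksize = 131072;                /* from BP_GET_LSIZE(&lr->lr_blkptr) */
        uint64_t offset = 133120, length = 4096;    /* record covers 130K..134K */
        uint64_t eod = offset + length;             /* true end of data: 134K */

        if (length < blocksize) {
            offset -= offset % blocksize;           /* round down to the block start */
            length = blocksize;                     /* replay the whole block */
        }
        printf("write off=%ju len=%ju, z_replay_eof=%ju\n",
            (uintmax_t)offset, (uintmax_t)length, (uintmax_t)eod);
        return (0);
    }
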
- */ -static int -zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_write_t *lr = arg2; - znode_t *zp; - int error; - uint64_t end; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - -top: - end = lr->lr_offset + lr->lr_length; - if (end > zp->z_size) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - zp->z_size = end; - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - VN_RELE(ZTOV(zp)); - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - return (error); - } - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - (void *)&zp->z_size, sizeof (uint64_t), tx); - - /* Ensure the replayed seq is updated */ - (void) zil_replaying(zfsvfs->z_log, tx); - - dmu_tx_commit(tx); - } - - VN_RELE(ZTOV(zp)); - - return (error); -} - -static int -zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) -{ -#ifdef illumos - zfsvfs_t *zfsvfs = arg1; - lr_truncate_t *lr = arg2; - znode_t *zp; - flock64_t fl; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - - bzero(&fl, sizeof (fl)); - fl.l_type = F_WRLCK; - fl.l_whence = 0; - fl.l_start = lr->lr_offset; - fl.l_len = lr->lr_length; - - error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX, - lr->lr_offset, kcred, NULL); - - VN_RELE(ZTOV(zp)); - - return (error); -#else - ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); - return (EOPNOTSUPP); -#endif -} - -static int -zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_setattr_t *lr = arg2; - znode_t *zp; - xvattr_t xva; - vattr_t *vap = &xva.xva_vattr; - vnode_t *vp; - int error; - void *start; - - xva_init(&xva); - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - - if ((lr->lr_mask & AT_XVATTR) && - zfsvfs->z_version >= ZPL_VERSION_INITIAL) - zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - - zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, - lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); - - vap->va_size = lr->lr_size; - ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); - ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); - - /* - * Fill in xvattr_t portions if necessary. 
- */ - - start = (lr_setattr_t *)(lr + 1); - if (vap->va_mask & AT_XVATTR) { - zfs_replay_xvattr((lr_attr_t *)start, &xva); - start = (caddr_t)start + - ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); - } else - xva.xva_vattr.va_mask &= ~AT_XVATTR; - - zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, - lr->lr_uid, lr->lr_gid); - - vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_SETATTR(vp, vap, kcred); - VOP_UNLOCK(vp); - - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - zfsvfs->z_fuid_replay = NULL; - VN_RELE(vp); - - return (error); -} - -extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct); - -static int -zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_acl_v0_t *lr = arg2; - ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ - vsecattr_t vsa; - vnode_t *vp; - znode_t *zp; - int error; - - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - zfs_oldace_byteswap(ace, lr->lr_aclcnt); - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - - bzero(&vsa, sizeof (vsa)); - vsa.vsa_mask = VSA_ACE | VSA_ACECNT; - vsa.vsa_aclcnt = lr->lr_aclcnt; - vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; - vsa.vsa_aclflags = 0; - vsa.vsa_aclentp = ace; - - vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); - VOP_UNLOCK(vp); - - VN_RELE(vp); - - return (error); -} - -/* - * Replaying ACLs is complicated by FUID support. - * The log record may contain some optional data - * to be used for replaying FUID's. These pieces - * are the actual FUIDs that were created initially. - * The FUID table index may no longer be valid and - * during zfs_create() a new index may be assigned. - * Because of this the log will contain the original - * doman+rid in order to create a new FUID. - * - * The individual ACEs may contain an ephemeral uid/gid which is no - * longer valid and will need to be replaced with an actual FUID. 
- * - */ -static int -zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) -{ - zfsvfs_t *zfsvfs = arg1; - lr_acl_t *lr = arg2; - ace_t *ace = (ace_t *)(lr + 1); - vsecattr_t vsa; - znode_t *zp; - vnode_t *vp; - int error; - - if (byteswap) { - byteswap_uint64_array(lr, sizeof (*lr)); - zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); - if (lr->lr_fuidcnt) { - byteswap_uint64_array((caddr_t)ace + - ZIL_ACE_LENGTH(lr->lr_acl_bytes), - lr->lr_fuidcnt * sizeof (uint64_t)); - } - } - - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) - return (error); - - bzero(&vsa, sizeof (vsa)); - vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; - vsa.vsa_aclcnt = lr->lr_aclcnt; - vsa.vsa_aclentp = ace; - vsa.vsa_aclentsz = lr->lr_acl_bytes; - vsa.vsa_aclflags = lr->lr_acl_flags; - - if (lr->lr_fuidcnt) { - void *fuidstart = (caddr_t)ace + - ZIL_ACE_LENGTH(lr->lr_acl_bytes); - - zfsvfs->z_fuid_replay = - zfs_replay_fuids(fuidstart, &fuidstart, - lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); - } - - vp = ZTOV(zp); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); - VOP_UNLOCK(vp); - - if (zfsvfs->z_fuid_replay) - zfs_fuid_info_free(zfsvfs->z_fuid_replay); - - zfsvfs->z_fuid_replay = NULL; - VN_RELE(vp); - - return (error); -} - -/* - * Callback vectors for replaying records - */ -zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { - zfs_replay_error, /* 0 no such transaction type */ - zfs_replay_create, /* TX_CREATE */ - zfs_replay_create, /* TX_MKDIR */ - zfs_replay_create, /* TX_MKXATTR */ - zfs_replay_create, /* TX_SYMLINK */ - zfs_replay_remove, /* TX_REMOVE */ - zfs_replay_remove, /* TX_RMDIR */ - zfs_replay_link, /* TX_LINK */ - zfs_replay_rename, /* TX_RENAME */ - zfs_replay_write, /* TX_WRITE */ - zfs_replay_truncate, /* TX_TRUNCATE */ - zfs_replay_setattr, /* TX_SETATTR */ - zfs_replay_acl_v0, /* TX_ACL_V0 */ - zfs_replay_acl, /* TX_ACL */ - zfs_replay_create_acl, /* TX_CREATE_ACL */ - zfs_replay_create, /* TX_CREATE_ATTR */ - zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ - zfs_replay_create_acl, /* TX_MKDIR_ACL */ - zfs_replay_create, /* TX_MKDIR_ATTR */ - zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ - zfs_replay_write2, /* TX_WRITE2 */ -}; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ /dev/null @@ -1,641 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
- */ - -/* - * This file contains the code to implement file range locking in - * ZFS, although there isn't much specific to ZFS (all that comes to mind is - * support for growing the blocksize). - * - * Interface - * --------- - * Defined in zfs_rlock.h but essentially: - * lr = rangelock_enter(zp, off, len, lock_type); - * rangelock_reduce(lr, off, len); // optional - * rangelock_exit(lr); - * - * AVL tree - * -------- - * An AVL tree is used to maintain the state of the existing ranges - * that are locked for exclusive (writer) or shared (reader) use. - * The starting range offset is used for searching and sorting the tree. - * - * Common case - * ----------- - * The (hopefully) usual case is of no overlaps or contention for locks. On - * entry to rangelock_enter(), a locked_range_t is allocated; the tree - * searched that finds no overlap, and *this* locked_range_t is placed in the - * tree. - * - * Overlaps/Reference counting/Proxy locks - * --------------------------------------- - * The avl code only allows one node at a particular offset. Also it's very - * inefficient to search through all previous entries looking for overlaps - * (because the very 1st in the ordered list might be at offset 0 but - * cover the whole file). - * So this implementation uses reference counts and proxy range locks. - * Firstly, only reader locks use reference counts and proxy locks, - * because writer locks are exclusive. - * When a reader lock overlaps with another then a proxy lock is created - * for that range and replaces the original lock. If the overlap - * is exact then the reference count of the proxy is simply incremented. - * Otherwise, the proxy lock is split into smaller lock ranges and - * new proxy locks created for non overlapping ranges. - * The reference counts are adjusted accordingly. - * Meanwhile, the orginal lock is kept around (this is the callers handle) - * and its offset and length are used when releasing the lock. - * - * Thread coordination - * ------------------- - * In order to make wakeups efficient and to ensure multiple continuous - * readers on a range don't starve a writer for the same range lock, - * two condition variables are allocated in each rl_t. - * If a writer (or reader) can't get a range it initialises the writer - * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; - * and waits on that cv. When a thread unlocks that range it wakes up all - * writers then all readers before destroying the lock. - * - * Append mode writes - * ------------------ - * Append mode writes need to lock a range at the end of a file. - * The offset of the end of the file is determined under the - * range locking mutex, and the lock type converted from RL_APPEND to - * RL_WRITER and the range locked. - * - * Grow block handling - * ------------------- - * ZFS supports multiple block sizes, up to 16MB. The smallest - * block size is used for the file which is grown as needed. During this - * growth all other writers and readers must be excluded. - * So if the block size needs to be grown then the whole file is - * exclusively locked, then later the caller will reduce the lock - * range to just the range to be written using rangelock_reduce(). - */ - -#include -#include -#include - -/* - * AVL comparison function used to order range locks - * Locks are ordered on the start offset of the range. 
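
As a usage illustration of the interface just described, a minimal sketch of the grow-blocksize pattern: lock the whole file, grow, then reduce to the range actually being written. It assumes the caller already holds the file's rangelock_t *rl; write_off and write_len are hypothetical.

        locked_range_t *lr;

        /* lock the whole file while the blocksize may need to grow */
        lr = rangelock_enter(rl, 0, UINT64_MAX, RL_WRITER);
        /* ... grow the blocksize under the exclusive lock ... */

        /* shrink the locked region to just the range being written */
        rangelock_reduce(lr, write_off, write_len);
        /* ... copy the data ... */
        rangelock_exit(lr);
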
- */ -static int -rangelock_compare(const void *arg1, const void *arg2) -{ - const locked_range_t *rl1 = (const locked_range_t *)arg1; - const locked_range_t *rl2 = (const locked_range_t *)arg2; - - return (AVL_CMP(rl1->lr_offset, rl2->lr_offset)); -} - -/* - * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock. - * It must convert RL_APPEND to RL_WRITER (starting at the end of the file), - * and may increase the range that's locked for RL_WRITER. - */ -void -rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg) -{ - mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&rl->rl_tree, rangelock_compare, - sizeof (locked_range_t), offsetof(locked_range_t, lr_node)); - rl->rl_cb = cb; - rl->rl_arg = arg; -} - -void -rangelock_fini(rangelock_t *rl) -{ - mutex_destroy(&rl->rl_lock); - avl_destroy(&rl->rl_tree); -} - -/* - * Check if a write lock can be grabbed. If not, fail immediately or sleep and - * recheck until available, depending on the value of the "nonblock" parameter. - */ -static boolean_t -rangelock_enter_writer(rangelock_t *rl, locked_range_t *new, boolean_t nonblock) -{ - avl_tree_t *tree = &rl->rl_tree; - locked_range_t *lr; - avl_index_t where; - uint64_t orig_off = new->lr_offset; - uint64_t orig_len = new->lr_length; - rangelock_type_t orig_type = new->lr_type; - - for (;;) { - /* - * Call callback which can modify new->r_off,len,type. - * Note, the callback is used by the ZPL to handle appending - * and changing blocksizes. It isn't needed for zvols. - */ - if (rl->rl_cb != NULL) { - rl->rl_cb(new, rl->rl_arg); - } - - /* - * If the type was APPEND, the callback must convert it to - * WRITER. - */ - ASSERT3U(new->lr_type, ==, RL_WRITER); - - /* - * First check for the usual case of no locks - */ - if (avl_numnodes(tree) == 0) { - avl_add(tree, new); - return (B_TRUE); - } - - /* - * Look for any locks in the range. - */ - lr = avl_find(tree, new, &where); - if (lr != NULL) - goto wait; /* already locked at same offset */ - - lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); - if (lr != NULL && - lr->lr_offset < new->lr_offset + new->lr_length) - goto wait; - - lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); - if (lr != NULL && - lr->lr_offset + lr->lr_length > new->lr_offset) - goto wait; - - avl_insert(tree, new, where); - return (B_TRUE); -wait: - if (nonblock) - return (B_FALSE); - if (!lr->lr_write_wanted) { - cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL); - lr->lr_write_wanted = B_TRUE; - } - cv_wait(&lr->lr_write_cv, &rl->rl_lock); - - /* reset to original */ - new->lr_offset = orig_off; - new->lr_length = orig_len; - new->lr_type = orig_type; - } -} - -/* - * If this is an original (non-proxy) lock then replace it by - * a proxy and return the proxy. 
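
The two avl_nearest() checks in rangelock_enter_writer() above implement the usual half-open interval overlap test. A standalone userland illustration with hypothetical values:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        /* does [a_off, a_off+a_len) overlap [b_off, b_off+b_len)? */
        static bool
        ranges_overlap(uint64_t a_off, uint64_t a_len, uint64_t b_off, uint64_t b_len)
        {
                return (a_off < b_off + b_len && b_off < a_off + a_len);
        }

        int
        main(void)
        {
                printf("%d\n", ranges_overlap(0, 10, 5, 10));   /* 1: [0,10) and [5,15) overlap */
                printf("%d\n", ranges_overlap(0, 10, 10, 5));   /* 0: touching ranges do not overlap */
                return (0);
        }
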
- */ -static locked_range_t * -rangelock_proxify(avl_tree_t *tree, locked_range_t *lr) -{ - locked_range_t *proxy; - - if (lr->lr_proxy) - return (lr); /* already a proxy */ - - ASSERT3U(lr->lr_count, ==, 1); - ASSERT(lr->lr_write_wanted == B_FALSE); - ASSERT(lr->lr_read_wanted == B_FALSE); - avl_remove(tree, lr); - lr->lr_count = 0; - - /* create a proxy range lock */ - proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); - proxy->lr_offset = lr->lr_offset; - proxy->lr_length = lr->lr_length; - proxy->lr_count = 1; - proxy->lr_type = RL_READER; - proxy->lr_proxy = B_TRUE; - proxy->lr_write_wanted = B_FALSE; - proxy->lr_read_wanted = B_FALSE; - avl_add(tree, proxy); - - return (proxy); -} - -/* - * Split the range lock at the supplied offset - * returning the *front* proxy. - */ -static locked_range_t * -rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off) -{ - ASSERT3U(lr->lr_length, >, 1); - ASSERT3U(off, >, lr->lr_offset); - ASSERT3U(off, <, lr->lr_offset + lr->lr_length); - ASSERT(lr->lr_write_wanted == B_FALSE); - ASSERT(lr->lr_read_wanted == B_FALSE); - - /* create the rear proxy range lock */ - locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); - rear->lr_offset = off; - rear->lr_length = lr->lr_offset + lr->lr_length - off; - rear->lr_count = lr->lr_count; - rear->lr_type = RL_READER; - rear->lr_proxy = B_TRUE; - rear->lr_write_wanted = B_FALSE; - rear->lr_read_wanted = B_FALSE; - - locked_range_t *front = rangelock_proxify(tree, lr); - front->lr_length = off - lr->lr_offset; - - avl_insert_here(tree, rear, front, AVL_AFTER); - return (front); -} - -/* - * Create and add a new proxy range lock for the supplied range. - */ -static void -rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) -{ - ASSERT(len != 0); - locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_count = 1; - lr->lr_type = RL_READER; - lr->lr_proxy = B_TRUE; - lr->lr_write_wanted = B_FALSE; - lr->lr_read_wanted = B_FALSE; - avl_add(tree, lr); -} - -static void -rangelock_add_reader(avl_tree_t *tree, locked_range_t *new, - locked_range_t *prev, avl_index_t where) -{ - locked_range_t *next; - uint64_t off = new->lr_offset; - uint64_t len = new->lr_length; - - /* - * prev arrives either: - * - pointing to an entry at the same offset - * - pointing to the entry with the closest previous offset whose - * range may overlap with the new range - * - null, if there were no ranges starting before the new one - */ - if (prev != NULL) { - if (prev->lr_offset + prev->lr_length <= off) { - prev = NULL; - } else if (prev->lr_offset != off) { - /* - * convert to proxy if needed then - * split this entry and bump ref count - */ - prev = rangelock_split(tree, prev, off); - prev = AVL_NEXT(tree, prev); /* move to rear range */ - } - } - ASSERT((prev == NULL) || (prev->lr_offset == off)); - - if (prev != NULL) - next = prev; - else - next = avl_nearest(tree, where, AVL_AFTER); - - if (next == NULL || off + len <= next->lr_offset) { - /* no overlaps, use the original new rl_t in the tree */ - avl_insert(tree, new, where); - return; - } - - if (off < next->lr_offset) { - /* Add a proxy for initial range before the overlap */ - rangelock_new_proxy(tree, off, next->lr_offset - off); - } - - new->lr_count = 0; /* will use proxies in tree */ - /* - * We now search forward through the ranges, until we go past the end - * of the new range. 
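
A worked example of the proxify/split machinery above, with hypothetical offsets: suppose one reader already holds [0,100) and a second reader asks for [40,120).

        existing reader [0,100), count 1; new reader [40,120)

        rangelock_split(tree, lr, 40)    ->  proxy [0,40) count 1, proxy [40,100) count 1
        bump the overlapping proxy       ->  proxy [0,40) count 1, proxy [40,100) count 2
        rangelock_new_proxy(tree,100,20) ->  ... plus proxy [100,120) count 1

Each caller keeps its original (offset, length) handle, which rangelock_exit_reader() below uses to walk exactly these proxies and decrement or free them.
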
For each entry we make it a proxy if it - * isn't already, then bump its reference count. If there's any - * gaps between the ranges then we create a new proxy range. - */ - for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { - if (off + len <= next->lr_offset) - break; - if (prev != NULL && prev->lr_offset + prev->lr_length < - next->lr_offset) { - /* there's a gap */ - ASSERT3U(next->lr_offset, >, - prev->lr_offset + prev->lr_length); - rangelock_new_proxy(tree, - prev->lr_offset + prev->lr_length, - next->lr_offset - - (prev->lr_offset + prev->lr_length)); - } - if (off + len == next->lr_offset + next->lr_length) { - /* exact overlap with end */ - next = rangelock_proxify(tree, next); - next->lr_count++; - return; - } - if (off + len < next->lr_offset + next->lr_length) { - /* new range ends in the middle of this block */ - next = rangelock_split(tree, next, off + len); - next->lr_count++; - return; - } - ASSERT3U(off + len, >, next->lr_offset + next->lr_length); - next = rangelock_proxify(tree, next); - next->lr_count++; - } - - /* Add the remaining end range. */ - rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length, - (off + len) - (prev->lr_offset + prev->lr_length)); -} - -/* - * Check if a reader lock can be grabbed. If not, fail immediately or sleep and - * recheck until available, depending on the value of the "nonblock" parameter. - */ -static boolean_t -rangelock_enter_reader(rangelock_t *rl, locked_range_t *new, boolean_t nonblock) -{ - avl_tree_t *tree = &rl->rl_tree; - locked_range_t *prev, *next; - avl_index_t where; - uint64_t off = new->lr_offset; - uint64_t len = new->lr_length; - - /* - * Look for any writer locks in the range. - */ -retry: - prev = avl_find(tree, new, &where); - if (prev == NULL) - prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); - - /* - * Check the previous range for a writer lock overlap. - */ - if (prev && (off < prev->lr_offset + prev->lr_length)) { - if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) { - if (nonblock) - return (B_FALSE); - if (!prev->lr_read_wanted) { - cv_init(&prev->lr_read_cv, - NULL, CV_DEFAULT, NULL); - prev->lr_read_wanted = B_TRUE; - } - cv_wait(&prev->lr_read_cv, &rl->rl_lock); - goto retry; - } - if (off + len < prev->lr_offset + prev->lr_length) - goto got_lock; - } - - /* - * Search through the following ranges to see if there's - * write lock any overlap. - */ - if (prev != NULL) - next = AVL_NEXT(tree, prev); - else - next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); - for (; next != NULL; next = AVL_NEXT(tree, next)) { - if (off + len <= next->lr_offset) - goto got_lock; - if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) { - if (nonblock) - return (B_FALSE); - if (!next->lr_read_wanted) { - cv_init(&next->lr_read_cv, - NULL, CV_DEFAULT, NULL); - next->lr_read_wanted = B_TRUE; - } - cv_wait(&next->lr_read_cv, &rl->rl_lock); - goto retry; - } - if (off + len <= next->lr_offset + next->lr_length) - goto got_lock; - } - -got_lock: - /* - * Add the read lock, which may involve splitting existing - * locks and bumping ref counts (r_count). - */ - rangelock_add_reader(tree, new, prev, where); - return (B_TRUE); -} - -/* - * Lock a range (offset, length) as either shared (RL_READER) or exclusive - * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert - * it to a RL_WRITER lock (with the offset at the end of the file). 
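
A sketch of the kind of callback the writer path above relies on, converting RL_APPEND into a writer lock starting at end of file. This is hypothetical; the real ZPL callback also accounts for blocksize growth.

        static void
        example_rangelock_cb(locked_range_t *new, void *arg)
        {
                znode_t *zp = arg;

                if (new->lr_type == RL_APPEND) {
                        /* append: take the write lock from the current EOF onward */
                        new->lr_offset = zp->z_size;
                        new->lr_length = UINT64_MAX - new->lr_offset;
                        new->lr_type = RL_WRITER;
                }
        }

        /* registered once per locked object: */
        rangelock_init(rl, example_rangelock_cb, zp);
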
Returns - * the range lock structure for later unlocking (or reduce range if the - * entire file is locked as RL_WRITER). - */ -static locked_range_t * -_rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len, - rangelock_type_t type, boolean_t nonblock) -{ - ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); - - locked_range_t *new = kmem_alloc(sizeof (*new), KM_SLEEP); - new->lr_rangelock = rl; - new->lr_offset = off; - if (len + off < off) /* overflow */ - len = UINT64_MAX - off; - new->lr_length = len; - new->lr_count = 1; /* assume it's going to be in the tree */ - new->lr_type = type; - new->lr_proxy = B_FALSE; - new->lr_write_wanted = B_FALSE; - new->lr_read_wanted = B_FALSE; - - mutex_enter(&rl->rl_lock); - if (type == RL_READER) { - /* - * First check for the usual case of no locks - */ - if (avl_numnodes(&rl->rl_tree) == 0) { - avl_add(&rl->rl_tree, new); - } else if (!rangelock_enter_reader(rl, new, nonblock)) { - kmem_free(new, sizeof (*new)); - new = NULL; - } - } else if (!rangelock_enter_writer(rl, new, nonblock)) { - kmem_free(new, sizeof (*new)); - new = NULL; - } - mutex_exit(&rl->rl_lock); - return (new); -} - -locked_range_t * -rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len, - rangelock_type_t type) -{ - return (_rangelock_enter(rl, off, len, type, B_FALSE)); -} - -locked_range_t * -rangelock_tryenter(rangelock_t *rl, uint64_t off, uint64_t len, - rangelock_type_t type) -{ - return (_rangelock_enter(rl, off, len, type, B_TRUE)); -} - -/* - * Unlock a reader lock - */ -static void -rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove) -{ - avl_tree_t *tree = &rl->rl_tree; - uint64_t len; - - /* - * The common case is when the remove entry is in the tree - * (cnt == 1) meaning there's been no other reader locks overlapping - * with this one. Otherwise the remove entry will have been - * removed from the tree and replaced by proxies (one or - * more ranges mapping to the entire range). - */ - if (remove->lr_count == 1) { - avl_remove(tree, remove); - if (remove->lr_write_wanted) { - cv_broadcast(&remove->lr_write_cv); - cv_destroy(&remove->lr_write_cv); - } - if (remove->lr_read_wanted) { - cv_broadcast(&remove->lr_read_cv); - cv_destroy(&remove->lr_read_cv); - } - } else { - ASSERT0(remove->lr_count); - ASSERT0(remove->lr_write_wanted); - ASSERT0(remove->lr_read_wanted); - /* - * Find start proxy representing this reader lock, - * then decrement ref count on all proxies - * that make up this range, freeing them as needed. - */ - locked_range_t *lr = avl_find(tree, remove, NULL); - ASSERT3P(lr, !=, NULL); - ASSERT3U(lr->lr_count, !=, 0); - ASSERT3U(lr->lr_type, ==, RL_READER); - locked_range_t *next = NULL; - for (len = remove->lr_length; len != 0; lr = next) { - len -= lr->lr_length; - if (len != 0) { - next = AVL_NEXT(tree, lr); - ASSERT3P(next, !=, NULL); - ASSERT3U(lr->lr_offset + lr->lr_length, ==, - next->lr_offset); - ASSERT3U(next->lr_count, !=, 0); - ASSERT3U(next->lr_type, ==, RL_READER); - } - lr->lr_count--; - if (lr->lr_count == 0) { - avl_remove(tree, lr); - if (lr->lr_write_wanted) { - cv_broadcast(&lr->lr_write_cv); - cv_destroy(&lr->lr_write_cv); - } - if (lr->lr_read_wanted) { - cv_broadcast(&lr->lr_read_cv); - cv_destroy(&lr->lr_read_cv); - } - kmem_free(lr, sizeof (locked_range_t)); - } - } - } - kmem_free(remove, sizeof (locked_range_t)); -} - -/* - * Unlock range and destroy range lock structure. 
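
rangelock_tryenter() defined above is the non-blocking variant: it returns NULL instead of sleeping when the range is contended. A small usage sketch, error handling simplified:

        locked_range_t *lr;

        lr = rangelock_tryenter(rl, off, len, RL_READER);
        if (lr == NULL)
                return (SET_ERROR(EAGAIN));     /* a conflicting writer holds or wants the range */
        /* ... read the range ... */
        rangelock_exit(lr);
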
- */ -void -rangelock_exit(locked_range_t *lr) -{ - rangelock_t *rl = lr->lr_rangelock; - - ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER); - ASSERT(lr->lr_count == 1 || lr->lr_count == 0); - ASSERT(!lr->lr_proxy); - - mutex_enter(&rl->rl_lock); - if (lr->lr_type == RL_WRITER) { - /* writer locks can't be shared or split */ - avl_remove(&rl->rl_tree, lr); - mutex_exit(&rl->rl_lock); - if (lr->lr_write_wanted) { - cv_broadcast(&lr->lr_write_cv); - cv_destroy(&lr->lr_write_cv); - } - if (lr->lr_read_wanted) { - cv_broadcast(&lr->lr_read_cv); - cv_destroy(&lr->lr_read_cv); - } - kmem_free(lr, sizeof (locked_range_t)); - } else { - /* - * lock may be shared, let rangelock_exit_reader() - * release the lock and free the rl_t - */ - rangelock_exit_reader(rl, lr); - mutex_exit(&rl->rl_lock); - } -} - -/* - * Reduce range locked as RL_WRITER from whole file to specified range. - * Asserts the whole file is exclusively locked and so there's only one - * entry in the tree. - */ -void -rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len) -{ - rangelock_t *rl = lr->lr_rangelock; - - /* Ensure there are no other locks */ - ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1); - ASSERT3U(lr->lr_offset, ==, 0); - ASSERT3U(lr->lr_type, ==, RL_WRITER); - ASSERT(!lr->lr_proxy); - ASSERT3U(lr->lr_length, ==, UINT64_MAX); - ASSERT3U(lr->lr_count, ==, 1); - - mutex_enter(&rl->rl_lock); - lr->lr_offset = off; - lr->lr_length = len; - mutex_exit(&rl->rl_lock); - if (lr->lr_write_wanted) - cv_broadcast(&lr->lr_write_cv); - if (lr->lr_read_wanted) - cv_broadcast(&lr->lr_read_cv); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c +++ /dev/null @@ -1,326 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include -#include -#include -#include -#include - -/* - * ZPL attribute registration table. - * Order of attributes doesn't matter - * a unique value will be assigned for each - * attribute that is file system specific - * - * This is just the set of ZPL attributes that this - * version of ZFS deals with natively. The file system - * could have other attributes stored in files, but they will be - * ignored. The SA framework will preserve them, just that - * this version of ZFS won't change or delete them. 
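
The registration table that follows is handed to the SA framework at mount time, and individual attributes are then read through the per-mount attribute table. A sketch of both steps, using calls that appear elsewhere in this change (zfsvfs_init() and zfs_sa_get_scanstamp() below); error handling omitted:

        /* at mount (see zfsvfs_init() further below) */
        error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table);

        /* later, reading a single attribute of an in-core znode */
        uint64_t mode;
        error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode));
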
- */ - -sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { - {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, - {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, - {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, - {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, - {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, - {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, - {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, - {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, - {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, - {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, - {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, - {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, - {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, - {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, - {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, - {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, - {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, - {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, - {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, - {"ZPL_DACL_ACES", 0, SA_ACL, 0}, - {NULL, 0, 0, 0} -}; - -#ifdef _KERNEL - -int -zfs_sa_readlink(znode_t *zp, uio_t *uio) -{ - dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); - size_t bufsz; - int error; - - bufsz = zp->z_size; - if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { - error = uiomove((caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - } else { - dmu_buf_t *dbp; - if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id, - 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - dmu_buf_rele(dbp, FTAG); - } - } - return (error); -} - -void -zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) -{ - dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); - - if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { - VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx)); - if (len) { - bcopy(link, (caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, len); - } - } else { - dmu_buf_t *dbp; - - zfs_grow_blocksize(zp, len, tx); - VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os, - zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)); - - dmu_buf_will_dirty(dbp, tx); - - ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); - dmu_buf_rele(dbp, FTAG); - } -} - -void -zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - xoptattr_t *xoap; - - ASSERT_VOP_LOCKED(ZTOV(zp), __func__); - VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); - if (zp->z_is_sa) { - if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), - &xoap->xoa_av_scanstamp, - sizeof (xoap->xoa_av_scanstamp)) != 0) - return; - } else { - dmu_object_info_t doi; - dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); - int len; - - if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP)) - return; - - sa_object_info(zp->z_sa_hdl, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - ZFS_OLD_ZNODE_PHYS_SIZE; - - if (len <= doi.doi_bonus_size) { - (void) memcpy(xoap->xoa_av_scanstamp, - (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - sizeof (xoap->xoa_av_scanstamp)); - } - } - XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); -} - -void -zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - xoptattr_t *xoap; - - ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); - VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); - if (zp->z_is_sa) - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), - 
&xoap->xoa_av_scanstamp, - sizeof (xoap->xoa_av_scanstamp), tx)); - else { - dmu_object_info_t doi; - dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); - int len; - - sa_object_info(zp->z_sa_hdl, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - ZFS_OLD_ZNODE_PHYS_SIZE; - if (len > doi.doi_bonus_size) - VERIFY(dmu_set_bonus(db, len, tx) == 0); - (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); - - zp->z_pflags |= ZFS_BONUS_SCANSTAMP; - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - &zp->z_pflags, sizeof (uint64_t), tx)); - } -} - -/* - * I'm not convinced we should do any of this upgrade. - * since the SA code can read both old/new znode formats - * with probably little to no performance difference. - * - * All new files will be created with the new format. - */ - -void -zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) -{ - dmu_buf_t *db = sa_get_db(hdl); - znode_t *zp = sa_get_userdata(hdl); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - sa_bulk_attr_t bulk[20]; - int count = 0; - sa_bulk_attr_t sa_attrs[20] = { 0 }; - zfs_acl_locator_cb_t locate = { 0 }; - uint64_t uid, gid, mode, rdev, xattr, parent; - uint64_t crtime[2], mtime[2], ctime[2]; - zfs_acl_phys_t znode_acl; - char scanstamp[AV_SCANSTAMP_SZ]; - - /* - * No upgrade if ACL isn't cached - * since we won't know which locks are held - * and ready the ACL would require special "locked" - * interfaces that would be messy - */ - if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK) - return; - - /* - * If the vnode lock is held and we aren't the owner - * then just return since we don't want to deadlock - * trying to update the status of z_is_sa. This - * file can then be upgraded at a later time. - * - * Otherwise, we know we are doing the - * sa_update() that caused us to enter this function. 
- */ - if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0) - return; - - /* First do a bulk query of the attributes that aren't cached */ - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, - &znode_acl, 88); - - if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) - goto done; - - - /* - * While the order here doesn't matter its best to try and organize - * it is such a way to pick up an already existing layout number - */ - count = 0; - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs), - NULL, &zp->z_gen, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs), - NULL, &parent, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, - zp->z_atime, 16); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, - &mtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, 16); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, - &crtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, - &zp->z_links, 8); - if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR) - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, - &rdev, 8); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, - &zp->z_acl_cached->z_acl_count, 8); - - if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) - zfs_acl_xform(zp, zp->z_acl_cached, CRED()); - - locate.cb_aclp = zp->z_acl_cached; - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs), - zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); - - if (xattr) - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs), - NULL, &xattr, 8); - - /* if scanstamp then add scanstamp */ - - if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { - bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - scanstamp, AV_SCANSTAMP_SZ); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), - NULL, scanstamp, AV_SCANSTAMP_SZ); - zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; - } - - VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); - VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, - count, tx) == 0); - if (znode_acl.z_acl_extern_obj) - VERIFY(0 == dmu_object_free(zfsvfs->z_os, - znode_acl.z_acl_extern_obj, tx)); - - zp->z_is_sa = B_TRUE; -done: - VOP_UNLOCK(ZTOV(zp)); -} - -void -zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) -{ - if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa) - return; - - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - - if (zfs_external_acl(zp)) { - dmu_tx_hold_free(tx, zfs_external_acl(zp), 0, - DMU_OBJECT_END); - 
} -} - -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ /dev/null @@ -1,2799 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek . - * All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_comutil.h" - -struct mtx zfs_debug_mtx; -MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); - -SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS file system"); - -int zfs_super_owner; -SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, - "File system owner can perform privileged operation on his file systems"); - -int zfs_debug_level; -SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, - "Debug level"); - -SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, - "ZFS versions"); -static int zfs_version_acl = ZFS_ACL_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, - "ZFS_ACL_VERSION"); -static int zfs_version_spa = SPA_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, - "SPA_VERSION"); -static int zfs_version_zpl = ZPL_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, - "ZPL_VERSION"); - -static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); -static int zfs_mount(vfs_t *vfsp); -static int zfs_umount(vfs_t *vfsp, int fflag); -static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); -static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); -static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); -static int zfs_sync(vfs_t *vfsp, int waitfor); -static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, - struct ucred **credanonp, int *numsecflavors, int *secflavors); -static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); -static void zfs_objset_close(zfsvfs_t *zfsvfs); -static void 
zfs_freevfs(vfs_t *vfsp); - -struct vfsops zfs_vfsops = { - .vfs_mount = zfs_mount, - .vfs_unmount = zfs_umount, - .vfs_root = vfs_cache_root, - .vfs_cachedroot = zfs_root, - .vfs_statfs = zfs_statfs, - .vfs_vget = zfs_vget, - .vfs_sync = zfs_sync, - .vfs_checkexp = zfs_checkexp, - .vfs_fhtovp = zfs_fhtovp, - .vfs_quotactl = zfs_quotactl, -}; - -VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); - -/* - * We need to keep a count of active fs's. - * This is necessary to prevent our module - * from being unloaded after a umount -f - */ -static uint32_t zfs_active_fs_count = 0; - -static int -zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) -{ - int error = 0; - char buf[32]; - int err; - uint64_t usedobj, quotaobj; - uint64_t quota, used = 0; - timespec_t now; - - usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; - quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; - - if (quotaobj == 0 || zfsvfs->z_replay) { - error = EINVAL; - goto done; - } - (void)sprintf(buf, "%llx", (longlong_t)id); - if ((error = zap_lookup(zfsvfs->z_os, quotaobj, - buf, sizeof(quota), 1, "a)) != 0) { - dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__); - goto done; - } - /* - * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". - * So we set them to be the same. - */ - dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); - error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof(used), 1, &used); - if (error && error != ENOENT) { - dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error); - goto done; - } - dqp->dqb_curblocks = btodb(used); - dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; - vfs_timestamp(&now); - /* - * Setting this to 0 causes FreeBSD quota(8) to print - * the number of days since the epoch, which isn't - * particularly useful. - */ - dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; -done: - return (error); -} - -static int -zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - struct thread *td; - int cmd, type, error = 0; - int bitsize; - uint64_t fuid; - zfs_userquota_prop_t quota_type; - struct dqblk64 dqblk = { 0 }; - - td = curthread; - cmd = cmds >> SUBCMDSHIFT; - type = cmds & SUBCMDMASK; - - ZFS_ENTER(zfsvfs); - if (id == -1) { - switch (type) { - case USRQUOTA: - id = td->td_ucred->cr_ruid; - break; - case GRPQUOTA: - id = td->td_ucred->cr_rgid; - break; - default: - error = EINVAL; - if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) - vfs_unbusy(vfsp); - goto done; - } - } - /* - * Map BSD type to: - * ZFS_PROP_USERUSED, - * ZFS_PROP_USERQUOTA, - * ZFS_PROP_GROUPUSED, - * ZFS_PROP_GROUPQUOTA - */ - switch (cmd) { - case Q_SETQUOTA: - case Q_SETQUOTA32: - if (type == USRQUOTA) - quota_type = ZFS_PROP_USERQUOTA; - else if (type == GRPQUOTA) - quota_type = ZFS_PROP_GROUPQUOTA; - else - error = EINVAL; - break; - case Q_GETQUOTA: - case Q_GETQUOTA32: - if (type == USRQUOTA) - quota_type = ZFS_PROP_USERUSED; - else if (type == GRPQUOTA) - quota_type = ZFS_PROP_GROUPUSED; - else - error = EINVAL; - break; - } - - /* - * Depending on the cmd, we may need to get - * the ruid and domain (see fuidstr_to_sid?), - * the fuid (how?), or other information. - * Create fuid using zfs_fuid_create(zfsvfs, id, - * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? - * I think I can use just the id? - * - * Look at zfs_fuid_overquota() to look up a quota. - * zap_lookup(something, quotaobj, fuidstring, sizeof(long long), 1, "a) - * - * See zfs_set_userquota() to set a quota. 
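
zfs_getquota() above shows the on-disk convention: per-user and per-group quotas and usage live in ZAP objects keyed by the FUID printed as a bare hex string. A condensed sketch of the lookup, error handling omitted:

        char buf[32];
        uint64_t quota, used;

        (void) sprintf(buf, "%llx", (longlong_t)id);                   /* e.g. "3e9" for uid 1001 */
        (void) zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);  /* bytes allowed */
        (void) zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);    /* bytes charged */
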
- */ - if ((u_int)type >= MAXQUOTAS) { - error = EINVAL; - goto done; - } - - switch (cmd) { - case Q_GETQUOTASIZE: - bitsize = 64; - error = copyout(&bitsize, arg, sizeof(int)); - break; - case Q_QUOTAON: - // As far as I can tell, you can't turn quotas on or off on zfs - error = 0; - vfs_unbusy(vfsp); - break; - case Q_QUOTAOFF: - error = ENOTSUP; - vfs_unbusy(vfsp); - break; - case Q_SETQUOTA: - error = copyin(arg, &dqblk, sizeof(dqblk)); - if (error == 0) - error = zfs_set_userquota(zfsvfs, quota_type, - "", id, dbtob(dqblk.dqb_bhardlimit)); - break; - case Q_GETQUOTA: - error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); - if (error == 0) - error = copyout(&dqblk, arg, sizeof(dqblk)); - break; - default: - error = EINVAL; - break; - } -done: - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -static int -zfs_sync(vfs_t *vfsp, int waitfor) -{ - - /* - * Data integrity is job one. We don't want a compromised kernel - * writing to the storage pool, so we never sync during panic. - */ - if (KERNEL_PANICKED()) - return (0); - - /* - * Ignore the system syncher. ZFS already commits async data - * at zfs_txg_timeout intervals. - */ - if (waitfor == MNT_LAZY) - return (0); - - if (vfsp != NULL) { - /* - * Sync a specific filesystem. - */ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - dsl_pool_t *dp; - int error; - - error = vfs_stdsync(vfsp, waitfor); - if (error != 0) - return (error); - - ZFS_ENTER(zfsvfs); - dp = dmu_objset_pool(zfsvfs->z_os); - - /* - * If the system is shutting down, then skip any - * filesystems which may exist on a suspended pool. - */ - if (sys_shutdown && spa_suspended(dp->dp_spa)) { - ZFS_EXIT(zfsvfs); - return (0); - } - - if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, 0); - - ZFS_EXIT(zfsvfs); - } else { - /* - * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the - * request by waiting for all pools to commit all dirty data. - */ - spa_sync_allpools(); - } - - return (0); -} - -#ifndef __FreeBSD_kernel__ -static int -zfs_create_unique_device(dev_t *dev) -{ - major_t new_major; - - do { - ASSERT3U(zfs_minor, <=, MAXMIN32); - minor_t start = zfs_minor; - do { - mutex_enter(&zfs_dev_mtx); - if (zfs_minor >= MAXMIN32) { - /* - * If we're still using the real major - * keep out of /dev/zfs and /dev/zvol minor - * number space. If we're using a getudev()'ed - * major number, we can use all of its minors. - */ - if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) - zfs_minor = ZFS_MIN_MINOR; - else - zfs_minor = 0; - } else { - zfs_minor++; - } - *dev = makedevice(zfs_major, zfs_minor); - mutex_exit(&zfs_dev_mtx); - } while (vfs_devismounted(*dev) && zfs_minor != start); - if (zfs_minor == start) { - /* - * We are using all ~262,000 minor numbers for the - * current major number. Create a new major number. 
- */ - if ((new_major = getudev()) == (major_t)-1) { - cmn_err(CE_WARN, - "zfs_mount: Can't get unique major " - "device number."); - return (-1); - } - mutex_enter(&zfs_dev_mtx); - zfs_major = new_major; - zfs_minor = 0; - - mutex_exit(&zfs_dev_mtx); - } else { - break; - } - /* CONSTANTCONDITION */ - } while (1); - - return (0); -} -#endif /* !__FreeBSD_kernel__ */ - -static void -atime_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == TRUE) { - zfsvfs->z_atime = TRUE; - zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); - } else { - zfsvfs->z_atime = FALSE; - zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); - } -} - -static void -xattr_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == TRUE) { - /* XXX locking on vfs_flag? */ -#ifdef TODO - zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; -#endif - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); - } else { - /* XXX locking on vfs_flag? */ -#ifdef TODO - zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; -#endif - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); - } -} - -static void -blksz_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); - ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); - ASSERT(ISP2(newval)); - - zfsvfs->z_max_blksz = newval; - zfsvfs->z_vfs->mnt_stat.f_iosize = newval; -} - -static void -readonly_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval) { - /* XXX locking on vfs_flag? */ - zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); - } else { - /* XXX locking on vfs_flag? */ - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); - } -} - -static void -setuid_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); - } -} - -static void -exec_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); - } -} - -/* - * The nbmand mount option can be changed at mount time. 
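
blksz_changed_cb() above asserts that the new recordsize is a power of two with ISP2(); that macro is the usual bit trick, shown standalone here as a plain C illustration:

        #include <assert.h>
        #include <stdint.h>

        /* same test as the ISP2() macro from sys/sysmacros.h */
        #define ISP2(x) (((x) & ((x) - 1)) == 0)

        int
        main(void)
        {
                assert(ISP2((uint64_t)131072));    /* 128K, the default recordsize */
                assert(!ISP2((uint64_t)100000));
                return (0);
        }
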
- * We can't allow it to be toggled on live file systems or incorrect - * behavior may be seen from cifs clients - * - * This property isn't registered via dsl_prop_register(), but this callback - * will be called when a file system is first mounted - */ -static void -nbmand_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - if (newval == FALSE) { - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); - } else { - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); - } -} - -static void -snapdir_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_show_ctldir = newval; -} - -static void -vscan_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_vscan = newval; -} - -static void -acl_mode_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_mode = newval; -} - -static void -acl_inherit_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - zfsvfs->z_acl_inherit = newval; -} - -static int -zfs_register_callbacks(vfs_t *vfsp) -{ - struct dsl_dataset *ds = NULL; - objset_t *os = NULL; - zfsvfs_t *zfsvfs = NULL; - uint64_t nbmand; - boolean_t readonly = B_FALSE; - boolean_t do_readonly = B_FALSE; - boolean_t setuid = B_FALSE; - boolean_t do_setuid = B_FALSE; - boolean_t exec = B_FALSE; - boolean_t do_exec = B_FALSE; -#ifdef illumos - boolean_t devices = B_FALSE; - boolean_t do_devices = B_FALSE; -#endif - boolean_t xattr = B_FALSE; - boolean_t do_xattr = B_FALSE; - boolean_t atime = B_FALSE; - boolean_t do_atime = B_FALSE; - int error = 0; - - ASSERT(vfsp); - zfsvfs = vfsp->vfs_data; - ASSERT(zfsvfs); - os = zfsvfs->z_os; - - /* - * This function can be called for a snapshot when we update snapshot's - * mount point, which isn't really supported. - */ - if (dmu_objset_is_snapshot(os)) - return (EOPNOTSUPP); - - /* - * The act of registering our callbacks will destroy any mount - * options we may have. In order to enable temporary overrides - * of mount options, we stash away the current values and - * restore them after we register the callbacks. - */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || - !spa_writeable(dmu_objset_spa(os))) { - readonly = B_TRUE; - do_readonly = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { - readonly = B_FALSE; - do_readonly = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - setuid = B_FALSE; - do_setuid = B_TRUE; - } else { - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { - setuid = B_FALSE; - do_setuid = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { - setuid = B_TRUE; - do_setuid = B_TRUE; - } - } - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { - exec = B_FALSE; - do_exec = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { - exec = B_TRUE; - do_exec = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { - xattr = B_FALSE; - do_xattr = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { - xattr = B_TRUE; - do_xattr = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { - atime = B_FALSE; - do_atime = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { - atime = B_TRUE; - do_atime = B_TRUE; - } - - /* - * We need to enter pool configuration here, so that we can use - * dsl_prop_get_int_ds() to handle the special nbmand property below. 
- * dsl_prop_get_integer() can not be used, because it has to acquire - * spa_namespace_lock and we can not do that because we already hold - * z_teardown_lock. The problem is that spa_write_cachefile() is called - * with spa_namespace_lock held and the function calls ZFS vnode - * operations to write the cache file and thus z_teardown_lock is - * acquired after spa_namespace_lock. - */ - ds = dmu_objset_ds(os); - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - - /* - * nbmand is a special property. It can only be changed at - * mount time. - * - * This is weird, but it is documented to only be changeable - * at mount time. - */ - if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { - nbmand = B_FALSE; - } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { - nbmand = B_TRUE; - } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) { - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - return (error); - } - - /* - * Register property callbacks. - * - * It would probably be fine to just check for i/o error from - * the first prop_register(), but I guess I like to go - * overboard... - */ - error = dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); -#ifdef illumos - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); -#endif - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, - zfsvfs); - error = error ? error : dsl_prop_register(ds, - zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (error) - goto unregister; - - /* - * Invoke our callbacks to restore temporary mount options. - */ - if (do_readonly) - readonly_changed_cb(zfsvfs, readonly); - if (do_setuid) - setuid_changed_cb(zfsvfs, setuid); - if (do_exec) - exec_changed_cb(zfsvfs, exec); - if (do_xattr) - xattr_changed_cb(zfsvfs, xattr); - if (do_atime) - atime_changed_cb(zfsvfs, atime); - - nbmand_changed_cb(zfsvfs, nbmand); - - return (0); - -unregister: - dsl_prop_unregister_all(ds, zfsvfs); - return (error); -} - -static int -zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp) -{ - /* - * Is it a valid type of object to track? 
- */ - if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) - return (SET_ERROR(ENOENT)); - - /* - * If we have a NULL data pointer - * then assume the id's aren't changing and - * return EEXIST to the dmu to let it know to - * use the same ids - */ - if (data == NULL) - return (SET_ERROR(EEXIST)); - - if (bonustype == DMU_OT_ZNODE) { - znode_phys_t *znp = data; - *userp = znp->zp_uid; - *groupp = znp->zp_gid; - } else { - int hdrsize; - sa_hdr_phys_t *sap = data; - sa_hdr_phys_t sa = *sap; - boolean_t swap = B_FALSE; - - ASSERT(bonustype == DMU_OT_SA); - - if (sa.sa_magic == 0) { - /* - * This should only happen for newly created - * files that haven't had the znode data filled - * in yet. - */ - *userp = 0; - *groupp = 0; - return (0); - } - if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { - sa.sa_magic = SA_MAGIC; - sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); - swap = B_TRUE; - } else { - VERIFY3U(sa.sa_magic, ==, SA_MAGIC); - } - - hdrsize = sa_hdrsize(&sa); - VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); - *userp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_UID_OFFSET)); - *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_GID_OFFSET)); - if (swap) { - *userp = BSWAP_64(*userp); - *groupp = BSWAP_64(*groupp); - } - } - return (0); -} - -static void -fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, - char *domainbuf, int buflen, uid_t *ridp) -{ - uint64_t fuid; - const char *domain; - - fuid = zfs_strtonum(fuidstr, NULL); - - domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); - if (domain) - (void) strlcpy(domainbuf, domain, buflen); - else - domainbuf[0] = '\0'; - *ridp = FUID_RID(fuid); -} - -static uint64_t -zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) -{ - switch (type) { - case ZFS_PROP_USERUSED: - return (DMU_USERUSED_OBJECT); - case ZFS_PROP_GROUPUSED: - return (DMU_GROUPUSED_OBJECT); - case ZFS_PROP_USERQUOTA: - return (zfsvfs->z_userquota_obj); - case ZFS_PROP_GROUPQUOTA: - return (zfsvfs->z_groupquota_obj); - } - return (0); -} - -int -zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) -{ - int error; - zap_cursor_t zc; - zap_attribute_t za; - zfs_useracct_t *buf = vbuf; - uint64_t obj; - - if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - obj = zfs_userquota_prop_to_obj(zfsvfs, type); - if (obj == 0) { - *bufsizep = 0; - return (0); - } - - for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); - (error = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > - *bufsizep) - break; - - fuidstr_to_sid(zfsvfs, za.za_name, - buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); - - buf->zu_space = za.za_first_integer; - buf++; - } - if (error == ENOENT) - error = 0; - - ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); - *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; - *cookiep = zap_cursor_serialize(&zc); - zap_cursor_fini(&zc); - return (error); -} - -/* - * buf must be big enough (eg, 32 bytes) - */ -static int -id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, - char *buf, boolean_t addok) -{ - uint64_t fuid; - int domainid = 0; - - if (domain && domain[0]) { - domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); - if (domainid == -1) - return (SET_ERROR(ENOENT)); - } - fuid = FUID_ENCODE(domainid, rid); - (void) sprintf(buf, "%llx", (longlong_t)fuid); - return (0); -} - -int -zfs_userspace_one(zfsvfs_t 
*zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t *valp) -{ - char buf[32]; - int err; - uint64_t obj; - - *valp = 0; - - if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (SET_ERROR(ENOTSUP)); - - obj = zfs_userquota_prop_to_obj(zfsvfs, type); - if (obj == 0) - return (0); - - err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); - if (err) - return (err); - - err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); - if (err == ENOENT) - err = 0; - return (err); -} - -int -zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t quota) -{ - char buf[32]; - int err; - dmu_tx_t *tx; - uint64_t *objp; - boolean_t fuid_dirtied; - - if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) - return (SET_ERROR(EINVAL)); - - if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) - return (SET_ERROR(ENOTSUP)); - - objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : - &zfsvfs->z_groupquota_obj; - - err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); - if (err) - return (err); - fuid_dirtied = zfsvfs->z_fuid_dirty; - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); - if (*objp == 0) { - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, - zfs_userquota_prop_prefixes[type]); - } - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - - mutex_enter(&zfsvfs->z_lock); - if (*objp == 0) { - *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, - DMU_OT_NONE, 0, tx); - VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); - } - mutex_exit(&zfsvfs->z_lock); - - if (quota == 0) { - err = zap_remove(zfsvfs->z_os, *objp, buf, tx); - if (err == ENOENT) - err = 0; - } else { - err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); - } - ASSERT(err == 0); - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - dmu_tx_commit(tx); - return (err); -} - -boolean_t -zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) -{ - char buf[32]; - uint64_t used, quota, usedobj, quotaobj; - int err; - - usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; - quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; - - if (quotaobj == 0 || zfsvfs->z_replay) - return (B_FALSE); - - (void) sprintf(buf, "%llx", (longlong_t)fuid); - err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); - if (err != 0) - return (B_FALSE); - - err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); - if (err != 0) - return (B_FALSE); - return (used >= quota); -} - -boolean_t -zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) -{ - uint64_t fuid; - uint64_t quotaobj; - - quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; - - fuid = isgroup ? zp->z_gid : zp->z_uid; - - if (quotaobj == 0 || zfsvfs->z_replay) - return (B_FALSE); - - return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); -} - -/* - * Associate this zfsvfs with the given objset, which must be owned. - * This will cache a bunch of on-disk state from the objset in the - * zfsvfs. 
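
The FUID values used as ZAP keys throughout the quota code above pack a 32-bit domain-table index and a 32-bit RID into one 64-bit integer (the FUID_ENCODE()/FUID_INDEX()/FUID_RID() macros). A standalone illustration:

        #include <inttypes.h>
        #include <stdint.h>
        #include <stdio.h>

        /* upper 32 bits: domain table index; lower 32 bits: Windows-style RID */
        static uint64_t
        fuid_encode(uint32_t idx, uint32_t rid)
        {
                return (((uint64_t)idx << 32) | rid);
        }

        int
        main(void)
        {
                uint64_t fuid = fuid_encode(3, 1001);

                printf("zap key: %llx\n", (unsigned long long)fuid);   /* same %llx key format as above */
                printf("index %" PRIu64 ", rid %" PRIu64 "\n", fuid >> 32, fuid & 0xffffffffULL);
                return (0);
        }
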
- */ -static int -zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) -{ - int error; - uint64_t val; - - zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - zfsvfs->z_os = os; - - error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); - if (error != 0) - return (error); - if (zfsvfs->z_version > - zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { - (void) printf("Can't mount a version %lld file system " - "on a version %lld pool\n. Pool must be upgraded to mount " - "this file system.", (u_longlong_t)zfsvfs->z_version, - (u_longlong_t)spa_version(dmu_objset_spa(os))); - return (SET_ERROR(ENOTSUP)); - } - error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); - if (error != 0) - return (error); - zfsvfs->z_norm = (int)val; - - error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); - if (error != 0) - return (error); - zfsvfs->z_utf8 = (val != 0); - - error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); - if (error != 0) - return (error); - zfsvfs->z_case = (uint_t)val; - - /* - * Fold case on file systems that are always or sometimes case - * insensitive. - */ - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - zfsvfs->z_case == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; - - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); - - uint64_t sa_obj = 0; - if (zfsvfs->z_use_sa) { - /* should either have both of these objects or none */ - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, - &sa_obj); - if (error != 0) - return (error); - } - - error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, - &zfsvfs->z_attr_table); - if (error != 0) - return (error); - - if (zfsvfs->z_version >= ZPL_VERSION_SA) - sa_register_update_callback(os, zfs_sa_upgrade); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, - &zfsvfs->z_root); - if (error != 0) - return (error); - ASSERT(zfsvfs->z_root != 0); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], - 8, 1, &zfsvfs->z_userquota_obj); - if (error == ENOENT) - zfsvfs->z_userquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], - 8, 1, &zfsvfs->z_groupquota_obj); - if (error == ENOENT) - zfsvfs->z_groupquota_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, - &zfsvfs->z_fuid_obj); - if (error == ENOENT) - zfsvfs->z_fuid_obj = 0; - else if (error != 0) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, - &zfsvfs->z_shares_dir); - if (error == ENOENT) - zfsvfs->z_shares_dir = 0; - else if (error != 0) - return (error); - - /* - * Only use the name cache if we are looking for a - * name on a file system that does not require normalization - * or case folding. We can also look there if we happen to be - * on a non-normalizing, mixed sensitivity file system IF we - * are looking for the exact name (which is always the case on - * FreeBSD). 
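
zfsvfs_init() above repeats one idiom several times: look up an optional master-node key and treat ENOENT as "object not present, use 0". A hypothetical helper capturing that pattern:

        static int
        zap_lookup_default0(objset_t *os, uint64_t obj, const char *name, uint64_t *valp)
        {
                int error = zap_lookup(os, obj, name, 8, 1, valp);

                if (error == ENOENT) {
                        *valp = 0;      /* e.g. no user/group quota object yet */
                        error = 0;
                }
                return (error);
        }
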
- */ - zfsvfs->z_use_namecache = !zfsvfs->z_norm || - ((zfsvfs->z_case == ZFS_CASE_MIXED) && - !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); - - return (0); -} - -#if defined(__FreeBSD__) -taskq_t *zfsvfs_taskq; - -static void -zfsvfs_task_unlinked_drain(void *context, int pending __unused) -{ - - zfs_unlinked_drain((zfsvfs_t *)context); -} -#endif - -int -zfsvfs_create(const char *osname, zfsvfs_t **zfvp) -{ - objset_t *os; - zfsvfs_t *zfsvfs; - int error; - - /* - * XXX: Fix struct statfs so this isn't necessary! - * - * The 'osname' is used as the filesystem's special node, which means - * it must fit in statfs.f_mntfromname, or else it can't be - * enumerated, so libzfs_mnttab_find() returns NULL, which causes - * 'zfs unmount' to think it's not mounted when it is. - */ - if (strlen(osname) >= MNAMELEN) - return (SET_ERROR(ENAMETOOLONG)); - - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - - /* - * We claim to always be readonly so we can open snapshots; - * other ZPL code will prevent us from writing to snapshots. - */ - - error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); - if (error != 0) { - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - return (error); - } - - error = zfsvfs_create_impl(zfvp, zfsvfs, os); - if (error != 0) { - dmu_objset_disown(os, zfsvfs); - } - return (error); -} - - -int -zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) -{ - int error; - - zfsvfs->z_vfs = NULL; - zfsvfs->z_parent = zfsvfs; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); -#if defined(__FreeBSD__) - TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, - zfsvfs_task_unlinked_drain, zfsvfs); -#endif -#ifdef DIAGNOSTIC - rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); -#else - rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); -#endif - rms_init(&zfsvfs->z_teardown_inactive_lock, "zfs teardown inactive"); - rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); - for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); - - error = zfsvfs_init(zfsvfs, os); - if (error != 0) { - *zfvp = NULL; - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - return (error); - } - - *zfvp = zfsvfs; - return (0); -} - -static int -zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) -{ - int error; - - error = zfs_register_callbacks(zfsvfs->z_vfs); - if (error) - return (error); - - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - - /* - * If we are not mounting (ie: online recv), then we don't - * have to worry about replaying the log as we blocked all - * operations out since we closed the ZIL. - */ - if (mounting) { - boolean_t readonly; - - /* - * During replay we remove the read only flag to - * allow replays to succeed. - */ - readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; - if (readonly != 0) - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - else - zfs_unlinked_drain(zfsvfs); - - /* - * Parse and replay the intent log. - * - * Because of ziltest, this must be done after - * zfs_unlinked_drain(). (Further note: ziltest - * doesn't use readonly mounts, where - * zfs_unlinked_drain() isn't called.) This is because - * ziltest causes spa_sync() to think it's committed, - * but actually it is not, so the intent log contains - * many txg's worth of changes. 
- * - * In particular, if object N is in the unlinked set in - * the last txg to actually sync, then it could be - * actually freed in a later txg and then reallocated - * in a yet later txg. This would write a "create - * object N" record to the intent log. Normally, this - * would be fine because the spa_sync() would have - * written out the fact that object N is free, before - * we could write the "create object N" intent log - * record. - * - * But when we are in ziltest mode, we advance the "open - * txg" without actually spa_sync()-ing the changes to - * disk. So we would see that object N is still - * allocated and in the unlinked set, and there is an - * intent log record saying to allocate it. - */ - if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { - if (zil_replay_disable) { - zil_destroy(zfsvfs->z_log, B_FALSE); - } else { - zfsvfs->z_replay = B_TRUE; - zil_replay(zfsvfs->z_os, zfsvfs, - zfs_replay_vector); - zfsvfs->z_replay = B_FALSE; - } - } - zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ - } - - /* - * Set the objset user_ptr to track its zfsvfs. - */ - mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); - dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); - - return (0); -} - -extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ - -void -zfsvfs_free(zfsvfs_t *zfsvfs) -{ - int i; - - /* - * This is a barrier to prevent the filesystem from going away in - * zfs_znode_move() until we can safely ensure that the filesystem is - * not unmounted. We consider the filesystem valid before the barrier - * and invalid after the barrier. - */ - rw_enter(&zfsvfs_lock, RW_READER); - rw_exit(&zfsvfs_lock); - - zfs_fuid_destroy(zfsvfs); - - mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_lock); - list_destroy(&zfsvfs->z_all_znodes); - rrm_destroy(&zfsvfs->z_teardown_lock); - rms_destroy(&zfsvfs->z_teardown_inactive_lock); - rw_destroy(&zfsvfs->z_fuid_lock); - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); -} - -static void -zfs_set_fuid_feature(zfsvfs_t *zfsvfs) -{ - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_vfs) { - if (zfsvfs->z_use_fuids) { - vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); - } else { - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); - vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); - } - } - zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); -} - -static int -zfs_domount(vfs_t *vfsp, char *osname) -{ - uint64_t recordsize, fsid_guid; - int error = 0; - zfsvfs_t *zfsvfs; - vnode_t *vp; - - ASSERT(vfsp); - ASSERT(osname); - - error = zfsvfs_create(osname, &zfsvfs); - if (error) - return (error); - zfsvfs->z_vfs = vfsp; - -#ifdef illumos - /* Initialize the generic filesystem structure. 
*/ - vfsp->vfs_bcount = 0; - vfsp->vfs_data = NULL; - - if (zfs_create_unique_device(&mount_dev) == -1) { - error = SET_ERROR(ENODEV); - goto out; - } - ASSERT(vfs_devismounted(mount_dev) == 0); -#endif - - if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, - NULL)) - goto out; - zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; - zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; - - vfsp->vfs_data = zfsvfs; - vfsp->mnt_flag |= MNT_LOCAL; -#if defined(_KERNEL) && !defined(KMEM_DEBUG) - vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; -#endif - vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; - vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; - vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; - vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ - vfsp->mnt_kern_flag |= MNTK_NOMSYNC; - vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; - - /* - * The fsid is 64 bits, composed of an 8-bit fs type, which - * separates our fsid from any other filesystem types, and a - * 56-bit objset unique ID. The objset unique ID is unique to - * all objsets open on this system, provided by unique_create(). - * The 8-bit fs type must be put in the low bits of fsid[1] - * because that's where other Solaris filesystems put it. - */ - fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); - ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); - vfsp->vfs_fsid.val[0] = fsid_guid; - vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | - vfsp->mnt_vfc->vfc_typenum & 0xFF; - - /* - * Set features for file system. - */ - zfs_set_fuid_feature(zfsvfs); - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); - vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); - vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); - } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { - vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); - vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); - } - vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); - - if (dmu_objset_is_snapshot(zfsvfs->z_os)) { - uint64_t pval; - - atime_changed_cb(zfsvfs, B_FALSE); - readonly_changed_cb(zfsvfs, B_TRUE); - if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) - goto out; - xattr_changed_cb(zfsvfs, pval); - zfsvfs->z_issnap = B_TRUE; - zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; - - mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); - dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); - } else { - error = zfsvfs_setup(zfsvfs, B_TRUE); - } - - vfs_mountedfrom(vfsp, osname); - - if (!zfsvfs->z_issnap) - zfsctl_create(zfsvfs); -out: - if (error) { - dmu_objset_disown(zfsvfs->z_os, zfsvfs); - zfsvfs_free(zfsvfs); - } else { - atomic_inc_32(&zfs_active_fs_count); - } - - return (error); -} - -void -zfs_unregister_callbacks(zfsvfs_t *zfsvfs) -{ - objset_t *os = zfsvfs->z_os; - - if (!dmu_objset_is_snapshot(os)) - dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); -} - -#ifdef SECLABEL -/* - * Convert a decimal digit string to a uint64_t integer. - */ -static int -str_to_uint64(char *str, uint64_t *objnum) -{ - uint64_t num = 0; - - while (*str) { - if (*str < '0' || *str > '9') - return (SET_ERROR(EINVAL)); - - num = num*10 + *str++ - '0'; - } - - *objnum = num; - return (0); -} - -/* - * The boot path passed from the boot loader is in the form of - * "rootpool-name/root-filesystem-object-number'. Convert this - * string to a dataset name: "rootpool-name/root-filesystem-name". 
- */ -static int -zfs_parse_bootfs(char *bpath, char *outpath) -{ - char *slashp; - uint64_t objnum; - int error; - - if (*bpath == 0 || *bpath == '/') - return (SET_ERROR(EINVAL)); - - (void) strcpy(outpath, bpath); - - slashp = strchr(bpath, '/'); - - /* if no '/', just return the pool name */ - if (slashp == NULL) { - return (0); - } - - /* if not a number, just return the root dataset name */ - if (str_to_uint64(slashp+1, &objnum)) { - return (0); - } - - *slashp = '\0'; - error = dsl_dsobj_to_dsname(bpath, objnum, outpath); - *slashp = '/'; - - return (error); -} - -/* - * Check that the hex label string is appropriate for the dataset being - * mounted into the global_zone proper. - * - * Return an error if the hex label string is not default or - * admin_low/admin_high. For admin_low labels, the corresponding - * dataset must be readonly. - */ -int -zfs_check_global_label(const char *dsname, const char *hexsl) -{ - if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) - return (0); - if (strcasecmp(hexsl, ADMIN_HIGH) == 0) - return (0); - if (strcasecmp(hexsl, ADMIN_LOW) == 0) { - /* must be readonly */ - uint64_t rdonly; - - if (dsl_prop_get_integer(dsname, - zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) - return (SET_ERROR(EACCES)); - return (rdonly ? 0 : EACCES); - } - return (SET_ERROR(EACCES)); -} - -/* - * Determine whether the mount is allowed according to MAC check. - * by comparing (where appropriate) label of the dataset against - * the label of the zone being mounted into. If the dataset has - * no label, create one. - * - * Returns 0 if access allowed, error otherwise (e.g. EACCES) - */ -static int -zfs_mount_label_policy(vfs_t *vfsp, char *osname) -{ - int error, retv; - zone_t *mntzone = NULL; - ts_label_t *mnt_tsl; - bslabel_t *mnt_sl; - bslabel_t ds_sl; - char ds_hexsl[MAXNAMELEN]; - - retv = EACCES; /* assume the worst */ - - /* - * Start by getting the dataset label if it exists. - */ - error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), - 1, sizeof (ds_hexsl), &ds_hexsl, NULL); - if (error) - return (SET_ERROR(EACCES)); - - /* - * If labeling is NOT enabled, then disallow the mount of datasets - * which have a non-default label already. No other label checks - * are needed. - */ - if (!is_system_labeled()) { - if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) - return (0); - return (SET_ERROR(EACCES)); - } - - /* - * Get the label of the mountpoint. If mounting into the global - * zone (i.e. mountpoint is not within an active zone and the - * zoned property is off), the label must be default or - * admin_low/admin_high only; no other checks are needed. - */ - mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); - if (mntzone->zone_id == GLOBAL_ZONEID) { - uint64_t zoned; - - zone_rele(mntzone); - - if (dsl_prop_get_integer(osname, - zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) - return (SET_ERROR(EACCES)); - if (!zoned) - return (zfs_check_global_label(osname, ds_hexsl)); - else - /* - * This is the case of a zone dataset being mounted - * initially, before the zone has been fully created; - * allow this mount into global zone. - */ - return (0); - } - - mnt_tsl = mntzone->zone_slabel; - ASSERT(mnt_tsl != NULL); - label_hold(mnt_tsl); - mnt_sl = label2bslabel(mnt_tsl); - - if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { - /* - * The dataset doesn't have a real label, so fabricate one. 
- */ - char *str = NULL; - - if (l_to_str_internal(mnt_sl, &str) == 0 && - dsl_prop_set_string(osname, - zfs_prop_to_name(ZFS_PROP_MLSLABEL), - ZPROP_SRC_LOCAL, str) == 0) - retv = 0; - if (str != NULL) - kmem_free(str, strlen(str) + 1); - } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { - /* - * Now compare labels to complete the MAC check. If the - * labels are equal then allow access. If the mountpoint - * label dominates the dataset label, allow readonly access. - * Otherwise, access is denied. - */ - if (blequal(mnt_sl, &ds_sl)) - retv = 0; - else if (bldominates(mnt_sl, &ds_sl)) { - vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); - retv = 0; - } - } - - label_rele(mnt_tsl); - zone_rele(mntzone); - return (retv); -} -#endif /* SECLABEL */ - -#ifdef OPENSOLARIS_MOUNTROOT -static int -zfs_mountroot(vfs_t *vfsp, enum whymountroot why) -{ - int error = 0; - static int zfsrootdone = 0; - zfsvfs_t *zfsvfs = NULL; - znode_t *zp = NULL; - vnode_t *vp = NULL; - char *zfs_bootfs; - char *zfs_devid; - - ASSERT(vfsp); - - /* - * The filesystem that we mount as root is defined in the - * boot property "zfs-bootfs" with a format of - * "poolname/root-dataset-objnum". - */ - if (why == ROOT_INIT) { - if (zfsrootdone++) - return (SET_ERROR(EBUSY)); - /* - * the process of doing a spa_load will require the - * clock to be set before we could (for example) do - * something better by looking at the timestamp on - * an uberblock, so just set it to -1. - */ - clkset(-1); - - if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { - cmn_err(CE_NOTE, "spa_get_bootfs: can not get " - "bootfs name"); - return (SET_ERROR(EINVAL)); - } - zfs_devid = spa_get_bootprop("diskdevid"); - error = spa_import_rootpool(rootfs.bo_name, zfs_devid); - if (zfs_devid) - spa_free_bootprop(zfs_devid); - if (error) { - spa_free_bootprop(zfs_bootfs); - cmn_err(CE_NOTE, "spa_import_rootpool: error %d", - error); - return (error); - } - if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { - spa_free_bootprop(zfs_bootfs); - cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", - error); - return (error); - } - - spa_free_bootprop(zfs_bootfs); - - if (error = vfs_lock(vfsp)) - return (error); - - if (error = zfs_domount(vfsp, rootfs.bo_name)) { - cmn_err(CE_NOTE, "zfs_domount: error %d", error); - goto out; - } - - zfsvfs = (zfsvfs_t *)vfsp->vfs_data; - ASSERT(zfsvfs); - if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { - cmn_err(CE_NOTE, "zfs_zget: error %d", error); - goto out; - } - - vp = ZTOV(zp); - mutex_enter(&vp->v_lock); - vp->v_flag |= VROOT; - mutex_exit(&vp->v_lock); - rootvp = vp; - - /* - * Leave rootvp held. The root file system is never unmounted. - */ - - vfs_add((struct vnode *)0, vfsp, - (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); -out: - vfs_unlock(vfsp); - return (error); - } else if (why == ROOT_REMOUNT) { - readonly_changed_cb(vfsp->vfs_data, B_FALSE); - vfsp->vfs_flag |= VFS_REMOUNT; - - /* refresh mount options */ - zfs_unregister_callbacks(vfsp->vfs_data); - return (zfs_register_callbacks(vfsp)); - - } else if (why == ROOT_UNMOUNT) { - zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); - (void) zfs_sync(vfsp, 0, 0); - return (0); - } - - /* - * if "why" is equal to anything else other than ROOT_INIT, - * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 
- */ - return (SET_ERROR(ENOTSUP)); -} -#endif /* OPENSOLARIS_MOUNTROOT */ - -static int -getpoolname(const char *osname, char *poolname) -{ - char *p; - - p = strchr(osname, '/'); - if (p == NULL) { - if (strlen(osname) >= MAXNAMELEN) - return (ENAMETOOLONG); - (void) strcpy(poolname, osname); - } else { - if (p - osname >= MAXNAMELEN) - return (ENAMETOOLONG); - (void) strncpy(poolname, osname, p - osname); - poolname[p - osname] = '\0'; - } - return (0); -} - -/*ARGSUSED*/ -static int -zfs_mount(vfs_t *vfsp) -{ - kthread_t *td = curthread; - vnode_t *mvp = vfsp->mnt_vnodecovered; - cred_t *cr = td->td_ucred; - char *osname; - int error = 0; - int canwrite; - -#ifdef illumos - if (mvp->v_type != VDIR) - return (SET_ERROR(ENOTDIR)); - - mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_REMOUNT) == 0 && - (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { - mutex_exit(&mvp->v_lock); - return (SET_ERROR(EBUSY)); - } - mutex_exit(&mvp->v_lock); - - /* - * ZFS does not support passing unparsed data in via MS_DATA. - * Users should use the MS_OPTIONSTR interface; this means - * that all option parsing is already done and the options struct - * can be interrogated. - */ - if ((uap->flags & MS_DATA) && uap->datalen > 0) - return (SET_ERROR(EINVAL)); - - /* - * Get the objset name (the "special" mount argument). - */ - if (error = pn_get(uap->spec, fromspace, &spn)) - return (error); - - osname = spn.pn_path; -#else /* !illumos */ - if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) - return (SET_ERROR(EINVAL)); - - /* - * If full-owner-access is enabled and delegated administration is - * turned on, we must set nosuid. - */ - if (zfs_super_owner && - dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { - secpolicy_fs_mount_clearopts(cr, vfsp); - } -#endif /* illumos */ - - /* - * Check for mount privilege? - * - * If we don't have privilege then see if - * we have local permission to allow it - */ - error = secpolicy_fs_mount(cr, mvp, vfsp); - if (error) { - if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) - goto out; - - if (!(vfsp->vfs_flag & MS_REMOUNT)) { - vattr_t vattr; - - /* - * Make sure user is the owner of the mount point - * or has sufficient privileges. - */ - - vattr.va_mask = AT_UID; - - vn_lock(mvp, LK_SHARED | LK_RETRY); - if (VOP_GETATTR(mvp, &vattr, cr)) { - VOP_UNLOCK(mvp); - goto out; - } - - if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && - VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { - VOP_UNLOCK(mvp); - goto out; - } - VOP_UNLOCK(mvp); - } - - secpolicy_fs_mount_clearopts(cr, vfsp); - } - - /* - * Refuse to mount a filesystem if we are in a local zone and the - * dataset is not visible. - */ - if (!INGLOBALZONE(curthread) && - (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { - error = SET_ERROR(EPERM); - goto out; - } - -#ifdef SECLABEL - error = zfs_mount_label_policy(vfsp, osname); - if (error) - goto out; -#endif - - vfsp->vfs_flag |= MNT_NFS4ACLS; - - /* - * When doing a remount, we simply refresh our temporary properties - * according to those options set in the current VFS options. - */ - if (vfsp->vfs_flag & MS_REMOUNT) { - zfsvfs_t *zfsvfs = vfsp->vfs_data; - - /* - * Refresh mount options with z_teardown_lock blocking I/O while - * the filesystem is in an inconsistent state. - * The lock also serializes this code with filesystem - * manipulations between entry to zfs_suspend_fs() and return - * from zfs_resume_fs(). 
- */ - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); - zfs_unregister_callbacks(zfsvfs); - error = zfs_register_callbacks(vfsp); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - goto out; - } - - /* Initial root mount: try hard to import the requested root pool. */ - if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && - (vfsp->vfs_flag & MNT_UPDATE) == 0) { - char pname[MAXNAMELEN]; - - error = getpoolname(osname, pname); - if (error == 0) - error = spa_import_rootpool(pname); - if (error) - goto out; - } - DROP_GIANT(); - error = zfs_domount(vfsp, osname); - PICKUP_GIANT(); - -#ifdef illumos - /* - * Add an extra VFS_HOLD on our parent vfs so that it can't - * disappear due to a forced unmount. - */ - if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) - VFS_HOLD(mvp->v_vfsp); -#endif - -out: - return (error); -} - -static int -zfs_statfs(vfs_t *vfsp, struct statfs *statp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - uint64_t refdbytes, availbytes, usedobjs, availobjs; - - statp->f_version = STATFS_VERSION; - - ZFS_ENTER(zfsvfs); - - dmu_objset_space(zfsvfs->z_os, - &refdbytes, &availbytes, &usedobjs, &availobjs); - - /* - * The underlying storage pool actually uses multiple block sizes. - * We report the fragsize as the smallest block size we support, - * and we report our blocksize as the filesystem's maximum blocksize. - */ - statp->f_bsize = SPA_MINBLOCKSIZE; - statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; - - /* - * The following report "total" blocks of various kinds in the - * file system, but reported in terms of f_frsize - the - * "fragment" size. - */ - - statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; - statp->f_bfree = availbytes / statp->f_bsize; - statp->f_bavail = statp->f_bfree; /* no root reservation */ - - /* - * statvfs() should really be called statufs(), because it assumes - * static metadata. ZFS doesn't preallocate files, so the best - * we can do is report the max that could possibly fit in f_files, - * and that minus the number actually used in f_ffree. - * For f_ffree, report the smaller of the number of object available - * and the number of blocks (each object will take at least a block). - */ - statp->f_ffree = MIN(availobjs, statp->f_bfree); - statp->f_files = statp->f_ffree + usedobjs; - - /* - * We're a zfs filesystem. - */ - (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); - - strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, - sizeof(statp->f_mntfromname)); - strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, - sizeof(statp->f_mntonname)); - - statp->f_namemax = MAXNAMELEN - 1; - - ZFS_EXIT(zfsvfs); - return (0); -} - -static int -zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *rootzp; - int error; - - ZFS_ENTER(zfsvfs); - - error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); - if (error == 0) - *vpp = ZTOV(rootzp); - - ZFS_EXIT(zfsvfs); - - if (error == 0) { - error = vn_lock(*vpp, flags); - if (error != 0) { - VN_RELE(*vpp); - *vpp = NULL; - } - } - return (error); -} - -/* - * Teardown the zfsvfs::z_os. - * - * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' - * and 'z_teardown_inactive_lock' held. 
- */ -static int -zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) -{ - znode_t *zp; - - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); - - if (!unmounting) { - /* - * We purge the parent filesystem's vfsp as the parent - * filesystem and all of its snapshots have their vnode's - * v_vfsp set to the parent's filesystem's vfsp. Note, - * 'z_parent' is self referential for non-snapshots. - */ - (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); -#ifdef FREEBSD_NAMECACHE - cache_purgevfs(zfsvfs->z_parent->z_vfs, true); -#endif - } - - /* - * Close the zil. NB: Can't close the zil while zfs_inactive - * threads are blocked as zil_close can call zfs_inactive. - */ - if (zfsvfs->z_log) { - zil_close(zfsvfs->z_log); - zfsvfs->z_log = NULL; - } - - ZFS_WLOCK_TEARDOWN_INACTIVE(zfsvfs); - - /* - * If we are not unmounting (ie: online recv) and someone already - * unmounted this file system while we were doing the switcheroo, - * or a reopen of z_os failed then just bail out now. - */ - if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { - ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - return (SET_ERROR(EIO)); - } - - /* - * At this point there are no vops active, and any new vops will - * fail with EIO since we have z_teardown_lock for writer (only - * relavent for forced unmount). - * - * Release all holds on dbufs. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; - zp = list_next(&zfsvfs->z_all_znodes, zp)) - if (zp->z_sa_hdl) { - ASSERT(ZTOV(zp)->v_count >= 0); - zfs_znode_dmu_fini(zp); - } - mutex_exit(&zfsvfs->z_znodes_lock); - - /* - * If we are unmounting, set the unmounted flag and let new vops - * unblock. zfs_inactive will have the unmounted behavior, and all - * other vops will fail with EIO. - */ - if (unmounting) { - zfsvfs->z_unmounted = B_TRUE; - ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - } - - /* - * z_os will be NULL if there was an error in attempting to reopen - * zfsvfs, so just return as the properties had already been - * unregistered and cached data had been evicted before. - */ - if (zfsvfs->z_os == NULL) - return (0); - - /* - * Unregister properties. - */ - zfs_unregister_callbacks(zfsvfs); - - /* - * Evict cached data - */ - if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && - !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - dmu_objset_evict_dbufs(zfsvfs->z_os); - - return (0); -} - -/*ARGSUSED*/ -static int -zfs_umount(vfs_t *vfsp, int fflag) -{ - kthread_t *td = curthread; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - objset_t *os; - cred_t *cr = td->td_ucred; - int ret; - - ret = secpolicy_fs_unmount(cr, vfsp); - if (ret) { - if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), - ZFS_DELEG_PERM_MOUNT, cr)) - return (ret); - } - - /* - * We purge the parent filesystem's vfsp as the parent filesystem - * and all of its snapshots have their vnode's v_vfsp set to the - * parent's filesystem's vfsp. Note, 'z_parent' is self - * referential for non-snapshots. - */ - (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); - - /* - * Unmount any snapshots mounted under .zfs before unmounting the - * dataset itself. - */ - if (zfsvfs->z_ctldir != NULL) { - if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) - return (ret); - } - - if (fflag & MS_FORCE) { - /* - * Mark file system as unmounted before calling - * vflush(FORCECLOSE). 
This way we ensure no future vnops - * will be called and risk operating on DOOMED vnodes. - */ - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); - zfsvfs->z_unmounted = B_TRUE; - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - } - - /* - * Flush all the files. - */ - ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); - if (ret != 0) - return (ret); - -#ifdef illumos - if (!(fflag & MS_FORCE)) { - /* - * Check the number of active vnodes in the file system. - * Our count is maintained in the vfs structure, but the - * number is off by 1 to indicate a hold on the vfs - * structure itself. - * - * The '.zfs' directory maintains a reference of its - * own, and any active references underneath are - * reflected in the vnode count. - */ - if (zfsvfs->z_ctldir == NULL) { - if (vfsp->vfs_count > 1) - return (SET_ERROR(EBUSY)); - } else { - if (vfsp->vfs_count > 2 || - zfsvfs->z_ctldir->v_count > 1) - return (SET_ERROR(EBUSY)); - } - } -#endif - - while (taskqueue_cancel(zfsvfs_taskq->tq_queue, - &zfsvfs->z_unlinked_drain_task, NULL) != 0) - taskqueue_drain(zfsvfs_taskq->tq_queue, - &zfsvfs->z_unlinked_drain_task); - - VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); - os = zfsvfs->z_os; - - /* - * z_os will be NULL if there was an error in - * attempting to reopen zfsvfs. - */ - if (os != NULL) { - /* - * Unset the objset user_ptr. - */ - mutex_enter(&os->os_user_ptr_lock); - dmu_objset_set_user(os, NULL); - mutex_exit(&os->os_user_ptr_lock); - - /* - * Finally release the objset - */ - dmu_objset_disown(os, zfsvfs); - } - - /* - * We can now safely destroy the '.zfs' directory node. - */ - if (zfsvfs->z_ctldir != NULL) - zfsctl_destroy(zfsvfs); - zfs_freevfs(vfsp); - - return (0); -} - -static int -zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; - int err; - - /* - * zfs_zget() can't operate on virtual entries like .zfs/ or - * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. - * This will make NFS to switch to LOOKUP instead of using VGET. - */ - if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || - (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) - return (EOPNOTSUPP); - - ZFS_ENTER(zfsvfs); - err = zfs_zget(zfsvfs, ino, &zp); - if (err == 0 && zp->z_unlinked) { - vrele(ZTOV(zp)); - err = EINVAL; - } - if (err == 0) - *vpp = ZTOV(zp); - ZFS_EXIT(zfsvfs); - if (err == 0) { - err = vn_lock(*vpp, flags); - if (err != 0) - vrele(*vpp); - } - if (err != 0) - *vpp = NULL; - return (err); -} - -static int -zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, - struct ucred **credanonp, int *numsecflavors, int *secflavors) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - - /* - * If this is regular file system vfsp is the same as - * zfsvfs->z_parent->z_vfs, but if it is snapshot, - * zfsvfs->z_parent->z_vfs represents parent file system - * which we have to use here, because only this file system - * has mnt_export configured. 
- */ - return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, - credanonp, numsecflavors, secflavors)); -} - -CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); -CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); - -static int -zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) -{ - struct componentname cn; - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; - vnode_t *dvp; - uint64_t object = 0; - uint64_t fid_gen = 0; - uint64_t gen_mask; - uint64_t zp_gen; - int i, err; - - *vpp = NULL; - - ZFS_ENTER(zfsvfs); - - /* - * On FreeBSD we can get snapshot's mount point or its parent file - * system mount point depending if snapshot is already mounted or not. - */ - if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { - zfid_long_t *zlfid = (zfid_long_t *)fidp; - uint64_t objsetid = 0; - uint64_t setgen = 0; - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); - - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); - - ZFS_EXIT(zfsvfs); - - err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); - if (err) - return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - } - - if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { - zfid_short_t *zfid = (zfid_short_t *)fidp; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); - - for (i = 0; i < sizeof (zfid->zf_gen); i++) - fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); - } else { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * A zero fid_gen means we are in .zfs or the .zfs/snapshot - * directory tree. If the object == zfsvfs->z_shares_dir, then - * we are in the .zfs/shares directory tree. - */ - if ((fid_gen == 0 && - (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || - (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { - ZFS_EXIT(zfsvfs); - VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); - if (object == ZFSCTL_INO_SNAPDIR) { - cn.cn_nameptr = "snapshot"; - cn.cn_namelen = strlen(cn.cn_nameptr); - cn.cn_nameiop = LOOKUP; - cn.cn_flags = ISLASTCN | LOCKLEAF; - cn.cn_lkflags = flags; - VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); - vput(dvp); - } else if (object == zfsvfs->z_shares_dir) { - /* - * XXX This branch must not be taken, - * if it is, then the lookup below will - * explode. - */ - cn.cn_nameptr = "shares"; - cn.cn_namelen = strlen(cn.cn_nameptr); - cn.cn_nameiop = LOOKUP; - cn.cn_flags = ISLASTCN; - cn.cn_lkflags = flags; - VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); - vput(dvp); - } else { - *vpp = dvp; - } - return (err); - } - - gen_mask = -1ULL >> (64 - 8 * i); - - dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); - if (err = zfs_zget(zfsvfs, object, &zp)) { - ZFS_EXIT(zfsvfs); - return (err); - } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, - sizeof (uint64_t)); - zp_gen = zp_gen & gen_mask; - if (zp_gen == 0) - zp_gen = 1; - if (zp->z_unlinked || zp_gen != fid_gen) { - dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); - vrele(ZTOV(zp)); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - *vpp = ZTOV(zp); - ZFS_EXIT(zfsvfs); - err = vn_lock(*vpp, flags); - if (err == 0) - vnode_create_vobject(*vpp, zp->z_size, curthread); - else - *vpp = NULL; - return (err); -} - -/* - * Block out VOPs and close zfsvfs_t::z_os - * - * Note, if successful, then we return with the 'z_teardown_lock' and - * 'z_teardown_inactive_lock' write held. 
We leave ownership of the underlying - * dataset and objset intact so that they can be atomically handed off during - * a subsequent rollback or recv operation and the resume thereafter. - */ -int -zfs_suspend_fs(zfsvfs_t *zfsvfs) -{ - int error; - - if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) - return (error); - - return (0); -} - -/* - * Rebuild SA and release VOPs. Note that ownership of the underlying dataset - * is an invariant across any of the operations that can be performed while the - * filesystem was suspended. Whether it succeeded or failed, the preconditions - * are the same: the relevant objset and associated dataset are owned by - * zfsvfs, held, and long held on entry. - */ -int -zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) -{ - int err; - znode_t *zp; - - ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); - ASSERT(ZFS_TEARDOWN_INACTIVE_WLOCKED(zfsvfs)); - - /* - * We already own this, so just update the objset_t, as the one we - * had before may have been evicted. - */ - objset_t *os; - VERIFY3P(ds->ds_owner, ==, zfsvfs); - VERIFY(dsl_dataset_long_held(ds)); - VERIFY0(dmu_objset_from_ds(ds, &os)); - - err = zfsvfs_init(zfsvfs, os); - if (err != 0) - goto bail; - - VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); - - zfs_set_fuid_feature(zfsvfs); - - /* - * Attempt to re-establish all the active znodes with - * their dbufs. If a zfs_rezget() fails, then we'll let - * any potential callers discover that via ZFS_ENTER_VERIFY_VP - * when they try to use their znode. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp; - zp = list_next(&zfsvfs->z_all_znodes, zp)) { - (void) zfs_rezget(zp); - } - mutex_exit(&zfsvfs->z_znodes_lock); - -bail: - /* release the VOPs */ - ZFS_WUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); - - if (err) { - /* - * Since we couldn't setup the sa framework, try to force - * unmount this file system. - */ - if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { - vfs_ref(zfsvfs->z_vfs); - (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); - } - } - return (err); -} - -static void -zfs_freevfs(vfs_t *vfsp) -{ - zfsvfs_t *zfsvfs = vfsp->vfs_data; - -#ifdef illumos - /* - * If this is a snapshot, we have an extra VFS_HOLD on our parent - * from zfs_mount(). Release it here. If we came through - * zfs_mountroot() instead, we didn't grab an extra hold, so - * skip the VFS_RELE for rootvfs. - */ - if (zfsvfs->z_issnap && (vfsp != rootvfs)) - VFS_RELE(zfsvfs->z_parent->z_vfs); -#endif - - zfsvfs_free(zfsvfs); - - atomic_dec_32(&zfs_active_fs_count); -} - -#ifdef __i386__ -static int desiredvnodes_backup; -#endif - -static void -zfs_vnodes_adjust(void) -{ -#ifdef __i386__ - int newdesiredvnodes; - - desiredvnodes_backup = desiredvnodes; - - /* - * We calculate newdesiredvnodes the same way it is done in - * vntblinit(). If it is equal to desiredvnodes, it means that - * it wasn't tuned by the administrator and we can tune it down. 
- */ - newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * - vm_kmem_size / (5 * (sizeof(struct vm_object) + - sizeof(struct vnode)))); - if (newdesiredvnodes == desiredvnodes) - desiredvnodes = (3 * newdesiredvnodes) / 4; -#endif -} - -static void -zfs_vnodes_adjust_back(void) -{ - -#ifdef __i386__ - desiredvnodes = desiredvnodes_backup; -#endif -} - -void -zfs_init(void) -{ - - printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); - - /* - * Initialize .zfs directory structures - */ - zfsctl_init(); - - /* - * Initialize znode cache, vnode ops, etc... - */ - zfs_znode_init(); - - /* - * Reduce number of vnodes. Originally number of vnodes is calculated - * with UFS inode in mind. We reduce it here, because it's too big for - * ZFS/i386. - */ - zfs_vnodes_adjust(); - - dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); -#if defined(__FreeBSD__) - zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); -#endif -} - -void -zfs_fini(void) -{ -#if defined(__FreeBSD__) - taskq_destroy(zfsvfs_taskq); -#endif - zfsctl_fini(); - zfs_znode_fini(); - zfs_vnodes_adjust_back(); -} - -int -zfs_busy(void) -{ - return (zfs_active_fs_count != 0); -} - -int -zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) -{ - int error; - objset_t *os = zfsvfs->z_os; - dmu_tx_t *tx; - - if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) - return (SET_ERROR(EINVAL)); - - if (newvers < zfsvfs->z_version) - return (SET_ERROR(EINVAL)); - - if (zfs_spa_version_map(newvers) > - spa_version(dmu_objset_spa(zfsvfs->z_os))) - return (SET_ERROR(ENOTSUP)); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); - if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, - ZFS_SA_ATTRS); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - } - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &newvers, tx); - - if (error) { - dmu_tx_commit(tx); - return (error); - } - - if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { - uint64_t sa_obj; - - ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, - SPA_VERSION_SA); - sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, - DMU_OT_NONE, 0, tx); - - error = zap_add(os, MASTER_NODE_OBJ, - ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); - ASSERT0(error); - - VERIFY(0 == sa_set_sa_object(os, sa_obj)); - sa_register_update_callback(os, zfs_sa_upgrade); - } - - spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, - "from %llu to %llu", zfsvfs->z_version, newvers); - - dmu_tx_commit(tx); - - zfsvfs->z_version = newvers; - os->os_version = newvers; - - zfs_set_fuid_feature(zfsvfs); - - return (0); -} - -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. 
- */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. - */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) { - pname = ZPL_VERSION_STR; - } else { - pname = zfs_prop_to_name(prop); - } - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - -/* - * Return true if the coresponding vfs's unmounted flag is set. - * Otherwise return false. - * If this function returns true we know VFS unmount has been initiated. - */ -boolean_t -zfs_get_vfs_flag_unmounted(objset_t *os) -{ - zfsvfs_t *zfvp; - boolean_t unmounted = B_FALSE; - - ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); - - mutex_enter(&os->os_user_ptr_lock); - zfvp = dmu_objset_get_user(os); - if (zfvp != NULL && zfvp->z_vfs != NULL && - (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) - unmounted = B_TRUE; - mutex_exit(&os->os_user_ptr_lock); - - return (unmounted); -} - -#ifdef _KERNEL -void -zfsvfs_update_fromname(const char *oldname, const char *newname) -{ - char tmpbuf[MAXPATHLEN]; - struct mount *mp; - char *fromname; - size_t oldlen; - - oldlen = strlen(oldname); - - mtx_lock(&mountlist_mtx); - TAILQ_FOREACH(mp, &mountlist, mnt_list) { - fromname = mp->mnt_stat.f_mntfromname; - if (strcmp(fromname, oldname) == 0) { - (void)strlcpy(fromname, newname, - sizeof(mp->mnt_stat.f_mntfromname)); - continue; - } - if (strncmp(fromname, oldname, oldlen) == 0 && - (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { - (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", - newname, fromname + oldlen); - (void)strlcpy(fromname, tmpbuf, - sizeof(mp->mnt_stat.f_mntfromname)); - continue; - } - } - mtx_unlock(&mountlist_mtx); -} -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ /dev/null @@ -1,6124 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Nexenta Systems, Inc. - */ - -/* Portions Copyright 2007 Jeremy Teo */ -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -VFS_SMR_DECLARE; - -/* - * Programming rules. - * - * Each vnode op performs some logical unit of work. To do this, the ZPL must - * properly lock its in-core state, create a DMU transaction, do the work, - * record this work in the intent log (ZIL), commit the DMU transaction, - * and wait for the intent log to commit if it is a synchronous operation. - * Moreover, the vnode ops must work in both normal and log replay context. - * The ordering of events is important to avoid deadlocks and references - * to freed memory. The example below illustrates the following Big Rules: - * - * (1) A check must be made in each zfs thread for a mounted file system. - * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros - * can return EIO from the calling function. - * - * (2) VN_RELE() should always be the last thing except for zil_commit() - * (if necessary) and ZFS_EXIT(). This is for 3 reasons: - * First, if it's the last reference, the vnode/znode - * can be freed, so the zp may point to freed memory. Second, the last - * reference will call zfs_zinactive(), which may induce a lot of work -- - * pushing cached pages (which acquires range locks) and syncing out - * cached atime changes. Third, zfs_zinactive() may require a new tx, - * which could deadlock the system if you were already holding one. - * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). - * - * (3) All range locks must be grabbed before calling dmu_tx_assign(), - * as they can span dmu_tx_assign() calls. - * - * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to - * dmu_tx_assign(). This is critical because we don't want to block - * while holding locks. - * - * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This - * reduces lock contention and CPU usage when we must wait (note that if - * throughput is constrained by the storage, nearly every transaction - * must wait). - * - * Note, in particular, that if a lock is sometimes acquired before - * the tx assigns, and sometimes after (e.g. z_lock), then failing - * to use a non-blocking assign can deadlock the system. The scenario: - * - * Thread A has grabbed a lock before calling dmu_tx_assign(). 
- * Thread B is in an already-assigned tx, and blocks for this lock. - * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() - * forever, because the previous txg can't quiesce until B's tx commits. - * - * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, - * then drop all locks, call dmu_tx_wait(), and try again. On subsequent - * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, - * to indicate that this operation has already called dmu_tx_wait(). - * This will ensure that we don't retry forever, waiting a short bit - * each time. - * - * (5) If the operation succeeded, generate the intent log entry for it - * before dropping locks. This ensures that the ordering of events - * in the intent log matches the order in which they actually occurred. - * During ZIL replay the zfs_log_* functions will update the sequence - * number to indicate the zil transaction has replayed. - * - * (6) At the end of each vnode op, the DMU tx must always commit, - * regardless of whether there were any errors. - * - * (7) After dropping all locks, invoke zil_commit(zilog, foid) - * to ensure that synchronous semantics are provided when necessary. - * - * In general, this is how things should be ordered in each vnode op: - * - * ZFS_ENTER(zfsvfs); // exit if unmounted - * top: - * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) - * rw_enter(...); // grab any other locks you need - * tx = dmu_tx_create(...); // get DMU tx - * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); - * if (error) { - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * VN_RELE(...); // release held vnodes - * if (error == ERESTART) { - * waited = B_TRUE; - * dmu_tx_wait(tx); - * dmu_tx_abort(tx); - * goto top; - * } - * dmu_tx_abort(tx); // abort DMU tx - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // really out of space - * } - * error = do_real_work(); // do whatever this VOP does - * if (error == 0) - * zfs_log_*(...); // on success, make ZIL entry - * dmu_tx_commit(tx); // commit DMU tx -- error or not - * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl); // unlock directory entry - * VN_RELE(...); // release held vnodes - * zil_commit(zilog, foid); // synchronous when necessary - * ZFS_EXIT(zfsvfs); // finished in zfs - * return (error); // done, report error - */ - -/* ARGSUSED */ -static int -zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(*vpp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && - ((flag & FAPPEND) == 0)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && - ZTOV(zp)->v_type == VREG && - !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { - if (fs_vscan(*vpp, cr, 0) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - } - - /* Keep a count of the synchronous opens in the znode */ - if (flag & (FSYNC | FDSYNC)) - atomic_inc_32(&zp->z_sync_cnt); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* ARGSUSED */ -static int -zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - /* - * Clean up any locks held by this process on the vp. 
- */ - cleanlocks(vp, ddi_get_pid(), 0); - cleanshares(vp, ddi_get_pid()); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* Decrement the synchronous opens in the znode */ - if ((flag & (FSYNC | FDSYNC)) && (count == 1)) - atomic_dec_32(&zp->z_sync_cnt); - - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && - ZTOV(zp)->v_type == VREG && - !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) - VERIFY(fs_vscan(vp, cr, 1) == 0); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and - * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. - */ -static int -zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) -{ - znode_t *zp = VTOZ(vp); - uint64_t noff = (uint64_t)*off; /* new offset */ - uint64_t file_sz; - int error; - boolean_t hole; - - file_sz = zp->z_size; - if (noff >= file_sz) { - return (SET_ERROR(ENXIO)); - } - - if (cmd == _FIO_SEEK_HOLE) - hole = B_TRUE; - else - hole = B_FALSE; - - error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); - - if (error == ESRCH) - return (SET_ERROR(ENXIO)); - - /* - * We could find a hole that begins after the logical end-of-file, - * because dmu_offset_next() only works on whole blocks. If the - * EOF falls mid-block, then indicate that the "virtual hole" - * at the end of the file begins at the logical EOF, rather than - * at the end of the last block. - */ - if (noff > file_sz) { - ASSERT(hole); - noff = file_sz; - } - - if (noff < *off) - return (error); - *off = noff; - return (error); -} - -/* ARGSUSED */ -static int -zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, - int *rvalp, caller_context_t *ct) -{ - offset_t off; - offset_t ndata; - dmu_object_info_t doi; - int error; - zfsvfs_t *zfsvfs; - znode_t *zp; - - switch (com) { - case _FIOFFS: - { - return (0); - - /* - * The following two ioctls are used by bfu. Faking out, - * necessary to avoid bfu errors. - */ - } - case _FIOGDIO: - case _FIOSDIO: - { - return (0); - } - - case _FIO_SEEK_DATA: - case _FIO_SEEK_HOLE: - { -#ifdef illumos - if (ddi_copyin((void *)data, &off, sizeof (off), flag)) - return (SET_ERROR(EFAULT)); -#else - off = *(offset_t *)data; -#endif - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* offset parameter is in/out */ - error = zfs_holey(vp, com, &off); - ZFS_EXIT(zfsvfs); - if (error) - return (error); -#ifdef illumos - if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) - return (SET_ERROR(EFAULT)); -#else - *(offset_t *)data = off; -#endif - return (0); - } -#ifdef illumos - case _FIO_COUNT_FILLED: - { - /* - * _FIO_COUNT_FILLED adds a new ioctl command which - * exposes the number of filled blocks in a - * ZFS object. - */ - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * Wait for all dirty blocks for this object - * to get synced out to disk, and the DMU info - * updated. - */ - error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Retrieve fill count from DMU object. 
- */ - error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - ndata = doi.doi_fill_count; - - ZFS_EXIT(zfsvfs); - if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) - return (SET_ERROR(EFAULT)); - return (0); - } -#endif - } - return (SET_ERROR(ENOTTY)); -} - -static vm_page_t -page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) -{ - vm_object_t obj; - vm_page_t pp; - int64_t end; - - /* - * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE - * aligned boundaries, if the range is not aligned. As a result a - * DEV_BSIZE subrange with partially dirty data may get marked as clean. - * It may happen that all DEV_BSIZE subranges are marked clean and thus - * the whole page would be considred clean despite have some dirty data. - * For this reason we should shrink the range to DEV_BSIZE aligned - * boundaries before calling vm_page_clear_dirty. - */ - end = rounddown2(off + nbytes, DEV_BSIZE); - off = roundup2(off, DEV_BSIZE); - nbytes = end - off; - - obj = vp->v_object; - - vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start), - VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | - VM_ALLOC_IGN_SBUSY); - if (pp != NULL) { - ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); - vm_object_pip_add(obj, 1); - pmap_remove_write(pp); - if (nbytes != 0) - vm_page_clear_dirty(pp, off, nbytes); - } - return (pp); -} - -static void -page_unbusy(vm_page_t pp) -{ - - vm_page_sunbusy(pp); - vm_object_pip_wakeup(pp->object); -} - -static vm_page_t -page_wire(vnode_t *vp, int64_t start) -{ - vm_object_t obj; - vm_page_t m; - - obj = vp->v_object; - vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start), - VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | - VM_ALLOC_NOBUSY); - return (m); -} - -static void -page_unwire(vm_page_t pp) -{ - - vm_page_unwire(pp, PQ_ACTIVE); -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. - */ -static void -update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, - int segflg, dmu_tx_t *tx) -{ - vm_object_t obj; - struct sf_buf *sf; - caddr_t va; - int off; - - ASSERT(segflg != UIO_NOCOPY); - ASSERT(vp->v_mount != NULL); - obj = vp->v_object; - ASSERT(obj != NULL); - - off = start & PAGEOFFSET; - vm_object_pip_add(obj, 1); - for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - vm_page_t pp; - int nbytes = imin(PAGESIZE - off, len); - - if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { - va = zfs_map_page(pp, &sf); - (void) dmu_read(os, oid, start+off, nbytes, - va+off, DMU_READ_PREFETCH);; - zfs_unmap_page(sf); - page_unbusy(pp); - } - len -= nbytes; - off = 0; - } - vm_object_pip_wakeup(obj); -} - -/* - * Read with UIO_NOCOPY flag means that sendfile(2) requests - * ZFS to populate a range of page cache pages with data. - * - * NOTE: this function could be optimized to pre-allocate - * all pages in advance, drain exclusive busy on all of them, - * map them into contiguous KVA region and populate them - * in one single dmu_read() call. 
- */ -static int -mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) -{ - znode_t *zp = VTOZ(vp); - objset_t *os = zp->z_zfsvfs->z_os; - struct sf_buf *sf; - vm_object_t obj; - vm_page_t pp; - int64_t start; - caddr_t va; - int len = nbytes; - int off; - int error = 0; - - ASSERT(uio->uio_segflg == UIO_NOCOPY); - ASSERT(vp->v_mount != NULL); - obj = vp->v_object; - ASSERT(obj != NULL); - ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); - - for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { - int bytes = MIN(PAGESIZE, len); - - pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start), - VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); - if (vm_page_none_valid(pp)) { - va = zfs_map_page(pp, &sf); - error = dmu_read(os, zp->z_id, start, bytes, va, - DMU_READ_PREFETCH); - if (bytes != PAGESIZE && error == 0) - bzero(va + bytes, PAGESIZE - bytes); - zfs_unmap_page(sf); - if (error == 0) { - vm_page_valid(pp); - vm_page_activate(pp); - vm_page_sunbusy(pp); - } else { - zfs_vmobject_wlock(obj); - if (!vm_page_wired(pp) && pp->valid == 0 && - vm_page_busy_tryupgrade(pp)) - vm_page_free(pp); - else - vm_page_sunbusy(pp); - zfs_vmobject_wunlock(obj); - } - } else { - ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); - vm_page_sunbusy(pp); - } - if (error) - break; - uio->uio_resid -= bytes; - uio->uio_offset += bytes; - len -= bytes; - } - return (error); -} - -/* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. - */ -static int -mappedread(vnode_t *vp, int nbytes, uio_t *uio) -{ - znode_t *zp = VTOZ(vp); - vm_object_t obj; - int64_t start; - caddr_t va; - int len = nbytes; - int off; - int error = 0; - - ASSERT(vp->v_mount != NULL); - obj = vp->v_object; - ASSERT(obj != NULL); - - start = uio->uio_loffset; - off = start & PAGEOFFSET; - for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - vm_page_t pp; - uint64_t bytes = MIN(PAGESIZE - off, len); - - if (pp = page_wire(vp, start)) { - struct sf_buf *sf; - caddr_t va; - - va = zfs_map_page(pp, &sf); -#ifdef illumos - error = uiomove(va + off, bytes, UIO_READ, uio); -#else - error = vn_io_fault_uiomove(va + off, bytes, uio); -#endif - zfs_unmap_page(sf); - page_unwire(pp); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, bytes); - } - len -= bytes; - off = 0; - if (error) - break; - } - return (error); -} - -offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ - -/* - * Read bytes from specified file into supplied buffer. - * - * IN: vp - vnode of file to be read from. - * uio - structure supplying read location, range info, - * and return buffer. - * ioflag - SYNC flags; used to provide FRSYNC semantics. - * cr - credentials of caller. - * ct - caller context - * - * OUT: uio - updated offset and range, buffer filled. - * - * RETURN: 0 on success, error code on failure. - * - * Side Effects: - * vp - atime updated if byte count > 0 - */ -/* ARGSUSED */ -static int -zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ssize_t n, nbytes; - int error = 0; - xuio_t *xuio = NULL; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* We don't copy out anything useful for directories. 
*/ - if (vp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EISDIR)); - } - - if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - /* - * Validate file offset - */ - if (uio->uio_loffset < (offset_t)0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Fasttrack empty reads - */ - if (uio->uio_resid == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * Check for mandatory locks - */ - if (MANDMODE(zp->z_mode)) { - if (error = chklock(vp, FREAD, - uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - /* - * If we're in FRSYNC mode, sync out this znode before reading it. - */ - if (zfsvfs->z_log && - (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) - zil_commit(zfsvfs->z_log, zp->z_id); - - /* - * Lock the range against changes. - */ - locked_range_t *lr = rangelock_enter(&zp->z_rangelock, - uio->uio_loffset, uio->uio_resid, RL_READER); - - /* - * If we are reading past end-of-file we can skip - * to the end; but we might still need to set atime. - */ - if (uio->uio_loffset >= zp->z_size) { - error = 0; - goto out; - } - - ASSERT(uio->uio_loffset < zp->z_size); - n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); - -#ifdef illumos - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { - int nblk; - int blksz = zp->z_blksz; - uint64_t offset = uio->uio_loffset; - - xuio = (xuio_t *)uio; - if ((ISP2(blksz))) { - nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, - blksz)) / blksz; - } else { - ASSERT(offset + n <= blksz); - nblk = 1; - } - (void) dmu_xuio_init(xuio, nblk); - - if (vn_has_cached_data(vp)) { - /* - * For simplicity, we always allocate a full buffer - * even if we only expect to read a portion of a block. - */ - while (--nblk >= 0) { - (void) dmu_xuio_add(xuio, - dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz), 0, blksz); - } - } - } -#endif /* illumos */ - - while (n > 0) { - nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - -#ifdef __FreeBSD__ - if (uio->uio_segflg == UIO_NOCOPY) - error = mappedread_sf(vp, nbytes, uio); - else -#endif /* __FreeBSD__ */ - if (vn_has_cached_data(vp)) { - error = mappedread(vp, nbytes, uio); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes); - } - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - - n -= nbytes; - } -out: - rangelock_exit(lr); - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Write the bytes to a file. - * - * IN: vp - vnode of file to be written to. - * uio - structure supplying write location, range info, - * and data buffer. - * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is - * set if in append mode. - * cr - credentials of caller. - * ct - caller context (NFS/CIFS fem monitor only) - * - * OUT: uio - updated offset and range. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * vp - ctime|mtime updated if byte count > 0 - */ - -/* ARGSUSED */ -static int -zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - rlim64_t limit = MAXOFFSET_T; - ssize_t start_resid = uio->uio_resid; - ssize_t tx_bytes; - uint64_t end_size; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog; - offset_t woff; - ssize_t n, nbytes; - int max_blksz = zfsvfs->z_max_blksz; - int error = 0; - arc_buf_t *abuf; - iovec_t *aiov = NULL; - xuio_t *xuio = NULL; - int i_iov = 0; - int iovcnt = uio->uio_iovcnt; - iovec_t *iovp = uio->uio_iov; - int write_eof; - int count = 0; - sa_bulk_attr_t bulk[4]; - uint64_t mtime[2], ctime[2]; - - /* - * Fasttrack empty write - */ - n = start_resid; - if (n == 0) - return (0); - - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* - * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our - * callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - /* - * If immutable or not appending then return EPERM. - * Intentionally allow ZFS_READONLY through here. - * See zfs_zaccess_common() - */ - if ((zp->z_pflags & ZFS_IMMUTABLE) || - ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_size))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - zilog = zfsvfs->z_log; - - /* - * Validate file offset - */ - woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Check for mandatory locks before calling rangelock_enter() - * in order to prevent a deadlock with locks set via fcntl(). - */ - if (MANDMODE((mode_t)zp->z_mode) && - (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - -#ifdef illumos - /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else - uio_prefaultpages(MIN(n, max_blksz), uio); -#endif - - /* - * If in append mode, set the io offset pointer to eof. - */ - locked_range_t *lr; - if (ioflag & FAPPEND) { - /* - * Obtain an appending range lock to guarantee file append - * semantics. We reset the write offset once we have the lock. - */ - lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); - woff = lr->lr_offset; - if (lr->lr_length == UINT64_MAX) { - /* - * We overlocked the file because this write will cause - * the file block size to increase. - * Note that zp_size cannot change with this lock held. - */ - woff = zp->z_size; - } - uio->uio_loffset = woff; - } else { - /* - * Note that if the file block size will change as a result of - * this write, then this range lock will lock the entire file - * so that we can re-write the block safely. 
- */ - lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); - } - - if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (EFBIG); - } - - if (woff >= limit) { - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFBIG)); - } - - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; - - /* Will this write extend the file length? */ - write_eof = (woff + n > zp->z_size); - - end_size = MAX(zp->z_size, woff + n); - - /* - * Write the file in reasonable size chunks. Each chunk is written - * in a separate transaction; this keeps the intent log records small - * and allows us to do more fine-grained space accounting. - */ - while (n > 0) { - abuf = NULL; - woff = uio->uio_loffset; - if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || - zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { - if (abuf != NULL) - dmu_return_arcbuf(abuf); - error = SET_ERROR(EDQUOT); - break; - } - - if (xuio && abuf == NULL) { - ASSERT(i_iov < iovcnt); - aiov = &iovp[i_iov]; - abuf = dmu_xuio_arcbuf(xuio, i_iov); - dmu_xuio_clear(xuio, i_iov); - DTRACE_PROBE3(zfs_cp_write, int, i_iov, - iovec_t *, aiov, arc_buf_t *, abuf); - ASSERT((aiov->iov_base == abuf->b_data) || - ((char *)aiov->iov_base - (char *)abuf->b_data + - aiov->iov_len == arc_buf_size(abuf))); - i_iov++; - } else if (abuf == NULL && n >= max_blksz && - woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { - /* - * This write covers a full block. "Borrow" a buffer - * from the dmu so that we can fill it before we enter - * a transaction. This avoids the possibility of - * holding up the transaction if the data copy hangs - * up on a pagefault (e.g., from an NFS server mapping). - */ - size_t cbytes; - - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); - ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if (error = uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes)) { - dmu_return_arcbuf(abuf); - break; - } - ASSERT(cbytes == max_blksz); - } - - /* - * Start a transaction. - */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - if (abuf != NULL) - dmu_return_arcbuf(abuf); - break; - } - - /* - * If rangelock_enter() over-locked we grow the blocksize - * and then reduce the lock range. This will only happen - * on the first iteration since rangelock_reduce() will - * shrink down lr_length to the appropriate size. - */ - if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - rangelock_reduce(lr, woff, n); - } - - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
-		 */
-		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
-		if (woff + nbytes > zp->z_size)
-			vnode_pager_setsize(vp, woff + nbytes);
-
-		if (abuf == NULL) {
-			tx_bytes = uio->uio_resid;
-			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
-			    uio, nbytes, tx);
-			tx_bytes -= uio->uio_resid;
-		} else {
-			tx_bytes = nbytes;
-			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
-			/*
-			 * If this is not a full block write, but we are
-			 * extending the file past EOF and this data starts
-			 * block-aligned, use assign_arcbuf().  Otherwise,
-			 * write via dmu_write().
-			 */
-			if (tx_bytes < max_blksz && (!write_eof ||
-			    aiov->iov_base != abuf->b_data)) {
-				ASSERT(xuio);
-				dmu_write(zfsvfs->z_os, zp->z_id, woff,
-				    aiov->iov_len, aiov->iov_base, tx);
-				dmu_return_arcbuf(abuf);
-				xuio_stat_wbuf_copied();
-			} else {
-				ASSERT(xuio || tx_bytes == max_blksz);
-				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
-				    woff, abuf, tx);
-			}
-			ASSERT(tx_bytes <= uio->uio_resid);
-			uioskip(uio, tx_bytes);
-		}
-		if (tx_bytes && vn_has_cached_data(vp)) {
-			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
-			    zp->z_id, uio->uio_segflg, tx);
-		}
-
-		/*
-		 * If we made no progress, we're done.  If we made even
-		 * partial progress, update the znode and ZIL accordingly.
-		 */
-		if (tx_bytes == 0) {
-			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
-			    (void *)&zp->z_size, sizeof (uint64_t), tx);
-			dmu_tx_commit(tx);
-			ASSERT(error != 0);
-			break;
-		}
-
-		/*
-		 * Clear Set-UID/Set-GID bits on successful write if not
-		 * privileged and at least one of the execute bits is set.
-		 *
-		 * It would be nice to do this after all writes have
-		 * been done, but that would still expose the ISUID/ISGID
-		 * to another app after the partial write is committed.
-		 *
-		 * Note: we don't call zfs_fuid_map_id() here because
-		 * user 0 is not an ephemeral uid.
-		 */
-		mutex_enter(&zp->z_acl_lock);
-		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
-		    (S_IXUSR >> 6))) != 0 &&
-		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
-		    secpolicy_vnode_setid_retain(vp, cr,
-		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
-			uint64_t newmode;
-			zp->z_mode &= ~(S_ISUID | S_ISGID);
-			newmode = zp->z_mode;
-			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
-			    (void *)&newmode, sizeof (uint64_t), tx);
-		}
-		mutex_exit(&zp->z_acl_lock);
-
-		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
-		    B_TRUE);
-
-		/*
-		 * Update the file size (zp_size) if it has changed;
-		 * account for possible concurrent updates.
-		 */
-		while ((end_size = zp->z_size) < uio->uio_loffset) {
-			(void) atomic_cas_64(&zp->z_size, end_size,
-			    uio->uio_loffset);
-#ifdef illumos
-			ASSERT(error == 0);
-#else
-			ASSERT(error == 0 || error == EFAULT);
-#endif
-		}
-		/*
-		 * If we are replaying and eof is non zero then force
-		 * the file size to the specified eof.  Note, there's no
-		 * concurrency during replay.
-		 */
-		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
-			zp->z_size = zfsvfs->z_replay_eof;
-
-		if (error == 0)
-			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-		else
-			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-
-		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
-		dmu_tx_commit(tx);
-
-		if (error != 0)
-			break;
-		ASSERT(tx_bytes == nbytes);
-		n -= nbytes;
-
-#ifdef illumos
-		if (!xuio && n > 0)
-			uio_prefaultpages(MIN(n, max_blksz), uio);
-#endif
-	}
-
-	rangelock_exit(lr);
-
-	/*
-	 * If we're in replay mode, or we made no progress, return error.
-	 * Otherwise, it's at least a partial write, so it's successful.
- */ - if (zfsvfs->z_replay || uio->uio_resid == start_resid) { - ZFS_EXIT(zfsvfs); - return (error); - } - -#ifdef __FreeBSD__ - /* - * EFAULT means that at least one page of the source buffer was not - * available. VFS will re-try remaining I/O upon this error. - */ - if (error == EFAULT) { - ZFS_EXIT(zfsvfs); - return (error); - } -#endif - - if (ioflag & (FSYNC | FDSYNC) || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, zp->z_id); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* ARGSUSED */ -void -zfs_get_done(zgd_t *zgd, int error) -{ - znode_t *zp = zgd->zgd_private; - objset_t *os = zp->z_zfsvfs->z_os; - - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); - - rangelock_exit(zgd->zgd_lr); - - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); - - kmem_free(zgd, sizeof (zgd_t)); -} - -#ifdef DEBUG -static int zil_fault_io = 0; -#endif - -/* - * Get data to generate a TX_WRITE intent log record. - */ -int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) -{ - zfsvfs_t *zfsvfs = arg; - objset_t *os = zfsvfs->z_os; - znode_t *zp; - uint64_t object = lr->lr_foid; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; - dmu_buf_t *db; - zgd_t *zgd; - int error = 0; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - /* - * Nothing to do if the file has been removed - */ - if (zfs_zget(zfsvfs, object, &zp) != 0) - return (SET_ERROR(ENOENT)); - if (zp->z_unlinked) { - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - VN_RELE_ASYNC(ZTOV(zp), - dsl_pool_vnrele_taskq(dmu_objset_pool(os))); - return (SET_ERROR(ENOENT)); - } - - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_lwb = lwb; - zgd->zgd_private = zp; - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - /* test for truncation needs to be done while range locked */ - if (offset >= zp->z_size) { - error = SET_ERROR(ENOENT); - } else { - error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); - } - ASSERT(error == 0 || error == ENOENT); - } else { /* indirect write */ - /* - * Have to lock the whole block to ensure when it's - * written out and its checksum is being calculated - * that no one can change the data. We need to re-check - * blocksize after we get the lock in case it's changed! - */ - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; - offset -= blkoff; - zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - rangelock_exit(zgd->zgd_lr); - } - /* test for truncation needs to be done while range locked */ - if (lr->lr_offset >= zp->z_size) - error = SET_ERROR(ENOENT); -#ifdef DEBUG - if (zil_fault_io) { - error = SET_ERROR(EIO); - zil_fault_io = 0; - } -#endif - if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); - - /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. - */ - if (error == 0) - return (0); - - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. We zero out the BP because - * it is the old, currently-on-disk BP. - */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; - } - } - } - - zfs_get_done(zgd, error); - - return (error); -} - -/*ARGSUSED*/ -static int -zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); - else - error = zfs_zaccess_rwx(zp, mode, flag, cr); - - ZFS_EXIT(zfsvfs); - return (error); -} - -static int -zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) -{ - int error; - - *vpp = arg; - error = vn_lock(*vpp, lkflags); - if (error != 0) - vrele(*vpp); - return (error); -} - -static int -zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) -{ - znode_t *zdp = VTOZ(dvp); - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error; - int ltype; - - ASSERT_VOP_LOCKED(dvp, __func__); -#ifdef DIAGNOSTIC - if ((zdp->z_pflags & ZFS_XATTR) == 0) - VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); -#endif - - if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - ASSERT3P(dvp, ==, vp); - vref(dvp); - ltype = lkflags & LK_TYPE_MASK; - if (ltype != VOP_ISLOCKED(dvp)) { - if (ltype == LK_EXCLUSIVE) - vn_lock(dvp, LK_UPGRADE | LK_RETRY); - else /* if (ltype == LK_SHARED) */ - vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); - - /* - * Relock for the "." case could leave us with - * reclaimed vnode. - */ - if (VN_IS_DOOMED(dvp)) { - vrele(dvp); - return (SET_ERROR(ENOENT)); - } - } - return (0); - } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { - /* - * Note that in this case, dvp is the child vnode, and we - * are looking up the parent vnode - exactly reverse from - * normal operation. Unlocking dvp requires some rather - * tricky unlock/relock dance to prevent mp from being freed; - * use vn_vget_ino_gen() which takes care of all that. - * - * XXX Note that there is a time window when both vnodes are - * unlocked. It is possible, although highly unlikely, that - * during that window the parent-child relationship between - * the vnodes may change, for example, get reversed. - * In that case we would have a wrong lock order for the vnodes. 
- * All other filesystems seem to ignore this problem, so we - * do the same here. - * A potential solution could be implemented as follows: - * - using LK_NOWAIT when locking the second vnode and retrying - * if necessary - * - checking that the parent-child relationship still holds - * after locking both vnodes and retrying if it doesn't - */ - error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); - return (error); - } else { - error = vn_lock(vp, lkflags); - if (error != 0) - vrele(vp); - return (error); - } -} - -/* - * Lookup an entry in a directory, or an extended attribute directory. - * If it exists, return a held vnode reference for it. - * - * IN: dvp - vnode of directory to search. - * nm - name of entry to lookup. - * pnp - full pathname to lookup [UNUSED]. - * flags - LOOKUP_XATTR set if looking for an attribute. - * rdir - root directory vnode [UNUSED]. - * cr - credentials of caller. - * ct - caller context - * - * OUT: vpp - vnode of located entry, NULL if not found. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * NA - */ -/* ARGSUSED */ -static int -zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, - int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached) -{ - znode_t *zdp = VTOZ(dvp); - znode_t *zp; - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error = 0; - - /* - * Fast path lookup, however we must skip DNLC lookup - * for case folding or normalizing lookups because the - * DNLC code only stores the passed in name. This means - * creating 'a' and removing 'A' on a case insensitive - * file system would work, but DNLC still thinks 'a' - * exists and won't let you create it again on the next - * pass through fast path. - */ - if (!(flags & LOOKUP_XATTR)) { - if (dvp->v_type != VDIR) { - return (SET_ERROR(ENOTDIR)); - } else if (zdp->z_sa_hdl == NULL) { - return (SET_ERROR(EIO)); - } - } - - DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zdp); - - *vpp = NULL; - - if (flags & LOOKUP_XATTR) { -#ifdef TODO - /* - * If the xattr property is off, refuse the lookup request. - */ - if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } -#endif - - /* - * We don't allow recursive attributes.. - * Maybe someday we will. - */ - if (zdp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Do we have permission to get into attribute directory? - */ - if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, - B_FALSE, cr)) { - vrele(*vpp); - *vpp = NULL; - } - - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Check accessibility of directory. - */ - if (!cached) { - if ((cnp->cn_flags & NOEXECCHECK) != 0) { - cnp->cn_flags &= ~NOEXECCHECK; - } else { - error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - } - - if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - - /* - * First handle the special cases. - */ - if ((cnp->cn_flags & ISDOTDOT) != 0) { - /* - * If we are a snapshot mounted under .zfs, return - * the vp for the snapshot directory. 
- */ - if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { - struct componentname cn; - vnode_t *zfsctl_vp; - int ltype; - - ZFS_EXIT(zfsvfs); - ltype = VOP_ISLOCKED(dvp); - VOP_UNLOCK(dvp); - error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, - &zfsctl_vp); - if (error == 0) { - cn.cn_nameptr = "snapshot"; - cn.cn_namelen = strlen(cn.cn_nameptr); - cn.cn_nameiop = cnp->cn_nameiop; - cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; - cn.cn_lkflags = cnp->cn_lkflags; - error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); - vput(zfsctl_vp); - } - vn_lock(dvp, ltype | LK_RETRY); - return (error); - } - } - if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { - ZFS_EXIT(zfsvfs); - if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) - return (SET_ERROR(ENOTSUP)); - error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); - return (error); - } - - /* - * The loop is retry the lookup if the parent-child relationship - * changes during the dot-dot locking complexities. - */ - for (;;) { - uint64_t parent; - - error = zfs_dirlook(zdp, nm, &zp); - if (error == 0) - *vpp = ZTOV(zp); - - ZFS_EXIT(zfsvfs); - if (error != 0) - break; - - error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); - if (error != 0) { - /* - * If we've got a locking error, then the vnode - * got reclaimed because of a force unmount. - * We never enter doomed vnodes into the name cache. - */ - *vpp = NULL; - return (error); - } - - if ((cnp->cn_flags & ISDOTDOT) == 0) - break; - - ZFS_ENTER(zfsvfs); - if (zdp->z_sa_hdl == NULL) { - error = SET_ERROR(EIO); - } else { - error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent)); - } - if (error != 0) { - ZFS_EXIT(zfsvfs); - vput(ZTOV(zp)); - break; - } - if (zp->z_id == parent) { - ZFS_EXIT(zfsvfs); - break; - } - vput(ZTOV(zp)); - } - -out: - if (error != 0) - *vpp = NULL; - - /* Translate errors and add SAVENAME when needed. */ - if (cnp->cn_flags & ISLASTCN) { - switch (nameiop) { - case CREATE: - case RENAME: - if (error == ENOENT) { - error = EJUSTRETURN; - cnp->cn_flags |= SAVENAME; - break; - } - /* FALLTHROUGH */ - case DELETE: - if (error == 0) - cnp->cn_flags |= SAVENAME; - break; - } - } - - /* Insert name into cache (as non-existent) if appropriate. */ - if (zfsvfs->z_use_namecache && - error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) - cache_enter(dvp, NULL, cnp); - - /* Insert name into cache if appropriate. */ - if (zfsvfs->z_use_namecache && - error == 0 && (cnp->cn_flags & MAKEENTRY)) { - if (!(cnp->cn_flags & ISLASTCN) || - (nameiop != DELETE && nameiop != RENAME)) { - cache_enter(dvp, *vpp, cnp); - } - } - - return (error); -} - -/* - * Attempt to create a new entry in a directory. If the entry - * already exists, truncate the file if permissible, else return - * an error. Return the vp of the created or trunc'd file. - * - * IN: dvp - vnode of directory to put new file entry in. - * name - name of new file entry. - * vap - attributes of new file. - * excl - flag indicating exclusive or non-exclusive mode. - * mode - mode to open file with. - * cr - credentials of caller. - * flag - large file flag [UNUSED]. - * ct - caller context - * vsecp - ACL to be set - * - * OUT: vpp - vnode of created or trunc'd entry. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * dvp - ctime|mtime updated if new entry created - * vp - ctime|mtime always, atime if new - */ - -/* ARGSUSED */ -static int -zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, - vnode_t **vpp, cred_t *cr, kthread_t *td) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - objset_t *os; - dmu_tx_t *tx; - int error; - ksid_t *ksid; - uid_t uid; - gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - void *vsecp = NULL; - int flag = 0; - uint64_t txtype; - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - ksid = crgetsid(cr, KSID_OWNER); - if (ksid) - uid = ksid_getid(ksid); - else - uid = crgetuid(cr); - - if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || (vap->va_mask & AT_XVATTR) || - IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - os = zfsvfs->z_os; - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (vap->va_mask & AT_XVATTR) { - if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, - crgetuid(cr), cr, vap->va_type)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - *vpp = NULL; - - if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) - vap->va_mode &= ~S_ISVTX; - - error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - ASSERT3P(zp, ==, NULL); - - /* - * Create a new file object and update the directory - * to reference it. - */ - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - goto out; - } - - /* - * We only support the creation of regular files in - * extended attribute directories. - */ - - if ((dzp->z_pflags & ZFS_XATTR) && - (vap->va_type != VREG)) { - error = SET_ERROR(EINVAL); - goto out; - } - - if ((error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) - goto out; - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - error = SET_ERROR(EDQUOT); - goto out; - } - - getnewvnode_reserve(); - - tx = dmu_tx_create(os); - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && - acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, acl_ids.z_aclp->z_acl_bytes); - } - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); - zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, acl_ids.z_fuidp, vap); - zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - - getnewvnode_drop_reserve(); - -out: - if (error == 0) { - *vpp = ZTOV(zp); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Remove an entry from a directory. - * - * IN: dvp - vnode of directory to remove entry from. - * name - name of entry to remove. 
- * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dvp - ctime|mtime - * vp - ctime (if nlink > 0) - */ - -/*ARGSUSED*/ -static int -zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) -{ - znode_t *dzp = VTOZ(dvp); - znode_t *zp = VTOZ(vp); - znode_t *xzp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - uint64_t acl_obj, xattr_obj; - uint64_t obj = 0; - dmu_tx_t *tx; - boolean_t unlinked, toobig = FALSE; - uint64_t txtype; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - ZFS_VERIFY_ZP(zp); - zilog = zfsvfs->z_log; - zp = VTOZ(vp); - - xattr_obj = 0; - xzp = NULL; - - if (error = zfs_zaccess_delete(dzp, zp, cr)) { - goto out; - } - - /* - * Need to use rmdir for removing directories. - */ - if (vp->v_type == VDIR) { - error = SET_ERROR(EPERM); - goto out; - } - - vnevent_remove(vp, dvp, name, ct); - - obj = zp->z_id; - - /* are there any extended attributes? */ - error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - if (error == 0 && xattr_obj) { - error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT0(error); - } - - /* - * We may delete the znode now, or we may put it in the unlinked set; - * it depends on whether we're the last link, and on whether there are - * other holds on the vnode. So we dmu_tx_hold() the right things to - * allow for either case. - */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - - if (xzp) { - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); - } - - /* charge as an update -- would be nice not to charge at all */ - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - - /* - * Mark this transaction as typically resulting in a net free of space - */ - dmu_tx_mark_netfree(tx); - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Remove the directory entry. - */ - error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); - - if (error) { - dmu_tx_commit(tx); - goto out; - } - - if (unlinked) { - zfs_unlinked_add(zp, tx); - vp->v_vflag |= VV_NOSYNC; - } - - txtype = TX_REMOVE; - zfs_log_remove(zilog, tx, txtype, dzp, name, obj); - - dmu_tx_commit(tx); -out: - - if (xzp) - vrele(ZTOV(xzp)); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Create a new directory and insert it into dvp using the name - * provided. Return a pointer to the inserted directory. - * - * IN: dvp - vnode of directory to add subdir to. - * dirname - name of new directory. - * vap - attributes of new directory. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * vsecp - ACL to be set - * - * OUT: vpp - vnode of created directory. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * dvp - ctime|mtime updated - * vp - ctime|mtime|atime updated - */ -/*ARGSUSED*/ -static int -zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) -{ - znode_t *zp, *dzp = VTOZ(dvp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - uint64_t txtype; - dmu_tx_t *tx; - int error; - ksid_t *ksid; - uid_t uid; - gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - - ASSERT(vap->va_type == VDIR); - - /* - * If we have an ephemeral id, ACL, or XVATTR then - * make sure file system is at proper version - */ - - ksid = crgetsid(cr, KSID_OWNER); - if (ksid) - uid = ksid_getid(ksid); - else - uid = crgetuid(cr); - if (zfsvfs->z_use_fuids == B_FALSE && - ((vap->va_mask & AT_XVATTR) || - IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (dzp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (zfsvfs->z_utf8 && u8_validate(dirname, - strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (vap->va_mask & AT_XVATTR) { - if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, - crgetuid(cr), cr, vap->va_type)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * First make sure the new directory doesn't exist. - * - * Existence is checked first to make sure we don't return - * EACCES instead of EEXIST which can cause some applications - * to fail. - */ - *vpp = NULL; - - if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - ASSERT3P(zp, ==, NULL); - - if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - - /* - * Add a new entry to the directory. - */ - getnewvnode_reserve(); - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE); - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create new node. - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - /* - * Now put new name in parent dir. - */ - (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); - - *vpp = ZTOV(zp); - - txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, - acl_ids.z_fuidp, vap); - - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - getnewvnode_drop_reserve(); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Remove a directory subdir entry. 
If the current working - * directory is the same as the subdir to be removed, the - * remove will fail. - * - * IN: dvp - vnode of directory to remove from. - * name - name of directory to be removed. - * cwd - vnode of current working directory. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dvp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) -{ - znode_t *dzp = VTOZ(dvp); - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - dmu_tx_t *tx; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - ZFS_VERIFY_ZP(zp); - zilog = zfsvfs->z_log; - - - if (error = zfs_zaccess_delete(dzp, zp, cr)) { - goto out; - } - - if (vp->v_type != VDIR) { - error = SET_ERROR(ENOTDIR); - goto out; - } - - vnevent_rmdir(vp, dvp, name, ct); - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - zfs_sa_upgrade_txholds(tx, zp); - zfs_sa_upgrade_txholds(tx, dzp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - cache_purge(dvp); - - error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); - - if (error == 0) { - uint64_t txtype = TX_RMDIR; - zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); - } - - dmu_tx_commit(tx); - - cache_purge(vp); -out: - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Read as many directory entries as will fit into the provided - * buffer from the given directory cursor position (specified in - * the uio structure). - * - * IN: vp - vnode of directory to read. - * uio - structure supplying read location, range info, - * and return buffer. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * OUT: uio - updated offset and range, buffer filled. - * eofp - set to true if end-of-file detected. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - atime updated - * - * Note that the low 4 bits of the cookie returned by zap is always zero. - * This allows us to use the low range for "special" directory entries: - * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, - * we use the offset 2 for the '.zfs' directory. - */ -/* ARGSUSED */ -static int -zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) -{ - znode_t *zp = VTOZ(vp); - iovec_t *iovp; - edirent_t *eodp; - dirent64_t *odp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os; - caddr_t outbuf; - size_t bufsize; - zap_cursor_t zc; - zap_attribute_t zap; - uint_t bytes_wanted; - uint64_t offset; /* must be unsigned; checks for < 1 */ - uint64_t parent; - int local_eof; - int outcount; - int error; - uint8_t prefetch; - boolean_t check_sysattrs; - uint8_t type; - int ncooks; - u_long *cooks = NULL; - int flags = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (parent))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * If we are not given an eof variable, - * use a local one. - */ - if (eofp == NULL) - eofp = &local_eof; - - /* - * Check for valid iov_len. 
- */ - if (uio->uio_iov->iov_len <= 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Quit if directory has been removed (posix) - */ - if ((*eofp = zp->z_unlinked) != 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - error = 0; - os = zfsvfs->z_os; - offset = uio->uio_loffset; - prefetch = zp->z_zn_prefetch; - - /* - * Initialize the iterator cursor. - */ - if (offset <= 3) { - /* - * Start iteration from the beginning of the directory. - */ - zap_cursor_init(&zc, os, zp->z_id); - } else { - /* - * The offset is a serialized cursor. - */ - zap_cursor_init_serialized(&zc, os, zp->z_id, offset); - } - - /* - * Get space to change directory entries into fs independent format. - */ - iovp = uio->uio_iov; - bytes_wanted = iovp->iov_len; - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { - bufsize = bytes_wanted; - outbuf = kmem_alloc(bufsize, KM_SLEEP); - odp = (struct dirent64 *)outbuf; - } else { - bufsize = bytes_wanted; - outbuf = NULL; - odp = (struct dirent64 *)iovp->iov_base; - } - eodp = (struct edirent *)odp; - - if (ncookies != NULL) { - /* - * Minimum entry size is dirent size and 1 byte for a file name. - */ - ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); - cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); - *cookies = cooks; - *ncookies = ncooks; - } - /* - * If this VFS supports the system attribute view interface; and - * we're looking at an extended attribute directory; and we care - * about normalization conflicts on this vfs; then we must check - * for normalization conflicts with the sysattr name space. - */ -#ifdef TODO - check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && - (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && - (flags & V_RDDIR_ENTFLAGS); -#else - check_sysattrs = 0; -#endif - - /* - * Transform to file-system independent format - */ - outcount = 0; - while (outcount < bytes_wanted) { - ino64_t objnum; - ushort_t reclen; - off64_t *next = NULL; - - /* - * Special case `.', `..', and `.zfs'. - */ - if (offset == 0) { - (void) strcpy(zap.za_name, "."); - zap.za_normalization_conflict = 0; - objnum = zp->z_id; - type = DT_DIR; - } else if (offset == 1) { - (void) strcpy(zap.za_name, ".."); - zap.za_normalization_conflict = 0; - objnum = parent; - type = DT_DIR; - } else if (offset == 2 && zfs_show_ctldir(zp)) { - (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); - zap.za_normalization_conflict = 0; - objnum = ZFSCTL_INO_ROOT; - type = DT_DIR; - } else { - /* - * Grab next entry. 
- */ - if (error = zap_cursor_retrieve(&zc, &zap)) { - if ((*eofp = (error == ENOENT)) != 0) - break; - else - goto update; - } - - if (zap.za_integer_length != 8 || - zap.za_num_integers != 1) { - cmn_err(CE_WARN, "zap_readdir: bad directory " - "entry, obj = %lld, offset = %lld\n", - (u_longlong_t)zp->z_id, - (u_longlong_t)offset); - error = SET_ERROR(ENXIO); - goto update; - } - - objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); - /* - * MacOS X can extract the object type here such as: - * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); - */ - type = ZFS_DIRENT_TYPE(zap.za_first_integer); - - if (check_sysattrs && !zap.za_normalization_conflict) { -#ifdef TODO - zap.za_normalization_conflict = - xattr_sysattr_casechk(zap.za_name); -#else - panic("%s:%u: TODO", __func__, __LINE__); -#endif - } - } - - if (flags & V_RDDIR_ACCFILTER) { - /* - * If we have no access at all, don't include - * this entry in the returned information - */ - znode_t *ezp; - if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) - goto skip_entry; - if (!zfs_has_access(ezp, cr)) { - vrele(ZTOV(ezp)); - goto skip_entry; - } - vrele(ZTOV(ezp)); - } - - if (flags & V_RDDIR_ENTFLAGS) - reclen = EDIRENT_RECLEN(strlen(zap.za_name)); - else - reclen = DIRENT64_RECLEN(strlen(zap.za_name)); - - /* - * Will this entry fit in the buffer? - */ - if (outcount + reclen > bufsize) { - /* - * Did we manage to fit anything in the buffer? - */ - if (!outcount) { - error = SET_ERROR(EINVAL); - goto update; - } - break; - } - if (flags & V_RDDIR_ENTFLAGS) { - /* - * Add extended flag entry: - */ - eodp->ed_ino = objnum; - eodp->ed_reclen = reclen; - /* NOTE: ed_off is the offset for the *next* entry. */ - next = &eodp->ed_off; - eodp->ed_eflags = zap.za_normalization_conflict ? - ED_CASE_CONFLICT : 0; - (void) strncpy(eodp->ed_name, zap.za_name, - EDIRENT_NAMELEN(reclen)); - eodp = (edirent_t *)((intptr_t)eodp + reclen); - } else { - /* - * Add normal entry: - */ - odp->d_ino = objnum; - odp->d_reclen = reclen; - odp->d_namlen = strlen(zap.za_name); - /* NOTE: d_off is the offset for the *next* entry. */ - next = &odp->d_off; - (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); - odp->d_type = type; - dirent_terminate(odp); - odp = (dirent64_t *)((intptr_t)odp + reclen); - } - outcount += reclen; - - ASSERT(outcount <= bufsize); - - /* Prefetch znode */ - if (prefetch) - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - - skip_entry: - /* - * Move to the next entry, fill in the previous offset. - */ - if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { - zap_cursor_advance(&zc); - offset = zap_cursor_serialize(&zc); - } else { - offset += 1; - } - - /* Fill the offset right after advancing the cursor. */ - if (next != NULL) - *next = offset; - if (cooks != NULL) { - *cooks++ = offset; - ncooks--; - KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); - } - } - zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ - - /* Subtract unused cookies */ - if (ncookies != NULL) - *ncookies -= ncooks; - - if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { - iovp->iov_base += outcount; - iovp->iov_len -= outcount; - uio->uio_resid -= outcount; - } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { - /* - * Reset the pointer. 
- */ - offset = uio->uio_loffset; - } - -update: - zap_cursor_fini(&zc); - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) - kmem_free(outbuf, bufsize); - - if (error == ENOENT) - error = 0; - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - - uio->uio_loffset = offset; - ZFS_EXIT(zfsvfs); - if (error != 0 && cookies != NULL) { - free(*cookies, M_TEMP); - *cookies = NULL; - *ncookies = 0; - } - return (error); -} - -ulong_t zfs_fsync_sync_cnt = 4; - -static int -zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - } - return (0); -} - - -/* - * Get the requested file attributes and place them in the provided - * vattr structure. - * - * IN: vp - vnode of file. - * vap - va_mask identifies requested attributes. - * If AT_XVATTR set, then optional attrs are requested - * flags - ATTR_NOACLCHECK (CIFS server context) - * cr - credentials of caller. - * ct - caller context - * - * OUT: vap - attribute values. - * - * RETURN: 0 (always succeeds). - */ -/* ARGSUSED */ -static int -zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error = 0; - uint32_t blksize; - u_longlong_t nblocks; - uint64_t mtime[2], ctime[2], crtime[2], rdev; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap = NULL; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - sa_bulk_attr_t bulk[4]; - int count = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); - if (vp->v_type == VBLK || vp->v_type == VCHR) - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, - &rdev, 8); - - if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. - * Also, if we are the owner don't bother, since owner should - * always be allowed to read basic attributes of file. - */ - if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && - (vap->va_uid != crgetuid(cr))) { - if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, - skipaclchk, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - } - - /* - * Return all attributes. It's cheaper to provide the answer - * than to determine whether we were asked the question. - */ - - vap->va_type = IFTOVT(zp->z_mode); - vap->va_mode = zp->z_mode & ~S_IFMT; -#ifdef illumos - vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; -#else - vn_fsid(vp, vap); -#endif - vap->va_nodeid = zp->z_id; - vap->va_nlink = zp->z_links; - if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && - zp->z_links < ZFS_LINK_MAX) - vap->va_nlink++; - vap->va_size = zp->z_size; -#ifdef illumos - vap->va_rdev = vp->v_rdev; -#else - if (vp->v_type == VBLK || vp->v_type == VCHR) - vap->va_rdev = zfs_cmpldev(rdev); -#endif - vap->va_seq = zp->z_seq; - vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ - vap->va_filerev = zp->z_seq; - - /* - * Add in any requested optional attributes and the create time. 
- * Also set the corresponding bits in the returned attribute bitmap. - */ - if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { - xoap->xoa_archive = - ((zp->z_pflags & ZFS_ARCHIVE) != 0); - XVA_SET_RTN(xvap, XAT_ARCHIVE); - } - - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { - xoap->xoa_readonly = - ((zp->z_pflags & ZFS_READONLY) != 0); - XVA_SET_RTN(xvap, XAT_READONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { - xoap->xoa_system = - ((zp->z_pflags & ZFS_SYSTEM) != 0); - XVA_SET_RTN(xvap, XAT_SYSTEM); - } - - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { - xoap->xoa_hidden = - ((zp->z_pflags & ZFS_HIDDEN) != 0); - XVA_SET_RTN(xvap, XAT_HIDDEN); - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - xoap->xoa_nounlink = - ((zp->z_pflags & ZFS_NOUNLINK) != 0); - XVA_SET_RTN(xvap, XAT_NOUNLINK); - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - xoap->xoa_immutable = - ((zp->z_pflags & ZFS_IMMUTABLE) != 0); - XVA_SET_RTN(xvap, XAT_IMMUTABLE); - } - - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - xoap->xoa_appendonly = - ((zp->z_pflags & ZFS_APPENDONLY) != 0); - XVA_SET_RTN(xvap, XAT_APPENDONLY); - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - xoap->xoa_nodump = - ((zp->z_pflags & ZFS_NODUMP) != 0); - XVA_SET_RTN(xvap, XAT_NODUMP); - } - - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { - xoap->xoa_opaque = - ((zp->z_pflags & ZFS_OPAQUE) != 0); - XVA_SET_RTN(xvap, XAT_OPAQUE); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - xoap->xoa_av_quarantined = - ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); - XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - xoap->xoa_av_modified = - ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); - XVA_SET_RTN(xvap, XAT_AV_MODIFIED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && - vp->v_type == VREG) { - zfs_sa_get_scanstamp(zp, xvap); - } - - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); - XVA_SET_RTN(xvap, XAT_REPARSE); - } - if (XVA_ISSET_REQ(xvap, XAT_GEN)) { - xoap->xoa_generation = zp->z_gen; - XVA_SET_RTN(xvap, XAT_GEN); - } - - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { - xoap->xoa_offline = - ((zp->z_pflags & ZFS_OFFLINE) != 0); - XVA_SET_RTN(xvap, XAT_OFFLINE); - } - - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { - xoap->xoa_sparse = - ((zp->z_pflags & ZFS_SPARSE) != 0); - XVA_SET_RTN(xvap, XAT_SPARSE); - } - } - - ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); - ZFS_TIME_DECODE(&vap->va_mtime, mtime); - ZFS_TIME_DECODE(&vap->va_ctime, ctime); - ZFS_TIME_DECODE(&vap->va_birthtime, crtime); - - - sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); - vap->va_blksize = blksize; - vap->va_bytes = nblocks << 9; /* nblocks * 512 */ - - if (zp->z_blksz == 0) { - /* - * Block size hasn't been set; suggest maximal I/O transfers. - */ - vap->va_blksize = zfsvfs->z_max_blksz; - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* - * Set the file attributes to the values contained in the - * vattr structure. - * - * IN: vp - vnode of file to be modified. - * vap - new attribute values. - * If AT_XVATTR set, then optional attrs are being set - * flags - ATTR_UTIME set if non-default time values provided. - * - ATTR_NOACLCHECK (CIFS context only). - * cr - credentials of caller. - * ct - caller context - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - ctime updated, mtime updated if size changed. 
- */ -/* ARGSUSED */ -static int -zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog; - dmu_tx_t *tx; - vattr_t oldva; - xvattr_t tmpxvattr; - uint_t mask = vap->va_mask; - uint_t saved_mask = 0; - uint64_t saved_mode; - int trim_mask = 0; - uint64_t new_mode; - uint64_t new_uid, new_gid; - uint64_t xattr_obj; - uint64_t mtime[2], ctime[2]; - znode_t *attrzp; - int need_policy = FALSE; - int err, err2; - zfs_fuid_info_t *fuidp = NULL; - xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ - xoptattr_t *xoap; - zfs_acl_t *aclp; - boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - boolean_t fuid_dirtied = B_FALSE; - sa_bulk_attr_t bulk[7], xattr_bulk[7]; - int count = 0, xattr_count = 0; - - if (mask == 0) - return (0); - - if (mask & AT_NOSET) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - zilog = zfsvfs->z_log; - - /* - * Make sure that if we have ephemeral uid/gid or xvattr specified - * that file system is at proper version level - */ - - if (zfsvfs->z_use_fuids == B_FALSE && - (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || - (mask & AT_XVATTR))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - if (mask & AT_SIZE && vp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EISDIR)); - } - - if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * If this is an xvattr_t, then get a pointer to the structure of - * optional attributes. If this is NULL, then we have a vattr_t. - */ - xoap = xva_getxoptattr(xvap); - - xva_init(&tmpxvattr); - - /* - * Immutable files can only alter immutable bit and atime - */ - if ((zp->z_pflags & ZFS_IMMUTABLE) && - ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || - ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - /* - * Note: ZFS_READONLY is handled in zfs_zaccess_common. - */ - - /* - * Verify timestamps doesn't overflow 32 bits. - * ZFS can handle large timestamps, but 32bit syscalls can't - * handle times greater than 2039. This check should be removed - * once large timestamps are fully supported. - */ - if (mask & (AT_ATIME | AT_MTIME)) { - if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || - ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EOVERFLOW)); - } - } - if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && - TIMESPEC_OVERFLOW(&vap->va_birthtime)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EOVERFLOW)); - } - - attrzp = NULL; - aclp = NULL; - - /* Can this be moved to before the top label? */ - if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - /* - * First validate permissions - */ - - if (mask & AT_SIZE) { - /* - * XXX - Note, we are not providing any open - * mode flags here (like FNDELAY), so we may - * block if there are locks present... this - * should be addressed in openat(). - */ - /* XXX - would it be OK to generate a log record here? 
*/ - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - } - - if (mask & (AT_ATIME|AT_MTIME) || - ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || - XVA_ISSET_REQ(xvap, XAT_READONLY) || - XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || - XVA_ISSET_REQ(xvap, XAT_OFFLINE) || - XVA_ISSET_REQ(xvap, XAT_SPARSE) || - XVA_ISSET_REQ(xvap, XAT_CREATETIME) || - XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { - need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, - skipaclchk, cr); - } - - if (mask & (AT_UID|AT_GID)) { - int idmask = (mask & (AT_UID|AT_GID)); - int take_owner; - int take_group; - - /* - * NOTE: even if a new mode is being set, - * we may clear S_ISUID/S_ISGID bits. - */ - - if (!(mask & AT_MODE)) - vap->va_mode = zp->z_mode; - - /* - * Take ownership or chgrp to group we are a member of - */ - - take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); - take_group = (mask & AT_GID) && - zfs_groupmember(zfsvfs, vap->va_gid, cr); - - /* - * If both AT_UID and AT_GID are set then take_owner and - * take_group must both be set in order to allow taking - * ownership. - * - * Otherwise, send the check through secpolicy_vnode_setattr() - * - */ - - if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || - ((idmask == AT_UID) && take_owner) || - ((idmask == AT_GID) && take_group)) { - if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, - skipaclchk, cr) == 0) { - /* - * Remove setuid/setgid for non-privileged users - */ - secpolicy_setid_clear(vap, vp, cr); - trim_mask = (mask & (AT_UID|AT_GID)); - } else { - need_policy = TRUE; - } - } else { - need_policy = TRUE; - } - } - - oldva.va_mode = zp->z_mode; - zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); - if (mask & AT_XVATTR) { - /* - * Update xvattr mask to include only those attributes - * that are actually changing. - * - * the bits will be restored prior to actually setting - * the attributes so the caller thinks they were set. 
- */ - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - if (xoap->xoa_appendonly != - ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_APPENDONLY); - XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - if (xoap->xoa_nounlink != - ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NOUNLINK); - XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - if (xoap->xoa_immutable != - ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_IMMUTABLE); - XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - if (xoap->xoa_nodump != - ((zp->z_pflags & ZFS_NODUMP) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_NODUMP); - XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - if (xoap->xoa_av_modified != - ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); - XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - if ((vp->v_type != VREG && - xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { - need_policy = TRUE; - } else { - XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); - XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (need_policy == FALSE && - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || - XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { - need_policy = TRUE; - } - } - - if (mask & AT_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { - err = secpolicy_setid_setsticky_clear(vp, vap, - &oldva, cr); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - trim_mask |= AT_MODE; - } else { - need_policy = TRUE; - } - } - - if (need_policy) { - /* - * If trim_mask is set then take ownership - * has been granted or write_acl is present and user - * has the ability to modify mode. In that case remove - * UID|GID and or MODE from mask so that - * secpolicy_vnode_setattr() doesn't revoke it. - */ - - if (trim_mask) { - saved_mask = vap->va_mask; - vap->va_mask &= ~trim_mask; - if (trim_mask & AT_MODE) { - /* - * Save the mode, as secpolicy_vnode_setattr() - * will overwrite it with ova.va_mode. - */ - saved_mode = vap->va_mode; - } - } - err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } - - if (trim_mask) { - vap->va_mask |= saved_mask; - if (trim_mask & AT_MODE) { - /* - * Recover the mode after - * secpolicy_vnode_setattr(). 
- */ - vap->va_mode = saved_mode; - } - } - } - - /* - * secpolicy_vnode_setattr, or take ownership may have - * changed va_mask - */ - mask = vap->va_mask; - - if ((mask & (AT_UID | AT_GID))) { - err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), - &xattr_obj, sizeof (xattr_obj)); - - if (err == 0 && xattr_obj) { - err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); - if (err == 0) { - err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); - if (err != 0) - vrele(ZTOV(attrzp)); - } - if (err) - goto out2; - } - if (mask & AT_UID) { - new_uid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); - if (new_uid != zp->z_uid && - zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { - if (attrzp) - vput(ZTOV(attrzp)); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - - if (mask & AT_GID) { - new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, - cr, ZFS_GROUP, &fuidp); - if (new_gid != zp->z_gid && - zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { - if (attrzp) - vput(ZTOV(attrzp)); - err = SET_ERROR(EDQUOT); - goto out2; - } - } - } - tx = dmu_tx_create(zfsvfs->z_os); - - if (mask & AT_MODE) { - uint64_t pmode = zp->z_mode; - uint64_t acl_obj; - new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - - if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && - !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { - err = SET_ERROR(EPERM); - goto out; - } - - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) - goto out; - - if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { - /* - * Are we upgrading ACL from old V0 format - * to V1 format? - */ - if (zfsvfs->z_version >= ZPL_VERSION_FUID && - zfs_znode_acl_version(zp) == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, acl_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, acl_obj, 0, - aclp->z_acl_bytes); - } - } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - } else { - if ((mask & AT_XVATTR) && - XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); - else - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - } - - if (attrzp) { - dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); - } - - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - - zfs_sa_upgrade_txholds(tx, zp); - - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) - goto out; - - count = 0; - /* - * Set each attribute requested. - * We group settings according to the locks they need to acquire. - * - * Note: you cannot set ctime directly, although it will be - * updated as a side-effect of calling this function. 
- */ - - if (mask & (AT_UID|AT_GID|AT_MODE)) - mutex_enter(&zp->z_acl_lock); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - - if (attrzp) { - if (mask & (AT_UID|AT_GID|AT_MODE)) - mutex_enter(&attrzp->z_acl_lock); - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, - sizeof (attrzp->z_pflags)); - } - - if (mask & (AT_UID|AT_GID)) { - - if (mask & AT_UID) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &new_uid, sizeof (new_uid)); - zp->z_uid = new_uid; - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_UID(zfsvfs), NULL, &new_uid, - sizeof (new_uid)); - attrzp->z_uid = new_uid; - } - } - - if (mask & AT_GID) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), - NULL, &new_gid, sizeof (new_gid)); - zp->z_gid = new_gid; - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_GID(zfsvfs), NULL, &new_gid, - sizeof (new_gid)); - attrzp->z_gid = new_gid; - } - } - if (!(mask & AT_MODE)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), - NULL, &new_mode, sizeof (new_mode)); - new_mode = zp->z_mode; - } - err = zfs_acl_chown_setattr(zp); - ASSERT(err == 0); - if (attrzp) { - err = zfs_acl_chown_setattr(attrzp); - ASSERT(err == 0); - } - } - - if (mask & AT_MODE) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &new_mode, sizeof (new_mode)); - zp->z_mode = new_mode; - ASSERT3U((uintptr_t)aclp, !=, 0); - err = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT0(err); - if (zp->z_acl_cached) - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = aclp; - aclp = NULL; - } - - - if (mask & AT_ATIME) { - ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &zp->z_atime, sizeof (zp->z_atime)); - } - - if (mask & AT_MTIME) { - ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - mtime, sizeof (mtime)); - } - - /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ - if (mask & AT_SIZE && !(mask & AT_MTIME)) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), - NULL, mtime, sizeof (mtime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, sizeof (ctime)); - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, - B_TRUE); - } else if (mask != 0) { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, sizeof (ctime)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, - B_TRUE); - if (attrzp) { - SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, - SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, sizeof (ctime)); - zfs_tstamp_update_setup(attrzp, STATE_CHANGED, - mtime, ctime, B_TRUE); - } - } - /* - * Do this after setting timestamps to prevent timestamp - * update from toggling bit - */ - - if (xoap && (mask & AT_XVATTR)) { - - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) - xoap->xoa_createtime = vap->va_birthtime; - /* - * restore trimmed off masks - * so that return masks can be set for caller. 
- */ - - if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { - XVA_SET_REQ(xvap, XAT_APPENDONLY); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { - XVA_SET_REQ(xvap, XAT_NOUNLINK); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { - XVA_SET_REQ(xvap, XAT_IMMUTABLE); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { - XVA_SET_REQ(xvap, XAT_NODUMP); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { - XVA_SET_REQ(xvap, XAT_AV_MODIFIED); - } - if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { - XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); - } - - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) - ASSERT(vp->v_type == VREG); - - zfs_xvattr_set(zp, xvap, tx); - } - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - if (mask != 0) - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - - if (mask & (AT_UID|AT_GID|AT_MODE)) - mutex_exit(&zp->z_acl_lock); - - if (attrzp) { - if (mask & (AT_UID|AT_GID|AT_MODE)) - mutex_exit(&attrzp->z_acl_lock); - } -out: - if (err == 0 && attrzp) { - err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, - xattr_count, tx); - ASSERT(err2 == 0); - } - - if (attrzp) - vput(ZTOV(attrzp)); - - if (aclp) - zfs_acl_free(aclp); - - if (fuidp) { - zfs_fuid_info_free(fuidp); - fuidp = NULL; - } - - if (err) { - dmu_tx_abort(tx); - } else { - err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - dmu_tx_commit(tx); - } - -out2: - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * We acquire all but fdvp locks using non-blocking acquisitions. If we - * fail to acquire any lock in the path we will drop all held locks, - * acquire the new lock in a blocking fashion, and then release it and - * restart the rename. This acquire/release step ensures that we do not - * spin on a lock waiting for release. On error release all vnode locks - * and decrement references the way tmpfs_rename() would do. - */ -static int -zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, - struct vnode *tdvp, struct vnode **tvpp, - const struct componentname *scnp, const struct componentname *tcnp) -{ - zfsvfs_t *zfsvfs; - struct vnode *nvp, *svp, *tvp; - znode_t *sdzp, *tdzp, *szp, *tzp; - const char *snm = scnp->cn_nameptr; - const char *tnm = tcnp->cn_nameptr; - int error; - - VOP_UNLOCK(tdvp); - if (*tvpp != NULL && *tvpp != tdvp) - VOP_UNLOCK(*tvpp); - -relock: - error = vn_lock(sdvp, LK_EXCLUSIVE); - if (error) - goto out; - sdzp = VTOZ(sdvp); - - error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); - if (error != 0) { - VOP_UNLOCK(sdvp); - if (error != EBUSY) - goto out; - error = vn_lock(tdvp, LK_EXCLUSIVE); - if (error) - goto out; - VOP_UNLOCK(tdvp); - goto relock; - } - tdzp = VTOZ(tdvp); - - /* - * Before using sdzp and tdzp we must ensure that they are live. - * As a porting legacy from illumos we have two things to worry - * about. One is typical for FreeBSD and it is that the vnode is - * not reclaimed (doomed). The other is that the znode is live. - * The current code can invalidate the znode without acquiring the - * corresponding vnode lock if the object represented by the znode - * and vnode is no longer valid after a rollback or receive operation. - * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock - * that protects the znodes from the invalidation. - */ - zfsvfs = sdzp->z_zfsvfs; - ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); - ZFS_ENTER(zfsvfs); - - /* - * We can not use ZFS_VERIFY_ZP() here because it could directly return - * bypassing the cleanup code in the case of an error. 
- */ - if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { - ZFS_EXIT(zfsvfs); - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - error = SET_ERROR(EIO); - goto out; - } - - /* - * Re-resolve svp to be certain it still exists and fetch the - * correct vnode. - */ - error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); - if (error != 0) { - /* Source entry invalid or not there. */ - ZFS_EXIT(zfsvfs); - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - if ((scnp->cn_flags & ISDOTDOT) != 0 || - (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) - error = SET_ERROR(EINVAL); - goto out; - } - svp = ZTOV(szp); - - /* - * Re-resolve tvp, if it disappeared we just carry on. - */ - error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); - if (error != 0) { - ZFS_EXIT(zfsvfs); - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - vrele(svp); - if ((tcnp->cn_flags & ISDOTDOT) != 0) - error = SET_ERROR(EINVAL); - goto out; - } - if (tzp != NULL) - tvp = ZTOV(tzp); - else - tvp = NULL; - - /* - * At present the vnode locks must be acquired before z_teardown_lock, - * although it would be more logical to use the opposite order. - */ - ZFS_EXIT(zfsvfs); - - /* - * Now try acquire locks on svp and tvp. - */ - nvp = svp; - error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); - if (error != 0) { - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - if (tvp != NULL) - vrele(tvp); - if (error != EBUSY) { - vrele(nvp); - goto out; - } - error = vn_lock(nvp, LK_EXCLUSIVE); - if (error != 0) { - vrele(nvp); - goto out; - } - VOP_UNLOCK(nvp); - /* - * Concurrent rename race. - * XXX ? - */ - if (nvp == tdvp) { - vrele(nvp); - error = SET_ERROR(EINVAL); - goto out; - } - vrele(*svpp); - *svpp = nvp; - goto relock; - } - vrele(*svpp); - *svpp = nvp; - - if (*tvpp != NULL) - vrele(*tvpp); - *tvpp = NULL; - if (tvp != NULL) { - nvp = tvp; - error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); - if (error != 0) { - VOP_UNLOCK(sdvp); - VOP_UNLOCK(tdvp); - VOP_UNLOCK(*svpp); - if (error != EBUSY) { - vrele(nvp); - goto out; - } - error = vn_lock(nvp, LK_EXCLUSIVE); - if (error != 0) { - vrele(nvp); - goto out; - } - vput(nvp); - goto relock; - } - *tvpp = nvp; - } - - return (0); - -out: - return (error); -} - -/* - * Note that we must use VRELE_ASYNC in this function as it walks - * up the directory tree and vrele may need to acquire an exclusive - * lock if a last reference to a vnode is dropped. - */ -static int -zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) -{ - zfsvfs_t *zfsvfs; - znode_t *zp, *zp1; - uint64_t parent; - int error; - - zfsvfs = tdzp->z_zfsvfs; - if (tdzp == szp) - return (SET_ERROR(EINVAL)); - if (tdzp == sdzp) - return (0); - if (tdzp->z_id == zfsvfs->z_root) - return (0); - zp = tdzp; - for (;;) { - ASSERT(!zp->z_unlinked); - if ((error = sa_lookup(zp->z_sa_hdl, - SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) - break; - - if (parent == szp->z_id) { - error = SET_ERROR(EINVAL); - break; - } - if (parent == zfsvfs->z_root) - break; - if (parent == sdzp->z_id) - break; - - error = zfs_zget(zfsvfs, parent, &zp1); - if (error != 0) - break; - - if (zp != tdzp) - VN_RELE_ASYNC(ZTOV(zp), - dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); - zp = zp1; - } - - if (error == ENOTDIR) - panic("checkpath: .. not a directory\n"); - if (zp != tdzp) - VN_RELE_ASYNC(ZTOV(zp), - dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); - return (error); -} - -/* - * Move an entry from the provided source directory to the target - * directory. Change the entry name as indicated. - * - * IN: sdvp - Source directory containing the "old entry". 
- * snm - Old entry name. - * tdvp - Target directory to contain the "new entry". - * tnm - New entry name. - * cr - credentials of caller. - * ct - caller context - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * sdvp,tdvp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, - vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, - cred_t *cr) -{ - zfsvfs_t *zfsvfs; - znode_t *sdzp, *tdzp, *szp, *tzp; - zilog_t *zilog = NULL; - dmu_tx_t *tx; - char *snm = scnp->cn_nameptr; - char *tnm = tcnp->cn_nameptr; - int error = 0; - bool want_seqc_end = false; - - /* Reject renames across filesystems. */ - if ((*svpp)->v_mount != tdvp->v_mount || - ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { - error = SET_ERROR(EXDEV); - goto out; - } - - if (zfsctl_is_node(tdvp)) { - error = SET_ERROR(EXDEV); - goto out; - } - - /* - * Lock all four vnodes to ensure safety and semantics of renaming. - */ - error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); - if (error != 0) { - /* no vnodes are locked in the case of error here */ - return (error); - } - - tdzp = VTOZ(tdvp); - sdzp = VTOZ(sdvp); - zfsvfs = tdzp->z_zfsvfs; - zilog = zfsvfs->z_log; - - /* - * After we re-enter ZFS_ENTER() we will have to revalidate all - * znodes involved. - */ - ZFS_ENTER(zfsvfs); - - if (zfsvfs->z_utf8 && u8_validate(tnm, - strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - error = SET_ERROR(EILSEQ); - goto unlockout; - } - - /* If source and target are the same file, there is nothing to do. */ - if ((*svpp) == (*tvpp)) { - error = 0; - goto unlockout; - } - - if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || - ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && - (*tvpp)->v_mountedhere != NULL)) { - error = SET_ERROR(EXDEV); - goto unlockout; - } - - /* - * We can not use ZFS_VERIFY_ZP() here because it could directly return - * bypassing the cleanup code in the case of an error. - */ - if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { - error = SET_ERROR(EIO); - goto unlockout; - } - - szp = VTOZ(*svpp); - tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); - if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { - error = SET_ERROR(EIO); - goto unlockout; - } - - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. - * See the comment in zfs_link() for why this is considered bad. - */ - if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { - error = SET_ERROR(EINVAL); - goto unlockout; - } - - /* - * Must have write access at the source to remove the old entry - * and write access at the target to create the new entry. - * Note that if target and source are the same, this can be - * done in a single check. - */ - if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) - goto unlockout; - - if ((*svpp)->v_type == VDIR) { - /* - * Avoid ".", "..", and aliases of "." for obvious reasons. - */ - if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || - sdzp == szp || - (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { - error = EINVAL; - goto unlockout; - } - - /* - * Check to make sure rename is valid. - * Can't do a move like this: /usr/a/b to /usr/a/b/c/d - */ - if (error = zfs_rename_check(szp, sdzp, tdzp)) - goto unlockout; - } - - /* - * Does target exist? - */ - if (tzp) { - /* - * Source and target must be the same type. 
- */ - if ((*svpp)->v_type == VDIR) { - if ((*tvpp)->v_type != VDIR) { - error = SET_ERROR(ENOTDIR); - goto unlockout; - } else { - cache_purge(tdvp); - if (sdvp != tdvp) - cache_purge(sdvp); - } - } else { - if ((*tvpp)->v_type == VDIR) { - error = SET_ERROR(EISDIR); - goto unlockout; - } - } - } - - vn_seqc_write_begin(*svpp); - vn_seqc_write_begin(sdvp); - if (*tvpp != NULL) - vn_seqc_write_begin(*tvpp); - if (tdvp != *tvpp) - vn_seqc_write_begin(tdvp); - want_seqc_end = true; - - vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); - if (tzp) - vnevent_rename_dest(*tvpp, tdvp, tnm, ct); - - /* - * notify the target directory if it is not the same - * as source directory. - */ - if (tdvp != sdvp) { - vnevent_rename_dest_dir(tdvp, ct); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); - dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) { - dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tdzp); - } - if (tzp) { - dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, tzp); - } - - zfs_sa_upgrade_txholds(tx, szp); - dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - goto unlockout; - } - - - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); - - if (error == 0) { - error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); - if (error == 0) { - szp->z_pflags |= ZFS_AV_MODIFIED; - - error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - (void *)&szp->z_pflags, sizeof (uint64_t), tx); - ASSERT0(error); - - error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, - NULL); - if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME, sdzp, - snm, tdzp, tnm, szp); - - /* - * Update path information for the target vnode - */ - vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); - } else { - /* - * At this point, we have successfully created - * the target name, but have failed to remove - * the source name. Since the create was done - * with the ZRENAMING flag, there are - * complications; for one, the link count is - * wrong. The easiest way to deal with this - * is to remove the newly created target, and - * return the original error. This must - * succeed; fortunately, it is very unlikely to - * fail, since we just created it. - */ - VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, - ZRENAMING, NULL), ==, 0); - } - } - if (error == 0) { - cache_purge(*svpp); - if (*tvpp != NULL) - cache_purge(*tvpp); - cache_purge_negative(tdvp); - } - } - - dmu_tx_commit(tx); - -unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ - ZFS_EXIT(zfsvfs); - if (want_seqc_end) { - vn_seqc_write_end(*svpp); - vn_seqc_write_end(sdvp); - if (*tvpp != NULL) - vn_seqc_write_end(*tvpp); - if (tdvp != *tvpp) - vn_seqc_write_end(tdvp); - want_seqc_end = false; - } - VOP_UNLOCK(*svpp); - VOP_UNLOCK(sdvp); - -out: /* original two vnodes are locked */ - MPASS(!want_seqc_end); - if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (*tvpp != NULL) - VOP_UNLOCK(*tvpp); - if (tdvp != *tvpp) - VOP_UNLOCK(tdvp); - return (error); -} - -/* - * Insert the indicated symbolic reference entry into the directory. - * - * IN: dvp - Directory to contain new symbolic link. - * link - Name for new symlink entry. - * vap - Attributes of new entry. - * cr - credentials of caller. 
- * ct - caller context - * flags - case flags - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * dvp - ctime|mtime updated - */ -/*ARGSUSED*/ -static int -zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, - cred_t *cr, kthread_t *td) -{ - znode_t *zp, *dzp = VTOZ(dvp); - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - uint64_t len = strlen(link); - int error; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; - uint64_t txtype = TX_SYMLINK; - int flags = 0; - - ASSERT(vap->va_type == VLNK); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), - NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENAMETOOLONG)); - } - - if ((error = zfs_acl_ids_create(dzp, 0, - vap, cr, NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Attempt to lock directory; fail if entry already exists. - */ - error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); - if (error) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (error); - } - - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EDQUOT)); - } - - getnewvnode_reserve(); - tx = dmu_tx_create(zfsvfs->z_os); - fuid_dirtied = zfsvfs->z_fuid_dirty; - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + - ZFS_SA_BASE_ATTR_SIZE + len); - dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); - if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - acl_ids.z_aclp->z_acl_bytes); - } - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - zfs_acl_ids_free(&acl_ids); - dmu_tx_abort(tx); - getnewvnode_drop_reserve(); - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Create a new object for the symlink. - * for version 4 ZPL datsets the symlink will be an SA attribute - */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - - if (zp->z_is_sa) - error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), - link, len, tx); - else - zfs_sa_symlink(zp, link, len, tx); - - zp->z_size = len; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - &zp->z_size, sizeof (zp->z_size), tx); - /* - * Insert the new object into the directory. - */ - (void) zfs_link_create(dzp, name, zp, tx, ZNEW); - - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - *vpp = ZTOV(zp); - - zfs_acl_ids_free(&acl_ids); - - dmu_tx_commit(tx); - - getnewvnode_drop_reserve(); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Return, in the buffer contained in the provided uio structure, - * the symbolic path referred to by vp. - * - * IN: vp - vnode of symbolic link. - * uio - structure to contain the link path. - * cr - credentials of caller. - * ct - caller context - * - * OUT: uio - structure containing the link path. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -static int -zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (zp->z_is_sa) - error = sa_lookup_uio(zp->z_sa_hdl, - SA_ZPL_SYMLINK(zfsvfs), uio); - else - error = zfs_sa_readlink(zp, uio); - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Insert a new entry into directory tdvp referencing svp. - * - * IN: tdvp - Directory to contain new entry. - * svp - vnode of new entry. - * name - name of new entry. - * cr - credentials of caller. - * ct - caller context - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * tdvp - ctime|mtime updated - * svp - ctime updated - */ -/* ARGSUSED */ -static int -zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, - caller_context_t *ct, int flags) -{ - znode_t *dzp = VTOZ(tdvp); - znode_t *tzp, *szp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zilog_t *zilog; - dmu_tx_t *tx; - int error; - uint64_t parent; - uid_t owner; - - ASSERT(tdvp->v_type == VDIR); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); - zilog = zfsvfs->z_log; - - /* - * POSIX dictates that we return EPERM here. - * Better choices include ENOTSUP or EISDIR. - */ - if (svp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - szp = VTOZ(svp); - ZFS_VERIFY_ZP(szp); - - if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - /* Prevent links to .zfs/shares files */ - - if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), - &parent, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - if (parent == zfsvfs->z_shares_dir) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (zfsvfs->z_utf8 && u8_validate(name, - strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EILSEQ)); - } - - /* - * We do not support links between attributes and non-attributes - * because of the potential security risk of creating links - * into "normal" file space in order to circumvent restrictions - * imposed in attribute space. - */ - if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - - owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); - if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * Attempt to lock directory; fail if entry already exists. 
- */ - error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - zfs_sa_upgrade_txholds(tx, szp); - zfs_sa_upgrade_txholds(tx, dzp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - - error = zfs_link_create(dzp, name, szp, tx, 0); - - if (error == 0) { - uint64_t txtype = TX_LINK; - zfs_log_link(zilog, tx, txtype, dzp, szp, name); - } - - dmu_tx_commit(tx); - - if (error == 0) { - vnevent_link(svp, ct); - } - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - - -/*ARGSUSED*/ -void -zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs); - if (zp->z_sa_hdl == NULL) { - /* - * The fs has been unmounted, or we did a - * suspend/resume and this file no longer exists. - */ - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - vrecycle(vp); - return; - } - - if (zp->z_unlinked) { - /* - * Fast path to recycle a vnode of a removed file. - */ - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - vrecycle(vp); - return; - } - - if (zp->z_atime_dirty && zp->z_unlinked == 0) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), - (void *)&zp->z_atime, sizeof (zp->z_atime), tx); - zp->z_atime_dirty = 0; - dmu_tx_commit(tx); - } - } - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); -} - - -CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); -CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); - -/*ARGSUSED*/ -static int -zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint32_t gen; - uint64_t gen64; - uint64_t object = zp->z_id; - zfid_short_t *zfid; - int size, i, error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), - &gen64, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - gen = (uint32_t)gen64; - - size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; - -#ifdef illumos - if (fidp->fid_len < size) { - fidp->fid_len = size; - ZFS_EXIT(zfsvfs); - return (SET_ERROR(ENOSPC)); - } -#else - fidp->fid_len = size; -#endif - - zfid = (zfid_short_t *)fidp; - - zfid->zf_len = size; - - for (i = 0; i < sizeof (zfid->zf_object); i++) - zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - - /* Must have a non-zero generation number to distinguish from .zfs */ - if (gen == 0) - gen = 1; - for (i = 0; i < sizeof (zfid->zf_gen); i++) - zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - - if (size == LONG_FID_LEN) { - uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); - zfid_long_t *zlfid; - - zlfid = (zfid_long_t *)fidp; - - for (i = 0; i < sizeof (zlfid->zf_setid); i++) - zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); - - /* XXX - this should be the generation number for the objset */ - for (i = 0; i < sizeof (zlfid->zf_setgen); i++) - zlfid->zf_setgen[i] = 0; - } - - ZFS_EXIT(zfsvfs); - return (0); -} - -static int -zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp, *xzp; - zfsvfs_t *zfsvfs; - int error; - - switch (cmd) { - case _PC_LINK_MAX: - *valp = MIN(LONG_MAX, ZFS_LINK_MAX); - return (0); - - case _PC_FILESIZEBITS: - *valp = 64; - return (0); -#ifdef illumos - case _PC_XATTR_EXISTS: - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - *valp = 0; - error = zfs_dirent_lookup(zp, "", &xzp, - ZXATTR | ZEXISTS | ZSHARED); - if (error == 0) { - if (!zfs_dirempty(xzp)) - *valp = 1; - vrele(ZTOV(xzp)); - } else if (error == ENOENT) { - /* - * If there aren't extended attributes, it's the - * same as having zero of them. - */ - error = 0; - } - ZFS_EXIT(zfsvfs); - return (error); - - case _PC_SATTR_ENABLED: - case _PC_SATTR_EXISTS: - *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && - (vp->v_type == VREG || vp->v_type == VDIR); - return (0); - - case _PC_ACCESS_FILTERING: - *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && - vp->v_type == VDIR; - return (0); - - case _PC_ACL_ENABLED: - *valp = _ACL_ACE_ENABLED; - return (0); -#endif /* illumos */ - case _PC_MIN_HOLE_SIZE: - *valp = (int)SPA_MINBLOCKSIZE; - return (0); -#ifdef illumos - case _PC_TIMESTAMP_RESOLUTION: - /* nanosecond timestamp resolution */ - *valp = 1L; - return (0); -#endif - case _PC_ACL_EXTENDED: - *valp = 0; - return (0); - - case _PC_ACL_NFS4: - *valp = 1; - return (0); - - case _PC_ACL_PATH_MAX: - *valp = ACL_MAX_ENTRIES; - return (0); - - default: - return (EOPNOTSUPP); - } -} - -/*ARGSUSED*/ -static int -zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_setacl(zp, vsecp, skipaclchk, cr); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -static int -zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, - int *rahead) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zp->z_zfsvfs->z_os; - locked_range_t *lr; - vm_object_t object; - off_t start, end, obj_size; - uint_t blksz; - int pgsin_b, pgsin_a; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - start = IDX_TO_OFF(ma[0]->pindex); - end = IDX_TO_OFF(ma[count - 1]->pindex + 1); - - /* - * Try to lock a range covering all required and optional pages, to - * handle the case of the block size growing. It is not safe to block - * on the range lock since the owner may be waiting for the fault page - * to be unbusied. - */ - for (;;) { - blksz = zp->z_blksz; - lr = rangelock_tryenter(&zp->z_rangelock, - rounddown(start, blksz), - roundup(end, blksz) - rounddown(start, blksz), RL_READER); - if (lr == NULL) { - if (rahead != NULL) { - *rahead = 0; - rahead = NULL; - } - if (rbehind != NULL) { - *rbehind = 0; - rbehind = NULL; - } - break; - } - if (blksz == zp->z_blksz) - break; - rangelock_exit(lr); - } - - object = ma[0]->object; - zfs_vmobject_wlock(object); - obj_size = object->un_pager.vnp.vnp_size; - zfs_vmobject_wunlock(object); - if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { - if (lr != NULL) - rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (zfs_vm_pagerret_bad); - } - - pgsin_b = 0; - if (rbehind != NULL) { - pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); - pgsin_b = MIN(*rbehind, pgsin_b); - } - - pgsin_a = 0; - if (rahead != NULL) { - pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); - if (end + IDX_TO_OFF(pgsin_a) >= obj_size) - pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); - pgsin_a = MIN(*rahead, pgsin_a); - } - - /* - * NB: we need to pass the exact byte size of the data that we expect - * to read after accounting for the file size. This is required because - * ZFS will panic if we request DMU to read beyond the end of the last - * allocated block. 
- */ - error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, - MIN(end, obj_size) - (end - PAGE_SIZE)); - - if (lr != NULL) - rangelock_exit(lr); - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - - if (error != 0) - return (zfs_vm_pagerret_error); - - VM_CNT_INC(v_vnodein); - VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); - if (rbehind != NULL) - *rbehind = pgsin_b; - if (rahead != NULL) - *rahead = pgsin_a; - return (zfs_vm_pagerret_ok); -} - -static int -zfs_freebsd_getpages(ap) - struct vop_getpages_args /* { - struct vnode *a_vp; - vm_page_t *a_m; - int a_count; - int *a_rbehind; - int *a_rahead; - } */ *ap; -{ - - return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, - ap->a_rahead)); -} - -static int -zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, - int *rtvals) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - locked_range_t *lr; - dmu_tx_t *tx; - struct sf_buf *sf; - vm_object_t object; - vm_page_t m; - caddr_t va; - size_t tocopy; - size_t lo_len; - vm_ooffset_t lo_off; - vm_ooffset_t off; - uint_t blksz; - int ncount; - int pcount; - int err; - int i; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - object = vp->v_object; - pcount = btoc(len); - ncount = pcount; - - KASSERT(ma[0]->object == object, ("mismatching object")); - KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); - - for (i = 0; i < pcount; i++) - rtvals[i] = zfs_vm_pagerret_error; - - off = IDX_TO_OFF(ma[0]->pindex); - blksz = zp->z_blksz; - lo_off = rounddown(off, blksz); - lo_len = roundup(len + (off - lo_off), blksz); - lr = rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); - - zfs_vmobject_wlock(object); - if (len + off > object->un_pager.vnp.vnp_size) { - if (object->un_pager.vnp.vnp_size > off) { - int pgoff; - - len = object->un_pager.vnp.vnp_size - off; - ncount = btoc(len); - if ((pgoff = (int)len & PAGE_MASK) != 0) { - /* - * If the object is locked and the following - * conditions hold, then the page's dirty - * field cannot be concurrently changed by a - * pmap operation. - */ - m = ma[ncount - 1]; - vm_page_assert_sbusied(m); - KASSERT(!pmap_page_is_write_mapped(m), - ("zfs_putpages: page %p is not read-only", m)); - vm_page_clear_dirty(m, pgoff, PAGE_SIZE - - pgoff); - } - } else { - len = 0; - ncount = 0; - } - if (ncount < pcount) { - for (i = ncount; i < pcount; i++) { - rtvals[i] = zfs_vm_pagerret_bad; - } - } - } - zfs_vmobject_wunlock(object); - - if (ncount == 0) - goto out; - - if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || - zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { - goto out; - } - - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_write(tx, zp->z_id, off, len); - - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err != 0) { - dmu_tx_abort(tx); - goto out; - } - - if (zp->z_blksz < PAGE_SIZE) { - for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { - tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; - va = zfs_map_page(ma[i], &sf); - dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); - zfs_unmap_page(sf); - } - } else { - err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); - } - - if (err == 0) { - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int count = 0; - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, - &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, - &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, - B_TRUE); - err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT0(err); - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); - - zfs_vmobject_wlock(object); - for (i = 0; i < ncount; i++) { - rtvals[i] = zfs_vm_pagerret_ok; - vm_page_undirty(ma[i]); - } - zfs_vmobject_wunlock(object); - VM_CNT_INC(v_vnodeout); - VM_CNT_ADD(v_vnodepgsout, ncount); - } - dmu_tx_commit(tx); - -out: - rangelock_exit(lr); - if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - return (rtvals[0]); -} - -int -zfs_freebsd_putpages(ap) - struct vop_putpages_args /* { - struct vnode *a_vp; - vm_page_t *a_m; - int a_count; - int a_sync; - int *a_rtvals; - } */ *ap; -{ - - return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, - ap->a_rtvals)); -} - -static int -zfs_freebsd_bmap(ap) - struct vop_bmap_args /* { - struct vnode *a_vp; - daddr_t a_bn; - struct bufobj **a_bop; - daddr_t *a_bnp; - int *a_runp; - int *a_runb; - } */ *ap; -{ - - if (ap->a_bop != NULL) - *ap->a_bop = &ap->a_vp->v_bufobj; - if (ap->a_bnp != NULL) - *ap->a_bnp = ap->a_bn; - if (ap->a_runp != NULL) - *ap->a_runp = 0; - if (ap->a_runb != NULL) - *ap->a_runb = 0; - - return (0); -} - -static int -zfs_freebsd_open(ap) - struct vop_open_args /* { - struct vnode *a_vp; - int a_mode; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - int error; - - error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); - if (error == 0) - vnode_create_vobject(vp, zp->z_size, ap->a_td); - return (error); -} - -static int -zfs_freebsd_close(ap) - struct vop_close_args /* { - struct vnode *a_vp; - int a_fflag; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - - return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); -} - -static int -zfs_freebsd_ioctl(ap) - struct vop_ioctl_args /* { - struct vnode *a_vp; - u_long a_command; - caddr_t a_data; - int a_fflag; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - - return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, - ap->a_fflag, ap->a_cred, NULL, NULL)); -} - -static int -ioflags(int ioflags) -{ - int flags = 0; - - if (ioflags & IO_APPEND) - flags |= FAPPEND; - if (ioflags & IO_NDELAY) - flags |= FNONBLOCK; - if (ioflags & IO_SYNC) - flags |= (FSYNC | FDSYNC | FRSYNC); - - return (flags); -} - -static int -zfs_freebsd_read(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - - return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), - ap->a_cred, NULL)); -} - -static int -zfs_freebsd_write(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - - return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), - 
ap->a_cred, NULL)); -} - -/* - * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see - * the comment above cache_fplookup for details. - */ -static int -zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v) -{ - vnode_t *vp; - znode_t *zp; - uint64_t pflags; - - vp = v->a_vp; - zp = VTOZ_SMR(vp); - if (__predict_false(zp == NULL)) - return (EAGAIN); - pflags = atomic_load_64(&zp->z_pflags); - if (pflags & ZFS_AV_QUARANTINED) - return (EAGAIN); - if (pflags & ZFS_XATTR) - return (EAGAIN); - if ((pflags & ZFS_NO_EXECS_DENIED) == 0) - return (EAGAIN); - return (0); -} - -static int -zfs_freebsd_access(ap) - struct vop_access_args /* { - struct vnode *a_vp; - accmode_t a_accmode; - struct ucred *a_cred; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - accmode_t accmode; - int error = 0; - - if (ap->a_accmode == VEXEC) { - if (zfs_freebsd_fastaccesschk_execute(ap->a_vp, ap->a_cred) == 0) - return (0); - } - - /* - * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, - */ - accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); - if (accmode != 0) - error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); - - /* - * VADMIN has to be handled by vaccess(). - */ - if (error == 0) { - accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); - if (accmode != 0) { - error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, - zp->z_gid, accmode, ap->a_cred, NULL); - } - } - - /* - * For VEXEC, ensure that at least one execute bit is set for - * non-directories. - */ - if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && - (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { - error = EACCES; - } - - return (error); -} - -static int -zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) -{ - struct componentname *cnp = ap->a_cnp; - char nm[NAME_MAX + 1]; - - ASSERT(cnp->cn_namelen < sizeof(nm)); - strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); - - return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, - cnp->cn_cred, cnp->cn_thread, 0, cached)); -} - -static int -zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) -{ - - return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); -} - -static int -zfs_cache_lookup(ap) - struct vop_lookup_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - } */ *ap; -{ - zfsvfs_t *zfsvfs; - - zfsvfs = ap->a_dvp->v_mount->mnt_data; - if (zfsvfs->z_use_namecache) - return (vfs_cache_lookup(ap)); - else - return (zfs_freebsd_lookup(ap, B_FALSE)); -} - -static int -zfs_freebsd_create(ap) - struct vop_create_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; -{ - zfsvfs_t *zfsvfs; - struct componentname *cnp = ap->a_cnp; - vattr_t *vap = ap->a_vap; - int error, mode; - - ASSERT(cnp->cn_flags & SAVENAME); - - vattr_init_mask(vap); - mode = vap->va_mode & ALLPERMS; - zfsvfs = ap->a_dvp->v_mount->mnt_data; - - error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, - ap->a_vpp, cnp->cn_cred, cnp->cn_thread); - if (zfsvfs->z_use_namecache && - error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) - cache_enter(ap->a_dvp, *ap->a_vpp, cnp); - return (error); -} - -static int -zfs_freebsd_remove(ap) - struct vop_remove_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; -{ - - ASSERT(ap->a_cnp->cn_flags & SAVENAME); - - return (zfs_remove(ap->a_dvp, ap->a_vp, 
ap->a_cnp->cn_nameptr, - ap->a_cnp->cn_cred)); -} - -static int -zfs_freebsd_mkdir(ap) - struct vop_mkdir_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - } */ *ap; -{ - vattr_t *vap = ap->a_vap; - - ASSERT(ap->a_cnp->cn_flags & SAVENAME); - - vattr_init_mask(vap); - - return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, - ap->a_cnp->cn_cred)); -} - -static int -zfs_freebsd_rmdir(ap) - struct vop_rmdir_args /* { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - - ASSERT(cnp->cn_flags & SAVENAME); - - return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); -} - -static int -zfs_freebsd_readdir(ap) - struct vop_readdir_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - int *a_eofflag; - int *a_ncookies; - u_long **a_cookies; - } */ *ap; -{ - - return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, - ap->a_ncookies, ap->a_cookies)); -} - -static int -zfs_freebsd_fsync(ap) - struct vop_fsync_args /* { - struct vnode *a_vp; - int a_waitfor; - struct thread *a_td; - } */ *ap; -{ - - vop_stdfsync(ap); - return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); -} - -static int -zfs_freebsd_getattr(ap) - struct vop_getattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - } */ *ap; -{ - vattr_t *vap = ap->a_vap; - xvattr_t xvap; - u_long fflags = 0; - int error; - - xva_init(&xvap); - xvap.xva_vattr = *vap; - xvap.xva_vattr.va_mask |= AT_XVATTR; - - /* Convert chflags into ZFS-type flags. */ - /* XXX: what about SF_SETTABLE?. */ - XVA_SET_REQ(&xvap, XAT_IMMUTABLE); - XVA_SET_REQ(&xvap, XAT_APPENDONLY); - XVA_SET_REQ(&xvap, XAT_NOUNLINK); - XVA_SET_REQ(&xvap, XAT_NODUMP); - XVA_SET_REQ(&xvap, XAT_READONLY); - XVA_SET_REQ(&xvap, XAT_ARCHIVE); - XVA_SET_REQ(&xvap, XAT_SYSTEM); - XVA_SET_REQ(&xvap, XAT_HIDDEN); - XVA_SET_REQ(&xvap, XAT_REPARSE); - XVA_SET_REQ(&xvap, XAT_OFFLINE); - XVA_SET_REQ(&xvap, XAT_SPARSE); - - error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); - if (error != 0) - return (error); - - /* Convert ZFS xattr into chflags. 
*/ -#define FLAG_CHECK(fflag, xflag, xfield) do { \ - if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ - fflags |= (fflag); \ -} while (0) - FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, - xvap.xva_xoptattrs.xoa_immutable); - FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, - xvap.xva_xoptattrs.xoa_appendonly); - FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, - xvap.xva_xoptattrs.xoa_nounlink); - FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, - xvap.xva_xoptattrs.xoa_archive); - FLAG_CHECK(UF_NODUMP, XAT_NODUMP, - xvap.xva_xoptattrs.xoa_nodump); - FLAG_CHECK(UF_READONLY, XAT_READONLY, - xvap.xva_xoptattrs.xoa_readonly); - FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, - xvap.xva_xoptattrs.xoa_system); - FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, - xvap.xva_xoptattrs.xoa_hidden); - FLAG_CHECK(UF_REPARSE, XAT_REPARSE, - xvap.xva_xoptattrs.xoa_reparse); - FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, - xvap.xva_xoptattrs.xoa_offline); - FLAG_CHECK(UF_SPARSE, XAT_SPARSE, - xvap.xva_xoptattrs.xoa_sparse); - -#undef FLAG_CHECK - *vap = xvap.xva_vattr; - vap->va_flags = fflags; - return (0); -} - -static int -zfs_freebsd_setattr(ap) - struct vop_setattr_args /* { - struct vnode *a_vp; - struct vattr *a_vap; - struct ucred *a_cred; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - vattr_t *vap = ap->a_vap; - cred_t *cred = ap->a_cred; - xvattr_t xvap; - u_long fflags; - uint64_t zflags; - - vattr_init_mask(vap); - vap->va_mask &= ~AT_NOSET; - - xva_init(&xvap); - xvap.xva_vattr = *vap; - - zflags = VTOZ(vp)->z_pflags; - - if (vap->va_flags != VNOVAL) { - zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; - int error; - - if (zfsvfs->z_use_fuids == B_FALSE) - return (EOPNOTSUPP); - - fflags = vap->va_flags; - /* - * XXX KDM - * We need to figure out whether it makes sense to allow - * UF_REPARSE through, since we don't really have other - * facilities to handle reparse points and zfs_setattr() - * doesn't currently allow setting that attribute anyway. - */ - if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| - UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| - UF_OFFLINE|UF_SPARSE)) != 0) - return (EOPNOTSUPP); - /* - * Unprivileged processes are not permitted to unset system - * flags, or modify flags if any system flags are set. - * Privileged non-jail processes may not modify system flags - * if securelevel > 0 and any existing system flags are set. - * Privileged jail processes behave like privileged non-jail - * processes if the PR_ALLOW_CHFLAGS permission bit is set; - * otherwise, they behave like unprivileged processes. - */ - if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || - priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { - if (zflags & - (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { - error = securelevel_gt(cred, 0); - if (error != 0) - return (error); - } - } else { - /* - * Callers may only modify the file flags on objects they - * have VADMIN rights for. - */ - if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) - return (error); - if (zflags & - (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { - return (EPERM); - } - if (fflags & - (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { - return (EPERM); - } - } - -#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ - if (((fflags & (fflag)) && !(zflags & (zflag))) || \ - ((zflags & (zflag)) && !(fflags & (fflag)))) { \ - XVA_SET_REQ(&xvap, (xflag)); \ - (xfield) = ((fflags & (fflag)) != 0); \ - } \ -} while (0) - /* Convert chflags into ZFS-type flags. */ - /* XXX: what about SF_SETTABLE?. 
*/ - FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, - xvap.xva_xoptattrs.xoa_immutable); - FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, - xvap.xva_xoptattrs.xoa_appendonly); - FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, - xvap.xva_xoptattrs.xoa_nounlink); - FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, - xvap.xva_xoptattrs.xoa_archive); - FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, - xvap.xva_xoptattrs.xoa_nodump); - FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, - xvap.xva_xoptattrs.xoa_readonly); - FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, - xvap.xva_xoptattrs.xoa_system); - FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, - xvap.xva_xoptattrs.xoa_hidden); - FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, - xvap.xva_xoptattrs.xoa_reparse); - FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, - xvap.xva_xoptattrs.xoa_offline); - FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, - xvap.xva_xoptattrs.xoa_sparse); -#undef FLAG_CHANGE - } - if (vap->va_birthtime.tv_sec != VNOVAL) { - xvap.xva_vattr.va_mask |= AT_XVATTR; - XVA_SET_REQ(&xvap, XAT_CREATETIME); - } - return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); -} - -static int -zfs_freebsd_rename(ap) - struct vop_rename_args /* { - struct vnode *a_fdvp; - struct vnode *a_fvp; - struct componentname *a_fcnp; - struct vnode *a_tdvp; - struct vnode *a_tvp; - struct componentname *a_tcnp; - } */ *ap; -{ - vnode_t *fdvp = ap->a_fdvp; - vnode_t *fvp = ap->a_fvp; - vnode_t *tdvp = ap->a_tdvp; - vnode_t *tvp = ap->a_tvp; - int error; - - ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); - ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); - - error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, - ap->a_tcnp, ap->a_fcnp->cn_cred); - - vrele(fdvp); - vrele(fvp); - vrele(tdvp); - if (tvp != NULL) - vrele(tvp); - - return (error); -} - -static int -zfs_freebsd_symlink(ap) - struct vop_symlink_args /* { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vattr *a_vap; - char *a_target; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - vattr_t *vap = ap->a_vap; - - ASSERT(cnp->cn_flags & SAVENAME); - - vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ - vattr_init_mask(vap); - - return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, - __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread)); -} - -static int -zfs_freebsd_readlink(ap) - struct vop_readlink_args /* { - struct vnode *a_vp; - struct uio *a_uio; - struct ucred *a_cred; - } */ *ap; -{ - - return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); -} - -static int -zfs_freebsd_link(ap) - struct vop_link_args /* { - struct vnode *a_tdvp; - struct vnode *a_vp; - struct componentname *a_cnp; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - vnode_t *vp = ap->a_vp; - vnode_t *tdvp = ap->a_tdvp; - - if (tdvp->v_mount != vp->v_mount) - return (EXDEV); - - ASSERT(cnp->cn_flags & SAVENAME); - - return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); -} - -static int -zfs_freebsd_inactive(ap) - struct vop_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - - zfs_inactive(vp, ap->a_td->td_ucred, NULL); - return (0); -} - -static int -zfs_freebsd_need_inactive(ap) - struct vop_need_inactive_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int need; - - if (vn_need_pageq_flush(vp)) - return (1); - - if (!ZFS_TRYRLOCK_TEARDOWN_INACTIVE(zfsvfs)) - return (1); - need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - - return (need); -} - -static int -zfs_freebsd_reclaim(ap) - struct vop_reclaim_args /* { - struct vnode *a_vp; - struct thread *a_td; - } */ *ap; -{ - vnode_t *vp = ap->a_vp; - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ASSERT(zp != NULL); - - /* - * z_teardown_inactive_lock protects from a race with - * zfs_znode_dmu_fini in zfsvfs_teardown during - * force unmount. - */ - ZFS_RLOCK_TEARDOWN_INACTIVE(zfsvfs); - if (zp->z_sa_hdl == NULL) - zfs_znode_free(zp); - else - zfs_zinactive(zp); - ZFS_RUNLOCK_TEARDOWN_INACTIVE(zfsvfs); - - vp->v_data = NULL; - return (0); -} - -static int -zfs_freebsd_fid(ap) - struct vop_fid_args /* { - struct vnode *a_vp; - struct fid *a_fid; - } */ *ap; -{ - - return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); -} - -static int -zfs_freebsd_pathconf(ap) - struct vop_pathconf_args /* { - struct vnode *a_vp; - int a_name; - register_t *a_retval; - } */ *ap; -{ - ulong_t val; - int error; - - error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); - if (error == 0) { - *ap->a_retval = val; - return (error); - } - if (error != EOPNOTSUPP) - return (error); - - switch (ap->a_name) { - case _PC_NAME_MAX: - *ap->a_retval = NAME_MAX; - return (0); - case _PC_PIPE_BUF: - if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { - *ap->a_retval = PIPE_BUF; - return (0); - } - return (EINVAL); - default: - return (vop_stdpathconf(ap)); - } -} - -/* - * FreeBSD's extended attributes namespace defines file name prefix for ZFS' - * extended attribute name: - * - * NAMESPACE PREFIX - * system freebsd:system: - * user (none, can be used to access ZFS fsattr(5) attributes - * created on Solaris) - */ -static int -zfs_create_attrname(int attrnamespace, const char *name, char *attrname, - size_t size) -{ - const char *namespace, *prefix, *suffix; - - /* We don't allow '/' character in attribute name. */ - if (strchr(name, '/') != NULL) - return (EINVAL); - /* We don't allow attribute names that start with "freebsd:" string. 
*/ - if (strncmp(name, "freebsd:", 8) == 0) - return (EINVAL); - - bzero(attrname, size); - - switch (attrnamespace) { - case EXTATTR_NAMESPACE_USER: -#if 0 - prefix = "freebsd:"; - namespace = EXTATTR_NAMESPACE_USER_STRING; - suffix = ":"; -#else - /* - * This is the default namespace by which we can access all - * attributes created on Solaris. - */ - prefix = namespace = suffix = ""; -#endif - break; - case EXTATTR_NAMESPACE_SYSTEM: - prefix = "freebsd:"; - namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; - suffix = ":"; - break; - case EXTATTR_NAMESPACE_EMPTY: - default: - return (EINVAL); - } - if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, - name) >= size) { - return (ENAMETOOLONG); - } - return (0); -} - -/* - * Vnode operating to retrieve a named extended attribute. - */ -static int -zfs_getextattr(struct vop_getextattr_args *ap) -/* -vop_getextattr { - IN struct vnode *a_vp; - IN int a_attrnamespace; - IN const char *a_name; - INOUT struct uio *a_uio; - OUT size_t *a_size; - IN struct ucred *a_cred; - IN struct thread *a_td; -}; -*/ -{ - zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; - struct thread *td = ap->a_td; - struct nameidata nd; - char attrname[255]; - struct vattr va; - vnode_t *xvp = NULL, *vp; - int error, flags; - - error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, - ap->a_cred, ap->a_td, VREAD); - if (error != 0) - return (error); - - error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, - sizeof(attrname)); - if (error != 0) - return (error); - - ZFS_ENTER(zfsvfs); - - error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, - LOOKUP_XATTR, B_FALSE); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - flags = FREAD; - NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, - xvp, td); - error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error != 0) { - ZFS_EXIT(zfsvfs); - if (error == ENOENT) - error = ENOATTR; - return (error); - } - - if (ap->a_size != NULL) { - error = VOP_GETATTR(vp, &va, ap->a_cred); - if (error == 0) - *ap->a_size = (size_t)va.va_size; - } else if (ap->a_uio != NULL) - error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); - - VOP_UNLOCK(vp); - vn_close(vp, flags, ap->a_cred, td); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Vnode operation to remove a named attribute. 
- */ -int -zfs_deleteextattr(struct vop_deleteextattr_args *ap) -/* -vop_deleteextattr { - IN struct vnode *a_vp; - IN int a_attrnamespace; - IN const char *a_name; - IN struct ucred *a_cred; - IN struct thread *a_td; -}; -*/ -{ - zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; - struct thread *td = ap->a_td; - struct nameidata nd; - char attrname[255]; - struct vattr va; - vnode_t *xvp = NULL, *vp; - int error, flags; - - error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, - ap->a_cred, ap->a_td, VWRITE); - if (error != 0) - return (error); - - error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, - sizeof(attrname)); - if (error != 0) - return (error); - - ZFS_ENTER(zfsvfs); - - error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, - LOOKUP_XATTR, B_FALSE); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, - UIO_SYSSPACE, attrname, xvp, td); - error = namei(&nd); - vp = nd.ni_vp; - if (error != 0) { - ZFS_EXIT(zfsvfs); - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error == ENOENT) - error = ENOATTR; - return (error); - } - - error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); - NDFREE(&nd, NDF_ONLY_PNBUF); - - vput(nd.ni_dvp); - if (vp == nd.ni_dvp) - vrele(vp); - else - vput(vp); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Vnode operation to set a named attribute. - */ -static int -zfs_setextattr(struct vop_setextattr_args *ap) -/* -vop_setextattr { - IN struct vnode *a_vp; - IN int a_attrnamespace; - IN const char *a_name; - INOUT struct uio *a_uio; - IN struct ucred *a_cred; - IN struct thread *a_td; -}; -*/ -{ - zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; - struct thread *td = ap->a_td; - struct nameidata nd; - char attrname[255]; - struct vattr va; - vnode_t *xvp = NULL, *vp; - int error, flags; - - error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, - ap->a_cred, ap->a_td, VWRITE); - if (error != 0) - return (error); - - error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, - sizeof(attrname)); - if (error != 0) - return (error); - - ZFS_ENTER(zfsvfs); - - error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, - LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - flags = FFLAGS(O_WRONLY | O_CREAT); - NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, - xvp, td); - error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, - NULL); - vp = nd.ni_vp; - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - VATTR_NULL(&va); - va.va_size = 0; - error = VOP_SETATTR(vp, &va, ap->a_cred); - if (error == 0) - VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); - - VOP_UNLOCK(vp); - vn_close(vp, flags, ap->a_cred, td); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* - * Vnode operation to retrieve extended attributes on a vnode. 
- */ -static int -zfs_listextattr(struct vop_listextattr_args *ap) -/* -vop_listextattr { - IN struct vnode *a_vp; - IN int a_attrnamespace; - INOUT struct uio *a_uio; - OUT size_t *a_size; - IN struct ucred *a_cred; - IN struct thread *a_td; -}; -*/ -{ - zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; - struct thread *td = ap->a_td; - struct nameidata nd; - char attrprefix[16]; - u_char dirbuf[sizeof(struct dirent)]; - struct dirent *dp; - struct iovec aiov; - struct uio auio, *uio = ap->a_uio; - size_t *sizep = ap->a_size; - size_t plen; - vnode_t *xvp = NULL, *vp; - int done, error, eof, pos; - - error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, - ap->a_cred, ap->a_td, VREAD); - if (error != 0) - return (error); - - error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, - sizeof(attrprefix)); - if (error != 0) - return (error); - plen = strlen(attrprefix); - - ZFS_ENTER(zfsvfs); - - if (sizep != NULL) - *sizep = 0; - - error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, - LOOKUP_XATTR, B_FALSE); - if (error != 0) { - ZFS_EXIT(zfsvfs); - /* - * ENOATTR means that the EA directory does not yet exist, - * i.e. there are no extended attributes there. - */ - if (error == ENOATTR) - error = 0; - return (error); - } - - NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, - UIO_SYSSPACE, ".", xvp, td); - error = namei(&nd); - vp = nd.ni_vp; - NDFREE(&nd, NDF_ONLY_PNBUF); - if (error != 0) { - ZFS_EXIT(zfsvfs); - return (error); - } - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_td = td; - auio.uio_rw = UIO_READ; - auio.uio_offset = 0; - - do { - u_char nlen; - - aiov.iov_base = (void *)dirbuf; - aiov.iov_len = sizeof(dirbuf); - auio.uio_resid = sizeof(dirbuf); - error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); - done = sizeof(dirbuf) - auio.uio_resid; - if (error != 0) - break; - for (pos = 0; pos < done;) { - dp = (struct dirent *)(dirbuf + pos); - pos += dp->d_reclen; - /* - * XXX: Temporarily we also accept DT_UNKNOWN, as this - * is what we get when attribute was created on Solaris. - */ - if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) - continue; - if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) - continue; - else if (strncmp(dp->d_name, attrprefix, plen) != 0) - continue; - nlen = dp->d_namlen - plen; - if (sizep != NULL) - *sizep += 1 + nlen; - else if (uio != NULL) { - /* - * Format of extattr name entry is one byte for - * length and the rest for name. 
- */ - error = uiomove(&nlen, 1, uio->uio_rw, uio); - if (error == 0) { - error = uiomove(dp->d_name + plen, nlen, - uio->uio_rw, uio); - } - if (error != 0) - break; - } - } - } while (!eof && error == 0); - - vput(vp); - ZFS_EXIT(zfsvfs); - - return (error); -} - -int -zfs_freebsd_getacl(ap) - struct vop_getacl_args /* { - struct vnode *vp; - acl_type_t type; - struct acl *aclp; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - int error; - vsecattr_t vsecattr; - - if (ap->a_type != ACL_TYPE_NFS4) - return (EINVAL); - - vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; - if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) - return (error); - - error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); - if (vsecattr.vsa_aclentp != NULL) - kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); - - return (error); -} - -int -zfs_freebsd_setacl(ap) - struct vop_setacl_args /* { - struct vnode *vp; - acl_type_t type; - struct acl *aclp; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - int error; - vsecattr_t vsecattr; - int aclbsize; /* size of acl list in bytes */ - aclent_t *aaclp; - - if (ap->a_type != ACL_TYPE_NFS4) - return (EINVAL); - - if (ap->a_aclp == NULL) - return (EINVAL); - - if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) - return (EINVAL); - - /* - * With NFSv4 ACLs, chmod(2) may need to add additional entries, - * splitting every entry into two and appending "canonical six" - * entries at the end. Don't allow for setting an ACL that would - * cause chmod(2) to run out of ACL entries. - */ - if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) - return (ENOSPC); - - error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); - if (error != 0) - return (error); - - vsecattr.vsa_mask = VSA_ACE; - aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); - vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); - aaclp = vsecattr.vsa_aclentp; - vsecattr.vsa_aclentsz = aclbsize; - - aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); - error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); - kmem_free(aaclp, aclbsize); - - return (error); -} - -int -zfs_freebsd_aclcheck(ap) - struct vop_aclcheck_args /* { - struct vnode *vp; - acl_type_t type; - struct acl *aclp; - struct ucred *cred; - struct thread *td; - } */ *ap; -{ - - return (EOPNOTSUPP); -} - -static int -zfs_vptocnp(struct vop_vptocnp_args *ap) -{ - vnode_t *covered_vp; - vnode_t *vp = ap->a_vp;; - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - znode_t *zp = VTOZ(vp); - enum vgetstate vs; - int ltype; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * If we are a snapshot mounted under .zfs, run the operation - * on the covered vnode. 
- */ - if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { - char name[MAXNAMLEN + 1]; - znode_t *dzp; - size_t len; - - error = zfs_znode_parent_and_name(zp, &dzp, name); - if (error == 0) { - len = strlen(name); - if (*ap->a_buflen < len) - error = SET_ERROR(ENOMEM); - } - if (error == 0) { - *ap->a_buflen -= len; - bcopy(name, ap->a_buf + *ap->a_buflen, len); - *ap->a_vpp = ZTOV(dzp); - } - ZFS_EXIT(zfsvfs); - return (error); - } - ZFS_EXIT(zfsvfs); - - covered_vp = vp->v_mount->mnt_vnodecovered; - vs = vget_prep(covered_vp); - ltype = VOP_ISLOCKED(vp); - VOP_UNLOCK(vp); - error = vget_finish(covered_vp, LK_SHARED, vs); - if (error == 0) { - error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, - ap->a_buf, ap->a_buflen); - vput(covered_vp); - } - vn_lock(vp, ltype | LK_RETRY); - if (VN_IS_DOOMED(vp)) - error = SET_ERROR(ENOENT); - return (error); -} - -#ifdef DIAGNOSTIC -static int -zfs_lock(ap) - struct vop_lock1_args /* { - struct vnode *a_vp; - int a_flags; - char *file; - int line; - } */ *ap; -{ - vnode_t *vp; - znode_t *zp; - int err; - - err = vop_lock(ap); - if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { - vp = ap->a_vp; - zp = vp->v_data; - if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) && - zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) - VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); - } - return (err); -} -#endif - -struct vop_vector zfs_vnodeops; -struct vop_vector zfs_fifoops; -struct vop_vector zfs_shareops; - -struct vop_vector zfs_vnodeops = { - .vop_default = &default_vnodeops, - .vop_inactive = zfs_freebsd_inactive, - .vop_need_inactive = zfs_freebsd_need_inactive, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, - .vop_access = zfs_freebsd_access, - .vop_allocate = VOP_EINVAL, - .vop_lookup = zfs_cache_lookup, - .vop_cachedlookup = zfs_freebsd_cachedlookup, - .vop_getattr = zfs_freebsd_getattr, - .vop_setattr = zfs_freebsd_setattr, - .vop_create = zfs_freebsd_create, - .vop_mknod = zfs_freebsd_create, - .vop_mkdir = zfs_freebsd_mkdir, - .vop_readdir = zfs_freebsd_readdir, - .vop_fsync = zfs_freebsd_fsync, - .vop_open = zfs_freebsd_open, - .vop_close = zfs_freebsd_close, - .vop_rmdir = zfs_freebsd_rmdir, - .vop_ioctl = zfs_freebsd_ioctl, - .vop_link = zfs_freebsd_link, - .vop_symlink = zfs_freebsd_symlink, - .vop_readlink = zfs_freebsd_readlink, - .vop_read = zfs_freebsd_read, - .vop_write = zfs_freebsd_write, - .vop_remove = zfs_freebsd_remove, - .vop_rename = zfs_freebsd_rename, - .vop_pathconf = zfs_freebsd_pathconf, - .vop_bmap = zfs_freebsd_bmap, - .vop_fid = zfs_freebsd_fid, - .vop_getextattr = zfs_getextattr, - .vop_deleteextattr = zfs_deleteextattr, - .vop_setextattr = zfs_setextattr, - .vop_listextattr = zfs_listextattr, - .vop_getacl = zfs_freebsd_getacl, - .vop_setacl = zfs_freebsd_setacl, - .vop_aclcheck = zfs_freebsd_aclcheck, - .vop_getpages = zfs_freebsd_getpages, - .vop_putpages = zfs_freebsd_putpages, - .vop_vptocnp = zfs_vptocnp, -#ifdef DIAGNOSTIC - .vop_lock1 = zfs_lock, -#else - .vop_lock1 = vop_lock, -#endif - .vop_unlock = vop_unlock, - .vop_islocked = vop_islocked, -}; -VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); - -struct vop_vector zfs_fifoops = { - .vop_default = &fifo_specops, - .vop_fsync = zfs_freebsd_fsync, - .vop_access = zfs_freebsd_access, - .vop_getattr = zfs_freebsd_getattr, - .vop_inactive = zfs_freebsd_inactive, - .vop_read = VOP_PANIC, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_setattr = zfs_freebsd_setattr, - .vop_write = VOP_PANIC, - .vop_pathconf = 
zfs_freebsd_pathconf, - .vop_fid = zfs_freebsd_fid, - .vop_getacl = zfs_freebsd_getacl, - .vop_setacl = zfs_freebsd_setacl, - .vop_aclcheck = zfs_freebsd_aclcheck, -}; -VFS_VOP_VECTOR_REGISTER(zfs_fifoops); - -/* - * special share hidden files vnode operations template - */ -struct vop_vector zfs_shareops = { - .vop_default = &default_vnodeops, - .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, - .vop_access = zfs_freebsd_access, - .vop_inactive = zfs_freebsd_inactive, - .vop_reclaim = zfs_freebsd_reclaim, - .vop_fid = zfs_freebsd_fid, - .vop_pathconf = zfs_freebsd_pathconf, -}; -VFS_VOP_VECTOR_REGISTER(zfs_shareops); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ /dev/null @@ -1,2388 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2007 Jeremy Teo */ -/* Portions Copyright 2011 Martin Matuska */ - -#ifdef _KERNEL -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif /* _KERNEL */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zfs_prop.h" -#include "zfs_comutil.h" - -/* Used by fstat(1). */ -SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)"); - -/* - * Define ZNODE_STATS to turn on statistic gathering. By default, it is only - * turned on when DEBUG is also defined. - */ -#ifdef DEBUG -#define ZNODE_STATS -#endif /* DEBUG */ - -#ifdef ZNODE_STATS -#define ZNODE_STAT_ADD(stat) ((stat)++) -#else -#define ZNODE_STAT_ADD(stat) /* nothing */ -#endif /* ZNODE_STATS */ - -/* - * Functions needed for userland (ie: libzpool) are not put under - * #ifdef_KERNEL; the rest of the functions have dependencies - * (such as VFS logic) that will not compile easily in userland. - */ -#ifdef _KERNEL -/* - * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to - * be freed before it can be safely accessed. 
- */ -krwlock_t zfsvfs_lock; - -#if defined(_KERNEL) && !defined(KMEM_DEBUG) -#define _ZFS_USE_SMR -static uma_zone_t znode_uma_zone; -#else -static kmem_cache_t *znode_cache = NULL; -#endif - -/*ARGSUSED*/ -static void -znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) -{ - /* - * We should never drop all dbuf refs without first clearing - * the eviction callback. - */ - panic("evicting znode %p\n", user_ptr); -} - -extern struct vop_vector zfs_vnodeops; -extern struct vop_vector zfs_fifoops; -extern struct vop_vector zfs_shareops; - -/* - * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on - * z_rangelock. It will modify the offset and length of the lock to reflect - * znode-specific information, and convert RL_APPEND to RL_WRITER. This is - * called with the rangelock_t's rl_lock held, which avoids races. - */ -static void -zfs_rangelock_cb(locked_range_t *new, void *arg) -{ - znode_t *zp = arg; - - /* - * If in append mode, convert to writer and lock starting at the - * current end of file. - */ - if (new->lr_type == RL_APPEND) { - new->lr_offset = zp->z_size; - new->lr_type = RL_WRITER; - } - - /* - * If we need to grow the block size then lock the whole file range. - */ - uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); - if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || - zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { - new->lr_offset = 0; - new->lr_length = UINT64_MAX; - } -} - -/*ARGSUSED*/ -static int -zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) -{ - znode_t *zp = buf; - - POINTER_INVALIDATE(&zp->z_zfsvfs); - - list_link_init(&zp->z_link_node); - - mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); - - rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); - - zp->z_acl_cached = NULL; - zp->z_vnode = NULL; - zp->z_moved = 0; - return (0); -} - -/*ARGSUSED*/ -static void -zfs_znode_cache_destructor(void *buf, void *arg) -{ - znode_t *zp = buf; - - ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); - ASSERT3P(zp->z_vnode, ==, NULL); - ASSERT(!list_link_active(&zp->z_link_node)); - mutex_destroy(&zp->z_acl_lock); - rangelock_fini(&zp->z_rangelock); - - ASSERT(zp->z_acl_cached == NULL); -} - -#ifdef ZNODE_STATS -static struct { - uint64_t zms_zfsvfs_invalid; - uint64_t zms_zfsvfs_recheck1; - uint64_t zms_zfsvfs_unmounted; - uint64_t zms_zfsvfs_recheck2; - uint64_t zms_obj_held; - uint64_t zms_vnode_locked; - uint64_t zms_not_only_dnlc; -} znode_move_stats; -#endif /* ZNODE_STATS */ - -#ifdef illumos -static void -zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) -{ - vnode_t *vp; - - /* Copy fields. */ - nzp->z_zfsvfs = ozp->z_zfsvfs; - - /* Swap vnodes. 
*/ - vp = nzp->z_vnode; - nzp->z_vnode = ozp->z_vnode; - ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ - ZTOV(ozp)->v_data = ozp; - ZTOV(nzp)->v_data = nzp; - - nzp->z_id = ozp->z_id; - ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ - nzp->z_unlinked = ozp->z_unlinked; - nzp->z_atime_dirty = ozp->z_atime_dirty; - nzp->z_zn_prefetch = ozp->z_zn_prefetch; - nzp->z_blksz = ozp->z_blksz; - nzp->z_seq = ozp->z_seq; - nzp->z_mapcnt = ozp->z_mapcnt; - nzp->z_gen = ozp->z_gen; - nzp->z_sync_cnt = ozp->z_sync_cnt; - nzp->z_is_sa = ozp->z_is_sa; - nzp->z_sa_hdl = ozp->z_sa_hdl; - bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2); - nzp->z_links = ozp->z_links; - nzp->z_size = ozp->z_size; - nzp->z_pflags = ozp->z_pflags; - nzp->z_uid = ozp->z_uid; - nzp->z_gid = ozp->z_gid; - nzp->z_mode = ozp->z_mode; - - /* - * Since this is just an idle znode and kmem is already dealing with - * memory pressure, release any cached ACL. - */ - if (ozp->z_acl_cached) { - zfs_acl_free(ozp->z_acl_cached); - ozp->z_acl_cached = NULL; - } - - sa_set_userp(nzp->z_sa_hdl, nzp); - - /* - * Invalidate the original znode by clearing fields that provide a - * pointer back to the znode. Set the low bit of the vfs pointer to - * ensure that zfs_znode_move() recognizes the znode as invalid in any - * subsequent callback. - */ - ozp->z_sa_hdl = NULL; - POINTER_INVALIDATE(&ozp->z_zfsvfs); - - /* - * Mark the znode. - */ - nzp->z_moved = 1; - ozp->z_moved = (uint8_t)-1; -} - -/*ARGSUSED*/ -static kmem_cbrc_t -zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) -{ - znode_t *ozp = buf, *nzp = newbuf; - zfsvfs_t *zfsvfs; - vnode_t *vp; - - /* - * The znode is on the file system's list of known znodes if the vfs - * pointer is valid. We set the low bit of the vfs pointer when freeing - * the znode to invalidate it, and the memory patterns written by kmem - * (baddcafe and deadbeef) set at least one of the two low bits. A newly - * created znode sets the vfs pointer last of all to indicate that the - * znode is known and in a valid state to be moved by this function. - */ - zfsvfs = ozp->z_zfsvfs; - if (!POINTER_IS_VALID(zfsvfs)) { - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * Close a small window in which it's possible that the filesystem could - * be unmounted and freed, and zfsvfs, though valid in the previous - * statement, could point to unrelated memory by the time we try to - * prevent the filesystem from being unmounted. - */ - rw_enter(&zfsvfs_lock, RW_WRITER); - if (zfsvfs != ozp->z_zfsvfs) { - rw_exit(&zfsvfs_lock); - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * If the znode is still valid, then so is the file system. We know that - * no valid file system can be freed while we hold zfsvfs_lock, so we - * can safely ensure that the filesystem is not and will not be - * unmounted. The next statement is equivalent to ZFS_ENTER(). - */ - rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); - if (zfsvfs->z_unmounted) { - ZFS_EXIT(zfsvfs); - rw_exit(&zfsvfs_lock); - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); - return (KMEM_CBRC_DONT_KNOW); - } - rw_exit(&zfsvfs_lock); - - mutex_enter(&zfsvfs->z_znodes_lock); - /* - * Recheck the vfs pointer in case the znode was removed just before - * acquiring the lock. 
- */ - if (zfsvfs != ozp->z_zfsvfs) { - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); - return (KMEM_CBRC_DONT_KNOW); - } - - /* - * At this point we know that as long as we hold z_znodes_lock, the - * znode cannot be freed and fields within the znode can be safely - * accessed. Now, prevent a race with zfs_zget(). - */ - if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); - return (KMEM_CBRC_LATER); - } - - vp = ZTOV(ozp); - if (mutex_tryenter(&vp->v_lock) == 0) { - ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); - return (KMEM_CBRC_LATER); - } - - /* Only move znodes that are referenced _only_ by the DNLC. */ - if (vp->v_count != 1 || !vn_in_dnlc(vp)) { - mutex_exit(&vp->v_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); - return (KMEM_CBRC_LATER); - } - - /* - * The znode is known and in a valid state to move. We're holding the - * locks needed to execute the critical section. - */ - zfs_znode_move_impl(ozp, nzp); - mutex_exit(&vp->v_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); - - list_link_replace(&ozp->z_link_node, &nzp->z_link_node); - mutex_exit(&zfsvfs->z_znodes_lock); - ZFS_EXIT(zfsvfs); - - return (KMEM_CBRC_YES); -} -#endif /* illumos */ - -#ifdef _ZFS_USE_SMR -VFS_SMR_DECLARE; - -static int -zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private, int flags) -{ - - return (zfs_znode_cache_constructor(mem, private, flags)); -} - -static void -zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private) -{ - - zfs_znode_cache_destructor(mem, private); -} - -void -zfs_znode_init(void) -{ - /* - * Initialize zcache - */ - rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); - ASSERT(znode_uma_zone == NULL); - znode_uma_zone = uma_zcreate("zfs_znode_cache", - sizeof (znode_t), zfs_znode_cache_constructor_smr, - zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0); - VFS_SMR_ZONE_SET(znode_uma_zone); -} - -static znode_t * -zfs_znode_alloc_kmem(int flags) -{ - - return (uma_zalloc_smr(znode_uma_zone, flags)); -} - -static void -zfs_znode_free_kmem(znode_t *zp) -{ - - uma_zfree_smr(znode_uma_zone, zp); -} -#else -void -zfs_znode_init(void) -{ - /* - * Initialize zcache - */ - rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); - ASSERT(znode_cache == NULL); - znode_cache = kmem_cache_create("zfs_znode_cache", - sizeof (znode_t), 0, zfs_znode_cache_constructor, - zfs_znode_cache_destructor, NULL, NULL, NULL, 0); - kmem_cache_set_move(znode_cache, zfs_znode_move); -} - -static znode_t * -zfs_znode_alloc_kmem(int flags) -{ - - return (kmem_cache_alloc(znode_cache, flags)); -} - -static void -zfs_znode_free_kmem(znode_t *zp) -{ - - kmem_cache_free(znode_cache, zp); -} -#endif - -void -zfs_znode_fini(void) -{ -#ifdef illumos - /* - * Cleanup vfs & vnode ops - */ - zfs_remove_op_tables(); -#endif - - /* - * Cleanup zcache - */ -#ifdef _ZFS_USE_SMR - if (znode_uma_zone) { - uma_zdestroy(znode_uma_zone); - znode_uma_zone = NULL; - } -#else - if (znode_cache) { - kmem_cache_destroy(znode_cache); - znode_cache = NULL; - } -#endif - rw_destroy(&zfsvfs_lock); -} - -#ifdef illumos -struct vnodeops *zfs_dvnodeops; -struct vnodeops *zfs_fvnodeops; -struct vnodeops *zfs_symvnodeops; -struct vnodeops 
*zfs_xdvnodeops; -struct vnodeops *zfs_evnodeops; -struct vnodeops *zfs_sharevnodeops; - -void -zfs_remove_op_tables() -{ - /* - * Remove vfs ops - */ - ASSERT(zfsfstype); - (void) vfs_freevfsops_by_type(zfsfstype); - zfsfstype = 0; - - /* - * Remove vnode ops - */ - if (zfs_dvnodeops) - vn_freevnodeops(zfs_dvnodeops); - if (zfs_fvnodeops) - vn_freevnodeops(zfs_fvnodeops); - if (zfs_symvnodeops) - vn_freevnodeops(zfs_symvnodeops); - if (zfs_xdvnodeops) - vn_freevnodeops(zfs_xdvnodeops); - if (zfs_evnodeops) - vn_freevnodeops(zfs_evnodeops); - if (zfs_sharevnodeops) - vn_freevnodeops(zfs_sharevnodeops); - - zfs_dvnodeops = NULL; - zfs_fvnodeops = NULL; - zfs_symvnodeops = NULL; - zfs_xdvnodeops = NULL; - zfs_evnodeops = NULL; - zfs_sharevnodeops = NULL; -} - -extern const fs_operation_def_t zfs_dvnodeops_template[]; -extern const fs_operation_def_t zfs_fvnodeops_template[]; -extern const fs_operation_def_t zfs_xdvnodeops_template[]; -extern const fs_operation_def_t zfs_symvnodeops_template[]; -extern const fs_operation_def_t zfs_evnodeops_template[]; -extern const fs_operation_def_t zfs_sharevnodeops_template[]; - -int -zfs_create_op_tables() -{ - int error; - - /* - * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() - * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). - * In this case we just return as the ops vectors are already set up. - */ - if (zfs_dvnodeops) - return (0); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, - &zfs_dvnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, - &zfs_fvnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, - &zfs_symvnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, - &zfs_xdvnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, - &zfs_evnodeops); - if (error) - return (error); - - error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, - &zfs_sharevnodeops); - - return (error); -} -#endif /* illumos */ - -int -zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) -{ - zfs_acl_ids_t acl_ids; - vattr_t vattr; - znode_t *sharezp; - znode_t *zp; - int error; - - vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; - vattr.va_type = VDIR; - vattr.va_mode = S_IFDIR|0555; - vattr.va_uid = crgetuid(kcred); - vattr.va_gid = crgetgid(kcred); - - sharezp = zfs_znode_alloc_kmem(KM_SLEEP); - ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); - sharezp->z_moved = 0; - sharezp->z_unlinked = 0; - sharezp->z_atime_dirty = 0; - sharezp->z_zfsvfs = zfsvfs; - sharezp->z_is_sa = zfsvfs->z_use_sa; - - VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, - kcred, NULL, &acl_ids)); - zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); - ASSERT3P(zp, ==, sharezp); - POINTER_INVALIDATE(&sharezp->z_zfsvfs); - error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); - zfsvfs->z_shares_dir = sharezp->z_id; - - zfs_acl_ids_free(&acl_ids); - sa_handle_destroy(sharezp->z_sa_hdl); - zfs_znode_free_kmem(sharezp); - - return (error); -} - -/* - * define a couple of values we need available - * for both 64 and 32 bit environments. - */ -#ifndef NBITSMINOR64 -#define NBITSMINOR64 32 -#endif -#ifndef MAXMAJ64 -#define MAXMAJ64 0xffffffffUL -#endif -#ifndef MAXMIN64 -#define MAXMIN64 0xffffffffUL -#endif - -/* - * Create special expldev for ZFS private use. 
- * Can't use standard expldev since it doesn't do - * what we want. The standard expldev() takes a - * dev32_t in LP64 and expands it to a long dev_t. - * We need an interface that takes a dev32_t in ILP32 - * and expands it to a long dev_t. - */ -static uint64_t -zfs_expldev(dev_t dev) -{ - return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); -} -/* - * Special cmpldev for ZFS private use. - * Can't use standard cmpldev since it takes - * a long dev_t and compresses it to dev32_t in - * LP64. We need to do a compaction of a long dev_t - * to a dev32_t in ILP32. - */ -dev_t -zfs_cmpldev(uint64_t dev) -{ - return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); -} - -static void -zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, - dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) -{ - ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); - ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); - - ASSERT(zp->z_sa_hdl == NULL); - ASSERT(zp->z_acl_cached == NULL); - if (sa_hdl == NULL) { - VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, - SA_HDL_SHARED, &zp->z_sa_hdl)); - } else { - zp->z_sa_hdl = sa_hdl; - sa_set_userp(sa_hdl, zp); - } - - zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; - - /* - * Slap on VROOT if we are the root znode unless we are the root - * node of a snapshot mounted under .zfs. - */ - if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) - ZTOV(zp)->v_flag |= VROOT; - - vn_exists(ZTOV(zp)); -} - -void -zfs_znode_dmu_fini(znode_t *zp) -{ - ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || - zp->z_unlinked || - ZFS_TEARDOWN_INACTIVE_WLOCKED(zp->z_zfsvfs)); - - sa_handle_destroy(zp->z_sa_hdl); - zp->z_sa_hdl = NULL; -} - -static void -zfs_vnode_forget(vnode_t *vp) -{ - - /* copied from insmntque_stddtr */ - vp->v_data = NULL; - vp->v_op = &dead_vnodeops; - vgone(vp); - vput(vp); -} - -/* - * Construct a new znode/vnode and intialize. - * - * This does not do a call to dmu_set_user() that is - * up to the caller to do, in case you don't want to - * return the znode - */ -static znode_t * -zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, - dmu_object_type_t obj_type, sa_handle_t *hdl) -{ - znode_t *zp; - vnode_t *vp; - uint64_t mode; - uint64_t parent; - sa_bulk_attr_t bulk[9]; - int count = 0; - int error; - - zp = zfs_znode_alloc_kmem(KM_SLEEP); - -#ifndef _ZFS_USE_SMR - KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0, - ("%s: fast path lookup enabled without smr", __func__)); -#endif - - KASSERT(curthread->td_vp_reserved != NULL, - ("zfs_znode_alloc: getnewvnode without preallocated vnode")); - error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); - if (error != 0) { - zfs_znode_free_kmem(zp); - return (NULL); - } - zp->z_vnode = vp; - vp->v_data = zp; - - ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); - zp->z_moved = 0; - - /* - * Defer setting z_zfsvfs until the znode is ready to be a candidate for - * the zfs_znode_move() callback. 
- */ - zp->z_sa_hdl = NULL; - zp->z_unlinked = 0; - zp->z_atime_dirty = 0; - zp->z_mapcnt = 0; - zp->z_id = db->db_object; - zp->z_blksz = blksz; - zp->z_seq = 0x7A4653; - zp->z_sync_cnt = 0; - - vp = ZTOV(zp); - - zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &zp->z_links, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &zp->z_atime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &zp->z_uid, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &zp->z_gid, 8); - - if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) { - if (hdl == NULL) - sa_handle_destroy(zp->z_sa_hdl); - zfs_vnode_forget(vp); - zp->z_vnode = NULL; - zfs_znode_free_kmem(zp); - return (NULL); - } - - zp->z_mode = mode; - - vp->v_type = IFTOVT((mode_t)mode); - - switch (vp->v_type) { - case VDIR: - zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ - break; -#ifdef illumos - case VBLK: - case VCHR: - { - uint64_t rdev; - VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), - &rdev, sizeof (rdev)) == 0); - - vp->v_rdev = zfs_cmpldev(rdev); - } - break; -#endif - case VFIFO: -#ifdef illumos - case VSOCK: - case VDOOR: -#endif - vp->v_op = &zfs_fifoops; - break; - case VREG: - if (parent == zfsvfs->z_shares_dir) { - ASSERT(zp->z_uid == 0 && zp->z_gid == 0); - vp->v_op = &zfs_shareops; - } - break; -#ifdef illumos - case VLNK: - vn_setops(vp, zfs_symvnodeops); - break; - default: - vn_setops(vp, zfs_evnodeops); - break; -#endif - } - - mutex_enter(&zfsvfs->z_znodes_lock); - list_insert_tail(&zfsvfs->z_all_znodes, zp); - membar_producer(); - /* - * Everything else must be valid before assigning z_zfsvfs makes the - * znode eligible for zfs_znode_move(). - */ - zp->z_zfsvfs = zfsvfs; - mutex_exit(&zfsvfs->z_znodes_lock); - - /* - * Acquire vnode lock before making it available to the world. - */ - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - VN_LOCK_AREC(vp); - if (vp->v_type != VFIFO) - VN_LOCK_ASHARE(vp); - -#ifdef illumos - VFS_HOLD(zfsvfs->z_vfs); -#endif - return (zp); -} - -static uint64_t empty_xattr; -static uint64_t pad[4]; -static zfs_acl_phys_t acl_phys; -/* - * Create a new DMU object to hold a zfs znode. - * - * IN: dzp - parent directory for new znode - * vap - file attributes for new znode - * tx - dmu transaction id for zap operations - * cr - credentials of caller - * flag - flags: - * IS_ROOT_NODE - new object will be root - * IS_XATTR - new object is an attribute - * bonuslen - length of bonus buffer - * setaclp - File/Dir initial ACL - * fuidp - Tracks fuid allocation. 
- * - * OUT: zpp - allocated znode - * - */ -void -zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) -{ - uint64_t crtime[2], atime[2], mtime[2], ctime[2]; - uint64_t mode, size, links, parent, pflags; - uint64_t dzp_pflags = 0; - uint64_t rdev = 0; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - dmu_buf_t *db; - timestruc_t now; - uint64_t gen, obj; - int err; - int bonuslen; - int dnodesize; - sa_handle_t *sa_hdl; - dmu_object_type_t obj_type; - sa_bulk_attr_t *sa_attrs; - int cnt = 0; - zfs_acl_locator_cb_t locate = { 0 }; - - ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); - - if (zfsvfs->z_replay) { - obj = vap->va_nodeid; - now = vap->va_ctime; /* see zfs_replay_create() */ - gen = vap->va_nblocks; /* ditto */ - dnodesize = vap->va_fsid; /* ditto */ - } else { - obj = 0; - vfs_timestamp(&now); - gen = dmu_tx_get_txg(tx); - dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); - } - - if (dnodesize == 0) - dnodesize = DNODE_MIN_SIZE; - - obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; - bonuslen = (obj_type == DMU_OT_SA) ? - DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; - - /* - * Create a new DMU object. - */ - /* - * There's currently no mechanism for pre-reading the blocks that will - * be needed to allocate a new object, so we accept the small chance - * that there will be an i/o error and we will fail one of the - * assertions below. - */ - if (vap->va_type == VDIR) { - if (zfsvfs->z_replay) { - VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, - zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, dnodesize, tx)); - } else { - obj = zap_create_norm_dnsize(zfsvfs->z_os, - zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, dnodesize, tx); - } - } else { - if (zfsvfs->z_replay) { - VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, - DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, dnodesize, tx)); - } else { - obj = dmu_object_alloc_dnsize(zfsvfs->z_os, - DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, dnodesize, tx); - } - } - - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); - - /* - * If this is the root, fix up the half-initialized parent pointer - * to reference the just-allocated physical data area. - */ - if (flag & IS_ROOT_NODE) { - dzp->z_id = obj; - } else { - dzp_pflags = dzp->z_pflags; - } - - /* - * If parent is an xattr, so am I. - */ - if (dzp_pflags & ZFS_XATTR) { - flag |= IS_XATTR; - } - - if (zfsvfs->z_use_fuids) - pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; - else - pflags = 0; - - if (vap->va_type == VDIR) { - size = 2; /* contents ("." and "..") */ - links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; - } else { - size = links = 0; - } - - if (vap->va_type == VBLK || vap->va_type == VCHR) { - rdev = zfs_expldev(vap->va_rdev); - } - - parent = dzp->z_id; - mode = acl_ids->z_mode; - if (flag & IS_XATTR) - pflags |= ZFS_XATTR; - - /* - * No execs denied will be deterimed when zfs_mode_compute() is called. 
- */ - pflags |= acl_ids->z_aclp->z_hints & - (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| - ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); - - ZFS_TIME_ENCODE(&now, crtime); - ZFS_TIME_ENCODE(&now, ctime); - - if (vap->va_mask & AT_ATIME) { - ZFS_TIME_ENCODE(&vap->va_atime, atime); - } else { - ZFS_TIME_ENCODE(&now, atime); - } - - if (vap->va_mask & AT_MTIME) { - ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - } else { - ZFS_TIME_ENCODE(&now, mtime); - } - - /* Now add in all of the "SA" attributes */ - VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, - &sa_hdl)); - - /* - * Setup the array of attributes to be replaced/set on the new file - * - * order for DMU_OT_ZNODE is critical since it needs to be constructed - * in the old znode_phys_t format. Don't change this ordering - */ - sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); - - if (obj_type == DMU_OT_ZNODE) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), - NULL, &atime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), - NULL, &mtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), - NULL, &crtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), - NULL, &gen, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), - NULL, &mode, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), - NULL, &size, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), - NULL, &parent, 8); - } else { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), - NULL, &mode, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), - NULL, &size, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), - NULL, &gen, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), - NULL, &acl_ids->z_fuid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), - NULL, &acl_ids->z_fgid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), - NULL, &parent, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), - NULL, &pflags, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), - NULL, &atime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), - NULL, &mtime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, 16); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), - NULL, &crtime, 16); - } - - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); - - if (obj_type == DMU_OT_ZNODE) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, - &empty_xattr, 8); - } - if (obj_type == DMU_OT_ZNODE || - (vap->va_type == VBLK || vap->va_type == VCHR)) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), - NULL, &rdev, 8); - - } - if (obj_type == DMU_OT_ZNODE) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), - NULL, &pflags, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, - &acl_ids->z_fuid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, - &acl_ids->z_fgid, 8); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, - sizeof (uint64_t) * 4); - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, - &acl_phys, sizeof (zfs_acl_phys_t)); - } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, - &acl_ids->z_aclp->z_acl_count, 8); - locate.cb_aclp = acl_ids->z_aclp; - SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), - zfs_acl_data_locator, &locate, - 
acl_ids->z_aclp->z_acl_bytes); - mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, - acl_ids->z_fuid, acl_ids->z_fgid); - } - - VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); - - if (!(flag & IS_ROOT_NODE)) { - *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); - ASSERT(*zpp != NULL); - } else { - /* - * If we are creating the root node, the "parent" we - * passed in is the znode for the root. - */ - *zpp = dzp; - - (*zpp)->z_sa_hdl = sa_hdl; - } - - (*zpp)->z_pflags = pflags; - (*zpp)->z_mode = mode; - (*zpp)->z_dnodesize = dnodesize; - - if (vap->va_mask & AT_XVATTR) - zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); - - if (obj_type == DMU_OT_ZNODE || - acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { - VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); - } - if (!(flag & IS_ROOT_NODE)) { - vnode_t *vp; - - vp = ZTOV(*zpp); - vp->v_vflag |= VV_FORCEINSMQ; - err = insmntque(vp, zfsvfs->z_vfs); - vp->v_vflag &= ~VV_FORCEINSMQ; - KASSERT(err == 0, ("insmntque() failed: error %d", err)); - } - kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); -} - -/* - * Update in-core attributes. It is assumed the caller will be doing an - * sa_bulk_update to push the changes out. - */ -void -zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) -{ - xoptattr_t *xoap; - - xoap = xva_getxoptattr(xvap); - ASSERT(xoap); - - ASSERT_VOP_IN_SEQC(ZTOV(zp)); - - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - uint64_t times[2]; - ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); - (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), - &times, sizeof (times), tx); - XVA_SET_RTN(xvap, XAT_CREATETIME); - } - if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { - ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_READONLY); - } - if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { - ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_HIDDEN); - } - if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { - ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_SYSTEM); - } - if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { - ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_ARCHIVE); - } - if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_IMMUTABLE); - } - if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_NOUNLINK); - } - if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_APPENDONLY); - } - if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_NODUMP); - } - if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { - ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_OPAQUE); - } - if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { - ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, - xoap->xoa_av_quarantined, zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); - } - if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_AV_MODIFIED); - } - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { - zfs_sa_set_scanstamp(zp, xvap, tx); - XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); - } - if 
(XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_REPARSE); - } - if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { - ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_OFFLINE); - } - if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { - ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, - zp->z_pflags, tx); - XVA_SET_RTN(xvap, XAT_SPARSE); - } -} - -int -zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) -{ - dmu_object_info_t doi; - dmu_buf_t *db; - znode_t *zp; - vnode_t *vp; - sa_handle_t *hdl; - struct thread *td; - int locked; - int err; - - td = curthread; - getnewvnode_reserve(); -again: - *zpp = NULL; - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - - err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); - if (err) { - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - getnewvnode_drop_reserve(); - return (err); - } - - dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_SA && - (doi.doi_bonus_type != DMU_OT_ZNODE || - (doi.doi_bonus_type == DMU_OT_ZNODE && - doi.doi_bonus_size < sizeof (znode_phys_t)))) { - sa_buf_rele(db, NULL); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); -#ifdef __FreeBSD__ - getnewvnode_drop_reserve(); -#endif - return (SET_ERROR(EINVAL)); - } - - hdl = dmu_buf_get_user(db); - if (hdl != NULL) { - zp = sa_get_userdata(hdl); - - /* - * Since "SA" does immediate eviction we - * should never find a sa handle that doesn't - * know about the znode. - */ - ASSERT3P(zp, !=, NULL); - ASSERT3U(zp->z_id, ==, obj_num); - if (zp->z_unlinked) { - err = SET_ERROR(ENOENT); - } else { - vp = ZTOV(zp); - /* - * Don't let the vnode disappear after - * ZFS_OBJ_HOLD_EXIT. - */ - VN_HOLD(vp); - *zpp = zp; - err = 0; - } - - sa_buf_rele(db, NULL); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - - if (err) { - getnewvnode_drop_reserve(); - return (err); - } - - locked = VOP_ISLOCKED(vp); - VI_LOCK(vp); - if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) { - /* - * The vnode is doomed and this thread doesn't - * hold the exclusive lock on it, so the vnode - * must be being reclaimed by another thread. - * Otherwise the doomed vnode is being reclaimed - * by this thread and zfs_zget is called from - * ZIL internals. - */ - VI_UNLOCK(vp); - - /* - * XXX vrele() locks the vnode when the last reference - * is dropped. Although in this case the vnode is - * doomed / dead and so no inactivation is required, - * the vnode lock is still acquired. That could result - * in a LOR with z_teardown_lock if another thread holds - * the vnode's lock and tries to take z_teardown_lock. - * But that is only possible if the other thread peforms - * a ZFS vnode operation on the vnode. That either - * should not happen if the vnode is dead or the thread - * should also have a refrence to the vnode and thus - * our reference is not last. - */ - VN_RELE(vp); - goto again; - } - VI_UNLOCK(vp); - getnewvnode_drop_reserve(); - return (err); - } - - /* - * Not found create new znode/vnode - * but only if file exists. - * - * There is a small window where zfs_vget() could - * find this object while a file create is still in - * progress. This is checked for in zfs_znode_alloc() - * - * if zfs_znode_alloc() fails it will drop the hold on the - * bonus buffer. 
- */ - zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, - doi.doi_bonus_type, NULL); - if (zp == NULL) { - err = SET_ERROR(ENOENT); - } else { - *zpp = zp; - } - if (err == 0) { - vnode_t *vp = ZTOV(zp); - - err = insmntque(vp, zfsvfs->z_vfs); - if (err == 0) { - vp->v_hash = obj_num; - VOP_UNLOCK(vp); - } else { - zp->z_vnode = NULL; - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - *zpp = NULL; - } - } - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - getnewvnode_drop_reserve(); - return (err); -} - -int -zfs_rezget(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_object_info_t doi; - dmu_buf_t *db; - vnode_t *vp; - uint64_t obj_num = zp->z_id; - uint64_t mode, size; - sa_bulk_attr_t bulk[8]; - int err; - int count = 0; - uint64_t gen; - - /* - * Remove cached pages before reloading the znode, so that they are not - * lingering after we run into any error. Ideally, we should vgone() - * the vnode in case of error, but currently we cannot do that - * because of the LOR between the vnode lock and z_teardown_lock. - * So, instead, we have to "doom" the znode in the illumos style. - */ - vp = ZTOV(zp); - vn_pages_remove(vp, 0, 0); - - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - - mutex_enter(&zp->z_acl_lock); - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; - } - - mutex_exit(&zp->z_acl_lock); - ASSERT(zp->z_sa_hdl == NULL); - err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); - if (err) { - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (err); - } - - dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_SA && - (doi.doi_bonus_type != DMU_OT_ZNODE || - (doi.doi_bonus_type == DMU_OT_ZNODE && - doi.doi_bonus_size < sizeof (znode_phys_t)))) { - sa_buf_rele(db, NULL); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (SET_ERROR(EINVAL)); - } - - zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); - size = zp->z_size; - - /* reload cached values */ - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, - &gen, sizeof (gen)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, sizeof (zp->z_size)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, - &zp->z_links, sizeof (zp->z_links)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, sizeof (zp->z_pflags)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, - &zp->z_atime, sizeof (zp->z_atime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, - &zp->z_uid, sizeof (zp->z_uid)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, - &zp->z_gid, sizeof (zp->z_gid)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, - &mode, sizeof (mode)); - - if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (SET_ERROR(EIO)); - } - - zp->z_mode = mode; - - if (gen != zp->z_gen) { - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (SET_ERROR(EIO)); - } - - /* - * It is highly improbable but still quite possible that two - * objects in different datasets are created with the same - * object numbers and in transaction groups with the same - * numbers. znodes corresponding to those objects would - * have the same z_id and z_gen, but their other attributes - * may be different. - * zfs recv -F may replace one of such objects with the other. - * As a result file properties recorded in the replaced - * object's vnode may no longer match the received object's - * properties. 
At present the only cached property is the - * files type recorded in v_type. - * So, handle this case by leaving the old vnode and znode - * disassociated from the actual object. A new vnode and a - * znode will be created if the object is accessed - * (e.g. via a look-up). The old vnode and znode will be - * recycled when the last vnode reference is dropped. - */ - if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (SET_ERROR(EIO)); - } - - /* - * If the file has zero links, then it has been unlinked on the send - * side and it must be in the received unlinked set. - * We call zfs_znode_dmu_fini() now to prevent any accesses to the - * stale data and to prevent automatical removal of the file in - * zfs_zinactive(). The file will be removed either when it is removed - * on the send side and the next incremental stream is received or - * when the unlinked set gets processed. - */ - zp->z_unlinked = (zp->z_links == 0); - if (zp->z_unlinked) { - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (0); - } - - zp->z_blksz = doi.doi_data_block_size; - if (zp->z_size != size) - vnode_pager_setsize(vp, zp->z_size); - - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - - return (0); -} - -void -zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os = zfsvfs->z_os; - uint64_t obj = zp->z_id; - uint64_t acl_obj = zfs_external_acl(zp); - - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - if (acl_obj) { - VERIFY(!zp->z_is_sa); - VERIFY(0 == dmu_object_free(os, acl_obj, tx)); - } - VERIFY(0 == dmu_object_free(os, obj, tx)); - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); - zfs_znode_free(zp); -} - -void -zfs_zinactive(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t z_id = zp->z_id; - - ASSERT(zp->z_sa_hdl); - - /* - * Don't allow a zfs_zget() while were trying to release this znode - */ - ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); - - /* - * If this was the last reference to a file with no links, remove - * the file from the file system unless the file system is mounted - * read-only. That can happen, for example, if the file system was - * originally read-write, the file was opened, then unlinked and - * the file system was made read-only before the file was finally - * closed. The file will remain in the unlinked set. - */ - if (zp->z_unlinked) { - ASSERT(!zfsvfs->z_issnap); - if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { - ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - zfs_rmnode(zp); - return; - } - } - - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - zfs_znode_free(zp); -} - -void -zfs_znode_free(znode_t *zp) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ASSERT(zp->z_sa_hdl == NULL); - zp->z_vnode = NULL; - mutex_enter(&zfsvfs->z_znodes_lock); - POINTER_INVALIDATE(&zp->z_zfsvfs); - list_remove(&zfsvfs->z_all_znodes, zp); - mutex_exit(&zfsvfs->z_znodes_lock); - - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; - } - - zfs_znode_free_kmem(zp); - -#ifdef illumos - VFS_RELE(zfsvfs->z_vfs); -#endif -} - -void -zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], - uint64_t ctime[2], boolean_t have_tx) -{ - timestruc_t now; - - vfs_timestamp(&now); - - if (have_tx) { /* will sa_bulk_update happen really soon? 
*/ - zp->z_atime_dirty = 0; - zp->z_seq++; - } else { - zp->z_atime_dirty = 1; - } - - if (flag & AT_ATIME) { - ZFS_TIME_ENCODE(&now, zp->z_atime); - } - - if (flag & AT_MTIME) { - ZFS_TIME_ENCODE(&now, mtime); - if (zp->z_zfsvfs->z_use_fuids) { - zp->z_pflags |= (ZFS_ARCHIVE | - ZFS_AV_MODIFIED); - } - } - - if (flag & AT_CTIME) { - ZFS_TIME_ENCODE(&now, ctime); - if (zp->z_zfsvfs->z_use_fuids) - zp->z_pflags |= ZFS_ARCHIVE; - } -} - -/* - * Grow the block size for a file. - * - * IN: zp - znode of file to free data in. - * size - requested block size - * tx - open transaction. - * - * NOTE: this function assumes that the znode is write locked. - */ -void -zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) -{ - int error; - u_longlong_t dummy; - - if (size <= zp->z_blksz) - return; - /* - * If the file size is already greater than the current blocksize, - * we will not grow. If there is more than one block in a file, - * the blocksize cannot change. - */ - if (zp->z_blksz && zp->z_size > zp->z_blksz) - return; - - error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, - size, 0, tx); - - if (error == ENOTSUP) - return; - ASSERT0(error); - - /* What blocksize did we actually get? */ - dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); -} - -#ifdef illumos -/* - * This is a dummy interface used when pvn_vplist_dirty() should *not* - * be calling back into the fs for a putpage(). E.g.: when truncating - * a file, the pages being "thrown away* don't need to be written out. - */ -/* ARGSUSED */ -static int -zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, - int flags, cred_t *cr) -{ - ASSERT(0); - return (0); -} -#endif - -/* - * Increase the file length - * - * IN: zp - znode of file to free data in. - * end - new end-of-file - * - * RETURN: 0 on success, error code on failure - */ -static int -zfs_extend(znode_t *zp, uint64_t end) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_tx_t *tx; - locked_range_t *lr; - uint64_t newblksz; - int error; - - /* - * We will change zp_size, lock the whole file. - */ - lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); - - /* - * Nothing to do if file already at desired length. - */ - if (end <= zp->z_size) { - rangelock_exit(lr); - return (0); - } - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - if (end > zp->z_blksz && - (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { - /* - * We are growing the file past the current block size. - */ - if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); - } else { - newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); - } - dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); - } else { - newblksz = 0; - } - - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - rangelock_exit(lr); - return (error); - } - - if (newblksz) - zfs_grow_blocksize(zp, newblksz, tx); - - zp->z_size = end; - - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), - &zp->z_size, sizeof (zp->z_size), tx)); - - vnode_pager_setsize(ZTOV(zp), end); - - rangelock_exit(lr); - - dmu_tx_commit(tx); - - return (0); -} - -/* - * Free space in a file. - * - * IN: zp - znode of file to free data in. - * off - start of section to free. - * len - length of section to free. 
- * - * RETURN: 0 on success, error code on failure - */ -static int -zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - locked_range_t *lr; - int error; - - /* - * Lock the range being freed. - */ - lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); - - /* - * Nothing to do if file already at desired length. - */ - if (off >= zp->z_size) { - rangelock_exit(lr); - return (0); - } - - if (off + len > zp->z_size) - len = zp->z_size - off; - - error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); - - if (error == 0) { - /* - * In FreeBSD we cannot free block in the middle of a file, - * but only at the end of a file, so this code path should - * never happen. - */ - vnode_pager_setsize(ZTOV(zp), off); - } - - rangelock_exit(lr); - - return (error); -} - -/* - * Truncate a file - * - * IN: zp - znode of file to free data in. - * end - new end-of-file. - * - * RETURN: 0 on success, error code on failure - */ -static int -zfs_trunc(znode_t *zp, uint64_t end) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - vnode_t *vp = ZTOV(zp); - dmu_tx_t *tx; - locked_range_t *lr; - int error; - sa_bulk_attr_t bulk[2]; - int count = 0; - - /* - * We will change zp_size, lock the whole file. - */ - lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); - - /* - * Nothing to do if file already at desired length. - */ - if (end >= zp->z_size) { - rangelock_exit(lr); - return (0); - } - - error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, - DMU_OBJECT_END); - if (error) { - rangelock_exit(lr); - return (error); - } - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - rangelock_exit(lr); - return (error); - } - - zp->z_size = end; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), - NULL, &zp->z_size, sizeof (zp->z_size)); - - if (end == 0) { - zp->z_pflags &= ~ZFS_SPARSE; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, 8); - } - VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); - - dmu_tx_commit(tx); - - /* - * Clear any mapped pages in the truncated region. This has to - * happen outside of the transaction to avoid the possibility of - * a deadlock with someone trying to push a page that we are - * about to invalidate. - */ - vnode_pager_setsize(vp, end); - - rangelock_exit(lr); - - return (0); -} - -/* - * Free space in a file - * - * IN: zp - znode of file to free data in. - * off - start of range - * len - end of range (0 => EOF) - * flag - current file open mode flags. - * log - TRUE if this action should be logged - * - * RETURN: 0 on success, error code on failure - */ -int -zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) -{ - vnode_t *vp = ZTOV(zp); - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; - uint64_t mode; - uint64_t mtime[2], ctime[2]; - sa_bulk_attr_t bulk[3]; - int count = 0; - int error; - - if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, - sizeof (mode))) != 0) - return (error); - - if (off > zp->z_size) { - error = zfs_extend(zp, off+len); - if (error == 0 && log) - goto log; - else - return (error); - } - - /* - * Check for any locks in the region to be freed. - */ - - if (MANDLOCK(vp, (mode_t)mode)) { - uint64_t length = (len ? 
len : zp->z_size - off); - if (error = chklock(vp, FWRITE, off, length, flag, NULL)) - return (error); - } - - if (len == 0) { - error = zfs_trunc(zp, off); - } else { - if ((error = zfs_free_range(zp, off, len)) == 0 && - off + len > zp->z_size) - error = zfs_extend(zp, off+len); - } - if (error || !log) - return (error); -log: - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, 8); - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); - - zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); - - dmu_tx_commit(tx); - return (0); -} - -void -zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) -{ - uint64_t moid, obj, sa_obj, version; - uint64_t sense = ZFS_CASE_SENSITIVE; - uint64_t norm = 0; - nvpair_t *elem; - int error; - int i; - znode_t *rootzp = NULL; - zfsvfs_t *zfsvfs; - vattr_t vattr; - znode_t *zp; - zfs_acl_ids_t acl_ids; - - /* - * First attempt to create master node. - */ - /* - * In an empty objset, there are no blocks to read and thus - * there can be no i/o errors (which we assert below). - */ - moid = MASTER_NODE_OBJ; - error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - /* - * Set starting attributes. - */ - version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); - elem = NULL; - while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { - /* For the moment we expect all zpl props to be uint64_ts */ - uint64_t val; - char *name; - - ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); - VERIFY(nvpair_value_uint64(elem, &val) == 0); - name = nvpair_name(elem); - if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { - if (val < version) - version = val; - } else { - error = zap_update(os, moid, name, 8, 1, &val, tx); - } - ASSERT(error == 0); - if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) - norm = val; - else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) - sense = val; - } - ASSERT(version != 0); - error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); - - /* - * Create zap object used for SA attribute registration - */ - - if (version >= ZPL_VERSION_SA) { - sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, - DMU_OT_NONE, 0, tx); - error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); - ASSERT(error == 0); - } else { - sa_obj = 0; - } - /* - * Create a delete queue. - */ - obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); - - error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); - ASSERT(error == 0); - - /* - * Create root znode. Create minimal znode/vnode/zfsvfs - * to allow zfs_mknode to work. 
- */ - VATTR_NULL(&vattr); - vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; - vattr.va_type = VDIR; - vattr.va_mode = S_IFDIR|0755; - vattr.va_uid = crgetuid(cr); - vattr.va_gid = crgetgid(cr); - - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); - - rootzp = zfs_znode_alloc_kmem(KM_SLEEP); - ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); - rootzp->z_moved = 0; - rootzp->z_unlinked = 0; - rootzp->z_atime_dirty = 0; - rootzp->z_is_sa = USE_SA(version, os); - - zfsvfs->z_os = os; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_version = version; - zfsvfs->z_use_fuids = USE_FUIDS(version, os); - zfsvfs->z_use_sa = USE_SA(version, os); - zfsvfs->z_norm = norm; - - error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, - &zfsvfs->z_attr_table); - - ASSERT(error == 0); - - /* - * Fold case on file systems that are always or sometimes case - * insensitive. - */ - if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); - - rootzp->z_zfsvfs = zfsvfs; - VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, - cr, NULL, &acl_ids)); - zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); - ASSERT3P(zp, ==, rootzp); - error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); - ASSERT(error == 0); - zfs_acl_ids_free(&acl_ids); - POINTER_INVALIDATE(&rootzp->z_zfsvfs); - - sa_handle_destroy(rootzp->z_sa_hdl); - zfs_znode_free_kmem(rootzp); - - /* - * Create shares directory - */ - - error = zfs_create_share_dir(zfsvfs, tx); - - ASSERT(error == 0); - - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); -} -#endif /* _KERNEL */ - -static int -zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) -{ - uint64_t sa_obj = 0; - int error; - - error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); - if (error != 0 && error != ENOENT) - return (error); - - error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); - return (error); -} - -static int -zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, - dmu_buf_t **db, void *tag) -{ - dmu_object_info_t doi; - int error; - - if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) - return (error); - - dmu_object_info_from_db(*db, &doi); - if ((doi.doi_bonus_type != DMU_OT_SA && - doi.doi_bonus_type != DMU_OT_ZNODE) || - doi.doi_bonus_type == DMU_OT_ZNODE && - doi.doi_bonus_size < sizeof (znode_phys_t)) { - sa_buf_rele(*db, tag); - return (SET_ERROR(ENOTSUP)); - } - - error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); - if (error != 0) { - sa_buf_rele(*db, tag); - return (error); - } - - return (0); -} - -void -zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) -{ - sa_handle_destroy(hdl); - sa_buf_rele(db, tag); -} - -/* - * Given an object number, return its parent object number and whether - * or not the object is an extended attribute directory. 
- */ -static int -zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, - uint64_t *pobjp, int *is_xattrdir) -{ - uint64_t parent; - uint64_t pflags; - uint64_t mode; - uint64_t parent_mode; - sa_bulk_attr_t bulk[3]; - sa_handle_t *sa_hdl; - dmu_buf_t *sa_db; - int count = 0; - int error; - - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, - &parent, sizeof (parent)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, - &pflags, sizeof (pflags)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, - &mode, sizeof (mode)); - - if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) - return (error); - - /* - * When a link is removed its parent pointer is not changed and will - * be invalid. There are two cases where a link is removed but the - * file stays around, when it goes to the delete queue and when there - * are additional links. - */ - error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); - if (error != 0) - return (error); - - error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); - zfs_release_sa_handle(sa_hdl, sa_db, FTAG); - if (error != 0) - return (error); - - *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); - - /* - * Extended attributes can be applied to files, directories, etc. - * Otherwise the parent must be a directory. - */ - if (!*is_xattrdir && !S_ISDIR(parent_mode)) - return (SET_ERROR(EINVAL)); - - *pobjp = parent; - - return (0); -} - -/* - * Given an object number, return some zpl level statistics - */ -static int -zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, - zfs_stat_t *sb) -{ - sa_bulk_attr_t bulk[4]; - int count = 0; - - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, - &sb->zs_mode, sizeof (sb->zs_mode)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, - &sb->zs_gen, sizeof (sb->zs_gen)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, - &sb->zs_links, sizeof (sb->zs_links)); - SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, - &sb->zs_ctime, sizeof (sb->zs_ctime)); - - return (sa_bulk_lookup(hdl, bulk, count)); -} - -static int -zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, - sa_attr_type_t *sa_table, char *buf, int len) -{ - sa_handle_t *sa_hdl; - sa_handle_t *prevhdl = NULL; - dmu_buf_t *prevdb = NULL; - dmu_buf_t *sa_db = NULL; - char *path = buf + len - 1; - int error; - - *path = '\0'; - sa_hdl = hdl; - - uint64_t deleteq_obj; - VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, - ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); - error = zap_lookup_int(osp, deleteq_obj, obj); - if (error == 0) { - return (ESTALE); - } else if (error != ENOENT) { - return (error); - } - error = 0; - - for (;;) { - uint64_t pobj; - char component[MAXNAMELEN + 2]; - size_t complen; - int is_xattrdir; - - if (prevdb) - zfs_release_sa_handle(prevhdl, prevdb, FTAG); - - if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, - &is_xattrdir)) != 0) - break; - - if (pobj == obj) { - if (path[0] != '/') - *--path = '/'; - break; - } - - component[0] = '/'; - if (is_xattrdir) { - (void) sprintf(component + 1, ""); - } else { - error = zap_value_search(osp, pobj, obj, - ZFS_DIRENT_OBJ(-1ULL), component + 1); - if (error != 0) - break; - } - - complen = strlen(component); - path -= complen; - ASSERT(path >= buf); - bcopy(component, path, complen); - obj = pobj; - - if (sa_hdl != hdl) { - prevhdl = sa_hdl; - prevdb = sa_db; - } - error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); - if (error != 0) { - sa_hdl = 
prevhdl; - sa_db = prevdb; - break; - } - } - - if (sa_hdl != NULL && sa_hdl != hdl) { - ASSERT(sa_db != NULL); - zfs_release_sa_handle(sa_hdl, sa_db, FTAG); - } - - if (error == 0) - (void) memmove(buf, path, buf + len - path); - - return (error); -} - -int -zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) -{ - sa_attr_type_t *sa_table; - sa_handle_t *hdl; - dmu_buf_t *db; - int error; - - error = zfs_sa_setup(osp, &sa_table); - if (error != 0) - return (error); - - error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); - if (error != 0) - return (error); - - error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); - - zfs_release_sa_handle(hdl, db, FTAG); - return (error); -} - -int -zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, - char *buf, int len) -{ - char *path = buf + len - 1; - sa_attr_type_t *sa_table; - sa_handle_t *hdl; - dmu_buf_t *db; - int error; - - *path = '\0'; - - error = zfs_sa_setup(osp, &sa_table); - if (error != 0) - return (error); - - error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); - if (error != 0) - return (error); - - error = zfs_obj_to_stats_impl(hdl, sa_table, sb); - if (error != 0) { - zfs_release_sa_handle(hdl, db, FTAG); - return (error); - } - - error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); - - zfs_release_sa_handle(hdl, db, FTAG); - return (error); -} - -#ifdef _KERNEL -int -zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t parent; - int is_xattrdir; - int err; - - /* Extended attributes should not be visible as regular files. */ - if ((zp->z_pflags & ZFS_XATTR) != 0) - return (SET_ERROR(EINVAL)); - - err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, - &parent, &is_xattrdir); - if (err != 0) - return (err); - ASSERT0(is_xattrdir); - - /* No name as this is a root object. */ - if (parent == zp->z_id) - return (SET_ERROR(EINVAL)); - - err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, - ZFS_DIRENT_OBJ(-1ULL), buf); - if (err != 0) - return (err); - err = zfs_zget(zfsvfs, parent, dzpp); - return (err); -} -#endif /* _KERNEL */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ /dev/null @@ -1,3499 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
- * Copyright (c) 2014 Integros [integros.com] - */ - -/* Portions Copyright 2010 Robert Milkowski */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system - * calls that change the file system. Each itx has enough information to - * be able to replay them after a system crash, power loss, or - * equivalent failure mode. These are stored in memory until either: - * - * 1. they are committed to the pool by the DMU transaction group - * (txg), at which point they can be discarded; or - * 2. they are committed to the on-disk ZIL for the dataset being - * modified (e.g. due to an fsync, O_DSYNC, or other synchronous - * requirement). - * - * In the event of a crash or power loss, the itxs contained by each - * dataset's on-disk ZIL will be replayed when that dataset is first - * instantianted (e.g. if the dataset is a normal fileystem, when it is - * first mounted). - * - * As hinted at above, there is one ZIL per dataset (both the in-memory - * representation, and the on-disk representation). The on-disk format - * consists of 3 parts: - * - * - a single, per-dataset, ZIL header; which points to a chain of - * - zero or more ZIL blocks; each of which contains - * - zero or more ZIL records - * - * A ZIL record holds the information necessary to replay a single - * system call transaction. A ZIL block can hold many ZIL records, and - * the blocks are chained together, similarly to a singly linked list. - * - * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL - * block in the chain, and the ZIL header points to the first block in - * the chain. - * - * Note, there is not a fixed place in the pool to hold these ZIL - * blocks; they are dynamically allocated and freed as needed from the - * blocks available on the pool, though they can be preferentially - * allocated from a dedicated "log" vdev. - */ - -/* - * This controls the amount of time that a ZIL block (lwb) will remain - * "open" when it isn't "full", and it has a thread waiting for it to be - * committed to stable storage. Please refer to the zil_commit_waiter() - * function (and the comments within it) for more details. - */ -int zfs_commit_timeout_pct = 5; - -/* - * Disable intent logging replay. This global ZIL switch affects all pools. - */ -int zil_replay_disable = 0; -SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN, - &zil_replay_disable, 0, "Disable intent logging replay"); - -/* - * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to - * the disk(s) by the ZIL after an LWB write has completed. Setting this - * will cause ZIL corruption on power loss if a volatile out-of-order - * write cache is enabled. - */ -boolean_t zil_nocacheflush = B_FALSE; -SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_nocacheflush, CTLFLAG_RWTUN, - &zil_nocacheflush, 0, "Disable ZIL cache flush"); - -boolean_t zfs_trim_enabled = B_TRUE; -SYSCTL_DECL(_vfs_zfs_trim); -SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0, - "Enable ZFS TRIM"); - -/* - * Limit SLOG write size per commit executed with synchronous priority. - * Any writes above that will be executed with lower (asynchronous) priority - * to limit potential SLOG device abuse by single active ZIL writer. 
- */ -uint64_t zil_slog_bulk = 768 * 1024; -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_bulk, CTLFLAG_RWTUN, - &zil_slog_bulk, 0, "Maximal SLOG commit size with sync priority"); - -static kmem_cache_t *zil_lwb_cache; -static kmem_cache_t *zil_zcw_cache; - -#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ - sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) - -static int -zil_bp_compare(const void *x1, const void *x2) -{ - const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; - const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; - - int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); - if (likely(cmp)) - return (cmp); - - return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); -} - -static void -zil_bp_tree_init(zilog_t *zilog) -{ - avl_create(&zilog->zl_bp_tree, zil_bp_compare, - sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); -} - -static void -zil_bp_tree_fini(zilog_t *zilog) -{ - avl_tree_t *t = &zilog->zl_bp_tree; - zil_bp_node_t *zn; - void *cookie = NULL; - - while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zn, sizeof (zil_bp_node_t)); - - avl_destroy(t); -} - -int -zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) -{ - avl_tree_t *t = &zilog->zl_bp_tree; - const dva_t *dva; - zil_bp_node_t *zn; - avl_index_t where; - - if (BP_IS_EMBEDDED(bp)) - return (0); - - dva = BP_IDENTITY(bp); - - if (avl_find(t, dva, &where) != NULL) - return (SET_ERROR(EEXIST)); - - zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); - zn->zn_dva = *dva; - avl_insert(t, zn, where); - - return (0); -} - -static zil_header_t * -zil_header_in_syncing_context(zilog_t *zilog) -{ - return ((zil_header_t *)zilog->zl_header); -} - -static void -zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) -{ - zio_cksum_t *zc = &bp->blk_cksum; - - zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); - zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); - zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); - zc->zc_word[ZIL_ZC_SEQ] = 1ULL; -} - -/* - * Read a log block and make sure it's valid. - */ -static int -zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, - char **end) -{ - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf = NULL; - zbookmark_phys_t zb; - int error; - - if (zilog->zl_header->zh_claim_txg == 0) - zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; - - if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) - zio_flags |= ZIO_FLAG_SPECULATIVE; - - SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - - error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); - - if (error == 0) { - zio_cksum_t cksum = bp->blk_cksum; - - /* - * Validate the checksummed log block. - * - * Sequence numbers should be... sequential. The checksum - * verifier for the next block should be bp's checksum plus 1. - * - * Also check the log chain linkage and size used. 
- */ - cksum.zc_word[ZIL_ZC_SEQ]++; - - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; - char *lr = (char *)(zilc + 1); - uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); - - if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { - error = SET_ERROR(ECKSUM); - } else { - ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, len); - *end = (char *)dst + len; - *nbp = zilc->zc_next_blk; - } - } else { - char *lr = abuf->b_data; - uint64_t size = BP_GET_LSIZE(bp); - zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; - - if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || - (zilc->zc_nused > (size - sizeof (*zilc)))) { - error = SET_ERROR(ECKSUM); - } else { - ASSERT3U(zilc->zc_nused, <=, - SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, zilc->zc_nused); - *end = (char *)dst + zilc->zc_nused; - *nbp = zilc->zc_next_blk; - } - } - - arc_buf_destroy(abuf, &abuf); - } - - return (error); -} - -/* - * Read a TX_WRITE log data block. - */ -static int -zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) -{ - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; - const blkptr_t *bp = &lr->lr_blkptr; - arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf = NULL; - zbookmark_phys_t zb; - int error; - - if (BP_IS_HOLE(bp)) { - if (wbuf != NULL) - bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); - return (0); - } - - if (zilog->zl_header->zh_claim_txg == 0) - zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; - - SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, - ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - - error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); - - if (error == 0) { - if (wbuf != NULL) - bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); - arc_buf_destroy(abuf, &abuf); - } - - return (error); -} - -/* - * Parse the intent log, and call parse_func for each valid record within. - */ -int -zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) -{ - const zil_header_t *zh = zilog->zl_header; - boolean_t claimed = !!zh->zh_claim_txg; - uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; - uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; - uint64_t max_blk_seq = 0; - uint64_t max_lr_seq = 0; - uint64_t blk_count = 0; - uint64_t lr_count = 0; - blkptr_t blk, next_blk; - char *lrbuf, *lrp; - int error = 0; - - /* - * Old logs didn't record the maximum zh_claim_lr_seq. - */ - if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) - claim_lr_seq = UINT64_MAX; - - /* - * Starting at the block pointed to by zh_log we read the log chain. - * For each block in the chain we strongly check that block to - * ensure its validity. We stop when an invalid block is found. - * For each block pointer in the chain we call parse_blk_func(). - * For each record in each valid block we call parse_lr_func(). - * If the log has been claimed, stop if we encounter a sequence - * number greater than the highest claimed sequence number. 
- */ - lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); - zil_bp_tree_init(zilog); - - for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { - uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; - int reclen; - char *end; - - if (blk_seq > claim_blk_seq) - break; - if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) - break; - ASSERT3U(max_blk_seq, <, blk_seq); - max_blk_seq = blk_seq; - blk_count++; - - if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) - break; - - error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); - if (error != 0) - break; - - for (lrp = lrbuf; lrp < end; lrp += reclen) { - lr_t *lr = (lr_t *)lrp; - reclen = lr->lrc_reclen; - ASSERT3U(reclen, >=, sizeof (lr_t)); - if (lr->lrc_seq > claim_lr_seq) - goto done; - if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) - goto done; - ASSERT3U(max_lr_seq, <, lr->lrc_seq); - max_lr_seq = lr->lrc_seq; - lr_count++; - } - } -done: - zilog->zl_parse_error = error; - zilog->zl_parse_blk_seq = max_blk_seq; - zilog->zl_parse_lr_seq = max_lr_seq; - zilog->zl_parse_blk_count = blk_count; - zilog->zl_parse_lr_count = lr_count; - - ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || - (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); - - zil_bp_tree_fini(zilog); - zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); - - return (error); -} - -/* ARGSUSED */ -static int -zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) -{ - ASSERT(!BP_IS_HOLE(bp)); - - /* - * As we call this function from the context of a rewind to a - * checkpoint, each ZIL block whose txg is later than the txg - * that we rewind to is invalid. Thus, we return -1 so - * zil_parse() doesn't attempt to read it. - */ - if (bp->blk_birth >= first_txg) - return (-1); - - if (zil_bp_tree_add(zilog, bp) != 0) - return (0); - - zio_free(zilog->zl_spa, first_txg, bp); - return (0); -} - -/* ARGSUSED */ -static int -zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) -{ - return (0); -} - -static int -zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) -{ - /* - * Claim log block if not already committed and not already claimed. - * If tx == NULL, just verify that the block is claimable. - */ - if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || - zil_bp_tree_add(zilog, bp) != 0) - return (0); - - return (zio_wait(zio_claim(NULL, zilog->zl_spa, - tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); -} - -static int -zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) -{ - lr_write_t *lr = (lr_write_t *)lrc; - int error; - - if (lrc->lrc_txtype != TX_WRITE) - return (0); - - /* - * If the block is not readable, don't claim it. This can happen - * in normal operation when a log block is written to disk before - * some of the dmu_sync() blocks it points to. In this case, the - * transaction cannot have been committed to anyone (we would have - * waited for all writes to be stable first), so it is semantically - * correct to declare this the end of the log. 
- */ - if (lr->lr_blkptr.blk_birth >= first_txg && - (error = zil_read_log_data(zilog, lr, NULL)) != 0) - return (error); - return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); -} - -/* ARGSUSED */ -static int -zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) -{ - zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); - - return (0); -} - -static int -zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) -{ - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - - /* - * If we previously claimed it, we need to free it. - */ - if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && - bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && - !BP_IS_HOLE(bp)) - zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); - - return (0); -} - -static int -zil_lwb_vdev_compare(const void *x1, const void *x2) -{ - const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; - const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - - return (AVL_CMP(v1, v2)); -} - -static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg) -{ - lwb_t *lwb; - - lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); - lwb->lwb_zilog = zilog; - lwb->lwb_blk = *bp; - lwb->lwb_slog = slog; - lwb->lwb_state = LWB_STATE_CLOSED; - lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); - lwb->lwb_max_txg = txg; - lwb->lwb_write_zio = NULL; - lwb->lwb_root_zio = NULL; - lwb->lwb_tx = NULL; - lwb->lwb_issued_timestamp = 0; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); - } else { - lwb->lwb_nused = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); - } - - mutex_enter(&zilog->zl_lock); - list_insert_tail(&zilog->zl_lwb_list, lwb); - mutex_exit(&zilog->zl_lock); - - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - - return (lwb); -} - -static void -zil_free_lwb(zilog_t *zilog, lwb_t *lwb) -{ - ASSERT(MUTEX_HELD(&zilog->zl_lock)); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - ASSERT3P(lwb->lwb_write_zio, ==, NULL); - ASSERT3P(lwb->lwb_root_zio, ==, NULL); - ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || - lwb->lwb_state == LWB_STATE_FLUSH_DONE); - - /* - * Clear the zilog's field to indicate this lwb is no longer - * valid, and prevent use-after-free errors. - */ - if (zilog->zl_last_lwb_opened == lwb) - zilog->zl_last_lwb_opened = NULL; - - kmem_cache_free(zil_lwb_cache, lwb); -} - -/* - * Called when we create in-memory log transactions so that we know - * to cleanup the itxs at the end of spa_sync(). - */ -void -zilog_dirty(zilog_t *zilog, uint64_t txg) -{ - dsl_pool_t *dp = zilog->zl_dmu_pool; - dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - - ASSERT(spa_writeable(zilog->zl_spa)); - - if (ds->ds_is_snapshot) - panic("dirtying snapshot!"); - - if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(ds->ds_dbuf, zilog); - - zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); - } -} - -/* - * Determine if the zil is dirty in the specified txg. Callers wanting to - * ensure that the dirty state does not change must hold the itxg_lock for - * the specified txg. 
Holding the lock will ensure that the zil cannot be - * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current - * state. - */ -boolean_t -zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) -{ - dsl_pool_t *dp = zilog->zl_dmu_pool; - - if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) - return (B_TRUE); - return (B_FALSE); -} - -/* - * Determine if the zil is dirty. The zil is considered dirty if it has - * any pending itx records that have not been cleaned by zil_clean(). - */ -boolean_t -zilog_is_dirty(zilog_t *zilog) -{ - dsl_pool_t *dp = zilog->zl_dmu_pool; - - for (int t = 0; t < TXG_SIZE; t++) { - if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Create an on-disk intent log. - */ -static lwb_t * -zil_create(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - lwb_t *lwb = NULL; - uint64_t txg = 0; - dmu_tx_t *tx = NULL; - blkptr_t blk; - int error = 0; - boolean_t slog = FALSE; - - /* - * Wait for any previous destroy to complete. - */ - txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - - ASSERT(zh->zh_claim_txg == 0); - ASSERT(zh->zh_replay_seq == 0); - - blk = zh->zh_log; - - /* - * Allocate an initial log block if: - * - there isn't one already - * - the existing block is the wrong endianess - */ - if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { - tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - if (!BP_IS_HOLE(&blk)) { - zio_free(zilog->zl_spa, txg, &blk); - BP_ZERO(&blk); - } - - error = zio_alloc_zil(zilog->zl_spa, - zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, - ZIL_MIN_BLKSZ, &slog); - - if (error == 0) - zil_init_log_chain(zilog, &blk); - } - - /* - * Allocate a log write block (lwb) for the first log block. - */ - if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg); - - /* - * If we just allocated the first log block, commit our transaction - * and wait for zil_sync() to stuff the block poiner into zh_log. - * (zh is part of the MOS, so we cannot modify it in open context.) - */ - if (tx != NULL) { - dmu_tx_commit(tx); - txg_wait_synced(zilog->zl_dmu_pool, txg); - } - - ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); - - return (lwb); -} - -/* - * In one tx, free all log blocks and clear the log header. If keep_first - * is set, then we're replaying a log with no content. We want to keep the - * first block, however, so that the first synchronous transaction doesn't - * require a txg_wait_synced() in zil_create(). We don't need to - * txg_wait_synced() here either when keep_first is set, because both - * zil_create() and zil_destroy() will wait for any in-progress destroys - * to complete. - */ -void -zil_destroy(zilog_t *zilog, boolean_t keep_first) -{ - const zil_header_t *zh = zilog->zl_header; - lwb_t *lwb; - dmu_tx_t *tx; - uint64_t txg; - - /* - * Wait for any previous destroy to complete. 
- */ - txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - - zilog->zl_old_header = *zh; /* debugging aid */ - - if (BP_IS_HOLE(&zh->zh_log)) - return; - - tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - mutex_enter(&zilog->zl_lock); - - ASSERT3U(zilog->zl_destroy_txg, <, txg); - zilog->zl_destroy_txg = txg; - zilog->zl_keep_first = keep_first; - - if (!list_is_empty(&zilog->zl_lwb_list)) { - ASSERT(zh->zh_claim_txg == 0); - VERIFY(!keep_first); - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - list_remove(&zilog->zl_lwb_list, lwb); - if (lwb->lwb_buf != NULL) - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); - zil_free_lwb(zilog, lwb); - } - } else if (!keep_first) { - zil_destroy_sync(zilog, tx); - } - mutex_exit(&zilog->zl_lock); - - dmu_tx_commit(tx); -} - -void -zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) -{ - ASSERT(list_is_empty(&zilog->zl_lwb_list)); - (void) zil_parse(zilog, zil_free_log_block, - zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); -} - -int -zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) -{ - dmu_tx_t *tx = txarg; - zilog_t *zilog; - uint64_t first_txg; - zil_header_t *zh; - objset_t *os; - int error; - - error = dmu_objset_own_obj(dp, ds->ds_object, - DMU_OST_ANY, B_FALSE, FTAG, &os); - if (error != 0) { - /* - * EBUSY indicates that the objset is inconsistent, in which - * case it can not have a ZIL. - */ - if (error != EBUSY) { - cmn_err(CE_WARN, "can't open objset for %llu, error %u", - (unsigned long long)ds->ds_object, error); - } - return (0); - } - - zilog = dmu_objset_zil(os); - zh = zil_header_in_syncing_context(zilog); - ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); - first_txg = spa_min_claim_txg(zilog->zl_spa); - - /* - * If the spa_log_state is not set to be cleared, check whether - * the current uberblock is a checkpoint one and if the current - * header has been claimed before moving on. - * - * If the current uberblock is a checkpointed uberblock then - * one of the following scenarios took place: - * - * 1] We are currently rewinding to the checkpoint of the pool. - * 2] We crashed in the middle of a checkpoint rewind but we - * did manage to write the checkpointed uberblock to the - * vdev labels, so when we tried to import the pool again - * the checkpointed uberblock was selected from the import - * procedure. - * - * In both cases we want to zero out all the ZIL blocks, except - * the ones that have been claimed at the time of the checkpoint - * (their zh_claim_txg != 0). The reason is that these blocks - * may be corrupted since we may have reused their locations on - * disk after we took the checkpoint. - * - * We could try to set spa_log_state to SPA_LOG_CLEAR earlier - * when we first figure out whether the current uberblock is - * checkpointed or not. Unfortunately, that would discard all - * the logs, including the ones that are claimed, and we would - * leak space. 
- */ - if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || - (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && - zh->zh_claim_txg == 0)) { - if (!BP_IS_HOLE(&zh->zh_log)) { - (void) zil_parse(zilog, zil_clear_log_block, - zil_noop_log_record, tx, first_txg); - } - BP_ZERO(&zh->zh_log); - dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_objset_disown(os, FTAG); - return (0); - } - - /* - * If we are not rewinding and opening the pool normally, then - * the min_claim_txg should be equal to the first txg of the pool. - */ - ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); - - /* - * Claim all log blocks if we haven't already done so, and remember - * the highest claimed sequence number. This ensures that if we can - * read only part of the log now (e.g. due to a missing device), - * but we can read the entire log later, we will not try to replay - * or destroy beyond the last block we successfully claimed. - */ - ASSERT3U(zh->zh_claim_txg, <=, first_txg); - if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { - (void) zil_parse(zilog, zil_claim_log_block, - zil_claim_log_record, tx, first_txg); - zh->zh_claim_txg = first_txg; - zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; - zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; - if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) - zh->zh_flags |= ZIL_REPLAY_NEEDED; - zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; - dsl_dataset_dirty(dmu_objset_ds(os), tx); - } - - ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); - dmu_objset_disown(os, FTAG); - return (0); -} - -/* - * Check the log by walking the log chain. - * Checksum errors are ok as they indicate the end of the chain. - * Any other error (no device or read failure) returns an error. - */ -/* ARGSUSED */ -int -zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) -{ - zilog_t *zilog; - objset_t *os; - blkptr_t *bp; - int error; - - ASSERT(tx == NULL); - - error = dmu_objset_from_ds(ds, &os); - if (error != 0) { - cmn_err(CE_WARN, "can't open objset %llu, error %d", - (unsigned long long)ds->ds_object, error); - return (0); - } - - zilog = dmu_objset_zil(os); - bp = (blkptr_t *)&zilog->zl_header->zh_log; - - if (!BP_IS_HOLE(bp)) { - vdev_t *vd; - boolean_t valid = B_TRUE; - - /* - * Check the first block and determine if it's on a log device - * which may have been removed or faulted prior to loading this - * pool. If so, there's no point in checking the rest of the - * log as its content should have already been synced to the - * pool. - */ - spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); - vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); - if (vd->vdev_islog && vdev_is_dead(vd)) - valid = vdev_log_state_valid(vd); - spa_config_exit(os->os_spa, SCL_STATE, FTAG); - - if (!valid) - return (0); - - /* - * Check whether the current uberblock is checkpointed (e.g. - * we are rewinding) and whether the current header has been - * claimed or not. If it hasn't then skip verifying it. We - * do this because its ZIL blocks may be part of the pool's - * state before the rewind, which is no longer valid. - */ - zil_header_t *zh = zil_header_in_syncing_context(zilog); - if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && - zh->zh_claim_txg == 0) - return (0); - } - - /* - * Because tx == NULL, zil_claim_log_block() will not actually claim - * any blocks, but just determine whether it is possible to do so. 
- * In addition to checking the log chain, zil_claim_log_block() - * will invoke zio_claim() with a done func of spa_claim_notify(), - * which will update spa_max_claim_txg. See spa_load() for details. - */ - error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, - zilog->zl_header->zh_claim_txg ? -1ULL : - spa_min_claim_txg(os->os_spa)); - - return ((error == ECKSUM || error == ENOENT) ? 0 : error); -} - -/* - * When an itx is "skipped", this function is used to properly mark the - * waiter as "done, and signal any thread(s) waiting on it. An itx can - * be skipped (and not committed to an lwb) for a variety of reasons, - * one of them being that the itx was committed via spa_sync(), prior to - * it being committed to an lwb; this can happen if a thread calling - * zil_commit() is racing with spa_sync(). - */ -static void -zil_commit_waiter_skip(zil_commit_waiter_t *zcw) -{ - mutex_enter(&zcw->zcw_lock); - ASSERT3B(zcw->zcw_done, ==, B_FALSE); - zcw->zcw_done = B_TRUE; - cv_broadcast(&zcw->zcw_cv); - mutex_exit(&zcw->zcw_lock); -} - -/* - * This function is used when the given waiter is to be linked into an - * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. - * At this point, the waiter will no longer be referenced by the itx, - * and instead, will be referenced by the lwb. - */ -static void -zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) -{ - /* - * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. - */ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); - - mutex_enter(&zcw->zcw_lock); - ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3P(lwb, !=, NULL); - ASSERT(lwb->lwb_state == LWB_STATE_OPENED || - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE); - - list_insert_tail(&lwb->lwb_waiters, zcw); - zcw->zcw_lwb = lwb; - mutex_exit(&zcw->zcw_lock); -} - -/* - * This function is used when zio_alloc_zil() fails to allocate a ZIL - * block, and the given waiter must be linked to the "nolwb waiters" - * list inside of zil_process_commit_list(). 
- */ -static void -zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) -{ - mutex_enter(&zcw->zcw_lock); - ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - list_insert_tail(nolwb, zcw); - mutex_exit(&zcw->zcw_lock); -} - -void -zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) -{ - avl_tree_t *t = &lwb->lwb_vdev_tree; - avl_index_t where; - zil_vdev_node_t *zv, zvsearch; - int ndvas = BP_GET_NDVAS(bp); - int i; - - if (zil_nocacheflush) - return; - - mutex_enter(&lwb->lwb_vdev_lock); - for (i = 0; i < ndvas; i++) { - zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); - if (avl_find(t, &zvsearch, &where) == NULL) { - zv = kmem_alloc(sizeof (*zv), KM_SLEEP); - zv->zv_vdev = zvsearch.zv_vdev; - avl_insert(t, zv, where); - } - } - mutex_exit(&lwb->lwb_vdev_lock); -} - -static void -zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) -{ - avl_tree_t *src = &lwb->lwb_vdev_tree; - avl_tree_t *dst = &nlwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - - /* - * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does - * not need the protection of lwb_vdev_lock (it will only be modified - * while holding zilog->zl_lock) as its writes and those of its - * children have all completed. The younger 'nlwb' may be waiting on - * future writes to additional vdevs. - */ - mutex_enter(&nlwb->lwb_vdev_lock); - /* - * Tear down the 'lwb' vdev tree, ensuring that entries which do not - * exist in 'nlwb' are moved to it, freeing any would-be duplicates. - */ - while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { - avl_index_t where; - - if (avl_find(dst, zv, &where) == NULL) { - avl_insert(dst, zv, where); - } else { - kmem_free(zv, sizeof (*zv)); - } - } - mutex_exit(&nlwb->lwb_vdev_lock); -} - -void -zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) -{ - lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); -} - -/* - * This function is a called after all vdevs associated with a given lwb - * write have completed their DKIOCFLUSHWRITECACHE command; or as soon - * as the lwb write completes, if "zil_nocacheflush" is set. Further, - * all "previous" lwb's will have completed before this function is - * called; i.e. this function is called for all previous lwbs before - * it's called for "this" lwb (enforced via zio the dependencies - * configured in zil_lwb_set_zio_dependency()). - * - * The intention is for this function to be called as soon as the - * contents of an lwb are considered "stable" on disk, and will survive - * any sudden loss of power. At this point, any threads waiting for the - * lwb to reach this state are signalled, and the "waiter" structures - * are marked "done". - */ -static void -zil_lwb_flush_vdevs_done(zio_t *zio) -{ - lwb_t *lwb = zio->io_private; - zilog_t *zilog = lwb->lwb_zilog; - dmu_tx_t *tx = lwb->lwb_tx; - zil_commit_waiter_t *zcw; - - spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); - - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - - mutex_enter(&zilog->zl_lock); - - /* - * Ensure the lwb buffer pointer is cleared before releasing the - * txg. If we have had an allocation failure and the txg is - * waiting to sync then we want zil_sync() to remove the lwb so - * that it's not picked up as the next new one in - * zil_process_commit_list(). zil_sync() will only remove the - * lwb if lwb_buf is null. 
- */ - lwb->lwb_buf = NULL; - lwb->lwb_tx = NULL; - - ASSERT3U(lwb->lwb_issued_timestamp, >, 0); - zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; - - lwb->lwb_root_zio = NULL; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - lwb->lwb_state = LWB_STATE_FLUSH_DONE; - - if (zilog->zl_last_lwb_opened == lwb) { - /* - * Remember the highest committed log sequence number - * for ztest. We only update this value when all the log - * writes succeeded, because ztest wants to ASSERT that - * it got the whole log chain. - */ - zilog->zl_commit_lr_seq = zilog->zl_lr_seq; - } - - while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { - mutex_enter(&zcw->zcw_lock); - - ASSERT(list_link_active(&zcw->zcw_node)); - list_remove(&lwb->lwb_waiters, zcw); - - ASSERT3P(zcw->zcw_lwb, ==, lwb); - zcw->zcw_lwb = NULL; - - zcw->zcw_zio_error = zio->io_error; - - ASSERT3B(zcw->zcw_done, ==, B_FALSE); - zcw->zcw_done = B_TRUE; - cv_broadcast(&zcw->zcw_cv); - - mutex_exit(&zcw->zcw_lock); - } - - mutex_exit(&zilog->zl_lock); - - /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. - */ - dmu_tx_commit(tx); -} - -/* - * This is called when an lwb's write zio completes. The callback's - * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs - * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved - * in writing out this specific lwb's data, and in the case that cache - * flushes have been deferred, vdevs involved in writing the data for - * previous lwbs. The writes corresponding to all the vdevs in the - * lwb_vdev_tree will have completed by the time this is called, due to - * the zio dependencies configured in zil_lwb_set_zio_dependency(), - * which takes deferred flushes into account. The lwb will be "done" - * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio - * completion callback for the lwb's root zio. - */ -static void -zil_lwb_write_done(zio_t *zio) -{ - lwb_t *lwb = zio->io_private; - spa_t *spa = zio->io_spa; - zilog_t *zilog = lwb->lwb_zilog; - avl_tree_t *t = &lwb->lwb_vdev_tree; - void *cookie = NULL; - zil_vdev_node_t *zv; - lwb_t *nlwb; - - ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); - - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); - ASSERT(!BP_IS_GANG(zio->io_bp)); - ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(BP_GET_FILL(zio->io_bp) == 0); - - abd_put(zio->io_abd); - - mutex_enter(&zilog->zl_lock); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); - lwb->lwb_state = LWB_STATE_WRITE_DONE; - lwb->lwb_write_zio = NULL; - nlwb = list_next(&zilog->zl_lwb_list, lwb); - mutex_exit(&zilog->zl_lock); - - if (avl_numnodes(t) == 0) - return; - - /* - * If there was an IO error, we're not going to call zio_flush() - * on these vdevs, so we simply empty the tree and free the - * nodes. We avoid calling zio_flush() since there isn't any - * good reason for doing so, after the lwb block failed to be - * written out. 
- */ - if (zio->io_error != 0) { - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zv, sizeof (*zv)); - return; - } - - /* - * If this lwb does not have any threads waiting for it to - * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE - * command to the vdevs written to by "this" lwb, and instead - * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE - * command for those vdevs. Thus, we merge the vdev tree of - * "this" lwb with the vdev tree of the "next" lwb in the list, - * and assume the "next" lwb will handle flushing the vdevs (or - * deferring the flush(s) again). - * - * This is a useful performance optimization, especially for - * workloads with lots of async write activity and few sync - * write and/or fsync activity, as it has the potential to - * coalesce multiple flush commands to a vdev into one. - */ - if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) { - zil_lwb_flush_defer(lwb, nlwb); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - return; - } - - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { - vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL) - zio_flush(lwb->lwb_root_zio, vd); - kmem_free(zv, sizeof (*zv)); - } -} - -static void -zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) -{ - lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT(MUTEX_HELD(&zilog->zl_lock)); - - /* - * The zilog's "zl_last_lwb_opened" field is used to build the - * lwb/zio dependency chain, which is used to preserve the - * ordering of lwb completions that is required by the semantics - * of the ZIL. Each new lwb zio becomes a parent of the - * "previous" lwb zio, such that the new lwb's zio cannot - * complete until the "previous" lwb's zio completes. - * - * This is required by the semantics of zil_commit(); the commit - * waiters attached to the lwbs will be woken in the lwb zio's - * completion callback, so this zio dependency graph ensures the - * waiters are woken in the correct order (the same order the - * lwbs were created). - */ - if (last_lwb_opened != NULL && - last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED || - last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); - - ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); - zio_add_child(lwb->lwb_root_zio, - last_lwb_opened->lwb_root_zio); - - /* - * If the previous lwb's write hasn't already completed, - * we also want to order the completion of the lwb write - * zios (above, we only order the completion of the lwb - * root zios). This is required because of how we can - * defer the DKIOCFLUSHWRITECACHE commands for each lwb. - * - * When the DKIOCFLUSHWRITECACHE commands are defered, - * the previous lwb will rely on this lwb to flush the - * vdevs written to by that previous lwb. Thus, we need - * to ensure this lwb doesn't issue the flush until - * after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child - * relationship here. - * - * Without this relationship on the lwb's write zio, - * it's possible for this lwb's write to complete prior - * to the previous lwb's write completing; and thus, the - * vdevs for the previous lwb would be flushed prior to - * that lwb's data being written to those vdevs (the - * vdevs are flushed in the lwb write zio's completion - * handler, zil_lwb_write_done()). 
- */ - if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED); - - ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); - zio_add_child(lwb->lwb_write_zio, - last_lwb_opened->lwb_write_zio); - } - } -} - - -/* - * This function's purpose is to "open" an lwb such that it is ready to - * accept new itxs being committed to it. To do this, the lwb's zio - * structures are created, and linked to the lwb. This function is - * idempotent; if the passed in lwb has already been opened, this - * function is essentially a no-op. - */ -static void -zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) -{ - zbookmark_phys_t zb; - zio_priority_t prio; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); - EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); - - SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); - - if (lwb->lwb_root_zio == NULL) { - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, - zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, - BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - - lwb->lwb_state = LWB_STATE_OPENED; - - mutex_enter(&zilog->zl_lock); - zil_lwb_set_zio_dependency(zilog, lwb); - zilog->zl_last_lwb_opened = lwb; - mutex_exit(&zilog->zl_lock); - } - - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); -} - -/* - * Define a limited set of intent log block sizes. - * - * These must be a multiple of 4KB. Note only the amount used (again - * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. - */ -struct { - uint64_t limit; - uint64_t blksz; -} zil_block_buckets[] = { - { 4096, 4096 }, /* non TX_WRITE */ - { 8192 + 4096, 8192 + 4096 }, /* database */ - { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ - { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { 131072, 131072 }, /* < 128KB writes */ - { 131072 + 4096, 65536 + 4096 }, /* 128KB writes */ - { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ -}; - -/* - * Maximum block size used by the ZIL. This is picked up when the ZIL is - * initialized. Otherwise this should not be used directly; see - * zl_max_block_size instead. - */ -int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; -SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_maxblocksize, CTLFLAG_RWTUN, - &zil_maxblocksize, 0, "Limit in bytes of ZIL log block size"); - -/* - * Start a log block write and advance to the next log block. - * Calls are serialized. 
- */ -static lwb_t * -zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) -{ - lwb_t *nlwb = NULL; - zil_chain_t *zilc; - spa_t *spa = zilog->zl_spa; - blkptr_t *bp; - dmu_tx_t *tx; - uint64_t txg; - uint64_t zil_blksz, wsz; - int i, error; - boolean_t slog; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); - - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - zilc = (zil_chain_t *)lwb->lwb_buf; - bp = &zilc->zc_next_blk; - } else { - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; - } - - ASSERT(lwb->lwb_nused <= lwb->lwb_sz); - - /* - * Allocate the next block and save its address in this block - * before writing it in order to establish the log chain. - * Note that if the allocation of nlwb synced before we wrote - * the block that points at it (lwb), we'd leak it if we crashed. - * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). - * We dirty the dataset to ensure that zil_sync() will be called - * to clean up in the event of allocation failure or I/O failure. - */ - - tx = dmu_tx_create(zilog->zl_os); - - /* - * Since we are not going to create any new dirty data, and we - * can even help with clearing the existing dirty data, we - * should not be subject to the dirty data based delays. We - * use TXG_NOTHROTTLE to bypass the delay mechanism. - */ - VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); - - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - lwb->lwb_tx = tx; - - /* - * Log blocks are pre-allocated. Here we select the size of the next - * block, based on size used in the last block. - * - first find the smallest bucket that will fit the block from a - * limited set of block sizes. This is because it's faster to write - * blocks allocated from the same metaslab as they are adjacent or - * close. - * - next find the maximum from the new suggested size and an array of - * previous sizes. This lessens a picket fence effect of wrongly - * guesssing the size if we have a stream of say 2k, 64k, 2k, 64k - * requests. - * - * Note we only write what is used, but we can't just allocate - * the maximum block size because we can exhaust the available - * pool log space. - */ - zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) - continue; - zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); - zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; - for (i = 0; i < ZIL_PREV_BLKS; i++) - zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); - zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - - BP_ZERO(bp); - - /* pass the old blkptr in order to spread log blocks across devs */ - error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object, - txg, bp, &lwb->lwb_blk, zil_blksz, &slog); - if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); - bp->blk_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; - - /* - * Allocate a new log write block (lwb). - */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg); - } - - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - /* For Slim ZIL only write what is used. 
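[Editor's sketch] The size selection above is two independent steps: pick the smallest bucket that covers the request, then smooth against recent picks so alternating small/large writes do not flap between block sizes. Below is a standalone sketch of just that selection; PREV_BLKS, pick_blksz(), and the history array are stand-ins for ZIL_PREV_BLKS/zl_prev_blks, and the bucket limits mirror the zil_block_buckets table in the removed code.

#include <stdint.h>
#include <stdio.h>

#define	PREV_BLKS	16		/* stand-in for ZIL_PREV_BLKS (power of 2) */
#define	MAX_BLKSZ	(128 * 1024)

static const struct {
	uint64_t limit;
	uint64_t blksz;
} buckets[] = {
	{ 4096,			4096 },
	{ 8192 + 4096,		8192 + 4096 },
	{ 32768 + 4096,		32768 + 4096 },
	{ 65536 + 4096,		65536 + 4096 },
	{ 131072,		131072 },
	{ 131072 + 4096,	65536 + 4096 },
	{ UINT64_MAX,		MAX_BLKSZ },
};

static uint64_t prev_blks[PREV_BLKS];
static int rotor;

/* Pick the next log block size for a write needing "need" bytes. */
static uint64_t
pick_blksz(uint64_t need)
{
	uint64_t blksz;
	int i;

	/* Smallest bucket whose limit covers the request. */
	for (i = 0; need > buckets[i].limit; i++)
		continue;
	blksz = buckets[i].blksz;

	/* Smooth against recent picks to avoid 2k/64k/2k/64k flapping. */
	prev_blks[rotor] = blksz;
	for (i = 0; i < PREV_BLKS; i++)
		if (prev_blks[i] > blksz)
			blksz = prev_blks[i];
	rotor = (rotor + 1) & (PREV_BLKS - 1);

	return (blksz);
}

int
main(void)
{
	uint64_t sizes[] = { 2048, 65536, 2048, 65536 };

	for (int i = 0; i < 4; i++)
		printf("need %ju -> blksz %ju\n",
		    (uintmax_t)sizes[i], (uintmax_t)pick_blksz(sizes[i]));
	return (0);
}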
*/ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); - ASSERT3U(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); - - } else { - wsz = lwb->lwb_sz; - } - - zilc->zc_pad = 0; - zilc->zc_nused = lwb->lwb_nused; - zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; - - /* - * clear unused data for security - */ - bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); - - spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); - - zil_lwb_add_block(lwb, &lwb->lwb_blk); - lwb->lwb_issued_timestamp = gethrtime(); - lwb->lwb_state = LWB_STATE_ISSUED; - - zio_nowait(lwb->lwb_root_zio); - zio_nowait(lwb->lwb_write_zio); - - /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). - */ - return (nlwb); -} - -/* - * Maximum amount of write data that can be put into single log block. - */ -uint64_t -zil_max_log_data(zilog_t *zilog) -{ - return (zilog->zl_max_block_size - - sizeof (zil_chain_t) - sizeof (lr_write_t)); -} - -/* - * Maximum amount of log space we agree to waste to reduce number of - * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). - */ -static inline uint64_t -zil_max_waste_space(zilog_t *zilog) -{ - return (zil_max_log_data(zilog) / 8); -} - -/* - * Maximum amount of write data for WR_COPIED. For correctness, consumers - * must fall back to WR_NEED_COPY if we can't fit the entire record into one - * maximum sized log block, because each WR_COPIED record must fit in a - * single log block. For space efficiency, we want to fit two records into a - * max-sized log block. - */ -uint64_t -zil_max_copied_data(zilog_t *zilog) -{ - return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - - sizeof (lr_write_t)); -} - -static lwb_t * -zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) -{ - lr_t *lrcb, *lrc; - lr_write_t *lrwb, *lrw; - char *lr_buf; - uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - ASSERT3P(lwb->lwb_buf, !=, NULL); - - zil_lwb_write_open(zilog, lwb); - - lrc = &itx->itx_lr; - lrw = (lr_write_t *)lrc; - - /* - * A commit itx doesn't represent any on-disk state; instead - * it's simply used as a place holder on the commit list, and - * provides a mechanism for attaching a "commit waiter" onto the - * correct lwb (such that the waiter can be signalled upon - * completion of that lwb). Thus, we don't process this itx's - * log record if it's a commit itx (these itx's don't have log - * records), and instead link the itx's waiter onto the lwb's - * list of waiters. - * - * For more details, see the comment above zil_commit(). - */ - if (lrc->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); - zil_commit_waiter_link_lwb(itx->itx_private, lwb); - itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); - return (lwb); - } - - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - dlen = 0; - } - reclen = lrc->lrc_reclen; - zilog->zl_cur_used += (reclen + dlen); - txg = lrc->lrc_txg; - - ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); - -cont: - /* - * If this record won't fit in the current log block, start a new one. - * For WR_NEED_COPY optimize layout for minimal number of chunks. 
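[Editor's sketch] The three limits above are plain arithmetic on the block size. A quick worked sketch, using stand-in sizes for zil_chain_t and lr_write_t (the real structure sizes differ), shows how they relate.

#include <stdint.h>
#include <stdio.h>

/* Stand-in sizes; the real zil_chain_t / lr_write_t sizes differ. */
#define	CHAIN_SZ	96
#define	LR_WRITE_SZ	192

/* Largest WR_NEED_COPY payload a single log block can carry. */
static uint64_t
max_log_data(uint64_t blksz)
{
	return (blksz - CHAIN_SZ - LR_WRITE_SZ);
}

/* Space we are willing to leave unused to avoid splitting a record. */
static uint64_t
max_waste_space(uint64_t blksz)
{
	return (max_log_data(blksz) / 8);
}

/* Largest WR_COPIED payload: two records should fit per max-size block. */
static uint64_t
max_copied_data(uint64_t blksz)
{
	return ((blksz - CHAIN_SZ) / 2 - LR_WRITE_SZ);
}

int
main(void)
{
	uint64_t blksz = 128 * 1024;	/* SPA_OLD_MAXBLOCKSIZE */

	printf("max_log_data    = %ju\n", (uintmax_t)max_log_data(blksz));
	printf("max_waste_space = %ju\n", (uintmax_t)max_waste_space(blksz));
	printf("max_copied_data = %ju\n", (uintmax_t)max_copied_data(blksz));
	return (0);
}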
- */ - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; - max_log_data = zil_max_log_data(zilog); - if (reclen > lwb_sp || (reclen + dlen > lwb_sp && - lwb_sp < zil_max_waste_space(zilog) && - (dlen % max_log_data == 0 || - lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_issue(zilog, lwb); - if (lwb == NULL) - return (NULL); - zil_lwb_write_open(zilog, lwb); - ASSERT(LWB_EMPTY(lwb)); - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; - - /* - * There must be enough space in the new, empty log block to - * hold reclen. For WR_COPIED, we need to fit the whole - * record in one block, and reclen is the header size + the - * data size. For WR_NEED_COPY, we can create multiple - * records, splitting the data into multiple blocks, so we - * only need to fit one word of data per block; in this case - * reclen is just the header size (no data). - */ - ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); - } - - dnow = MIN(dlen, lwb_sp - reclen); - lr_buf = lwb->lwb_buf + lwb->lwb_nused; - bcopy(lrc, lr_buf, reclen); - lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ - lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ - - /* - * If it's a write, fetch the data or get its blkptr as appropriate. - */ - if (lrc->lrc_txtype == TX_WRITE) { - if (txg > spa_freeze_txg(zilog->zl_spa)) - txg_wait_synced(zilog->zl_dmu_pool, txg); - if (itx->itx_wr_state != WR_COPIED) { - char *dbuf; - int error; - - if (itx->itx_wr_state == WR_NEED_COPY) { - dbuf = lr_buf + reclen; - lrcb->lrc_reclen += dnow; - if (lrwb->lr_length > dnow) - lrwb->lr_length = dnow; - lrw->lr_offset += dnow; - lrw->lr_length -= dnow; - } else { - ASSERT(itx->itx_wr_state == WR_INDIRECT); - dbuf = NULL; - } - - /* - * We pass in the "lwb_write_zio" rather than - * "lwb_root_zio" so that the "lwb_write_zio" - * becomes the parent of any zio's created by - * the "zl_get_data" callback. The vdevs are - * flushed after the "lwb_write_zio" completes, - * so we want to make sure that completion - * callback waits for these additional zio's, - * such that the vdevs used by those zio's will - * be included in the lwb's vdev tree, and those - * vdevs will be properly flushed. If we passed - * in "lwb_root_zio" here, then these additional - * vdevs may not be flushed; e.g. if these zio's - * completed after "lwb_write_zio" completed. - */ - error = zilog->zl_get_data(itx->itx_private, - lrwb, dbuf, lwb, lwb->lwb_write_zio); - - if (error == EIO) { - txg_wait_synced(zilog->zl_dmu_pool, txg); - return (lwb); - } - if (error != 0) { - ASSERT(error == ENOENT || error == EEXIST || - error == EALREADY); - return (lwb); - } - } - } - - /* - * We're actually making an entry, so update lrc_seq to be the - * log record sequence number. Note that this is generally not - * equal to the itx sequence number because not all transactions - * are synchronous, and sometimes spa_sync() gets there first. 
- */ - lrcb->lrc_seq = ++zilog->zl_lr_seq; - lwb->lwb_nused += reclen + dnow; - - zil_lwb_add_txg(lwb, txg); - - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); - ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); - - dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; - goto cont; - } - - return (lwb); -} - -itx_t * -zil_itx_create(uint64_t txtype, size_t lrsize) -{ - itx_t *itx; - - lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); - - itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); - itx->itx_lr.lrc_txtype = txtype; - itx->itx_lr.lrc_reclen = lrsize; - itx->itx_lr.lrc_seq = 0; /* defensive */ - itx->itx_sync = B_TRUE; /* default is synchronous */ - - return (itx); -} - -void -zil_itx_destroy(itx_t *itx) -{ - kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); -} - -/* - * Free up the sync and async itxs. The itxs_t has already been detached - * so no locks are needed. - */ -static void -zil_itxg_clean(itxs_t *itxs) -{ - itx_t *itx; - list_t *list; - avl_tree_t *t; - void *cookie; - itx_async_node_t *ian; - - list = &itxs->i_sync_list; - while ((itx = list_head(list)) != NULL) { - /* - * In the general case, commit itxs will not be found - * here, as they'll be committed to an lwb via - * zil_lwb_commit(), and free'd in that function. Having - * said that, it is still possible for commit itxs to be - * found here, due to the following race: - * - * - a thread calls zil_commit() which assigns the - * commit itx to a per-txg i_sync_list - * - zil_itxg_clean() is called (e.g. via spa_sync()) - * while the waiter is still on the i_sync_list - * - * There's nothing to prevent syncing the txg while the - * waiter is on the i_sync_list. This normally doesn't - * happen because spa_sync() is slower than zil_commit(), - * but if zil_commit() calls txg_wait_synced() (e.g. - * because zil_create() or zil_commit_writer_stall() is - * called) we will hit this case. - */ - if (itx->itx_lr.lrc_txtype == TX_COMMIT) - zil_commit_waiter_skip(itx->itx_private); - - list_remove(list, itx); - zil_itx_destroy(itx); - } - - cookie = NULL; - t = &itxs->i_async_tree; - while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { - list = &ian->ia_list; - while ((itx = list_head(list)) != NULL) { - list_remove(list, itx); - /* commit itxs should never be on the async lists. */ - ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); - zil_itx_destroy(itx); - } - list_destroy(list); - kmem_free(ian, sizeof (itx_async_node_t)); - } - avl_destroy(t); - - kmem_free(itxs, sizeof (itxs_t)); -} - -static int -zil_aitx_compare(const void *x1, const void *x2) -{ - const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; - const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - - return (AVL_CMP(o1, o2)); -} - -/* - * Remove all async itx with the given oid. - */ -static void -zil_remove_async(zilog_t *zilog, uint64_t oid) -{ - uint64_t otxg, txg; - itx_async_node_t *ian; - avl_tree_t *t; - avl_index_t where; - list_t clean_list; - itx_t *itx; - - ASSERT(oid != 0); - list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); - - if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ - otxg = ZILTEST_TXG; - else - otxg = spa_last_synced_txg(zilog->zl_spa) + 1; - - for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { - itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; - - mutex_enter(&itxg->itxg_lock); - if (itxg->itxg_txg != txg) { - mutex_exit(&itxg->itxg_lock); - continue; - } - - /* - * Locate the object node and append its list. 
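[Editor's sketch] zil_itx_create() keeps record lengths 8-byte aligned with the P2ROUNDUP family, which relies on the alignment being a power of two. A tiny sketch of the underlying bit trick, with a hypothetical p2roundup() helper:

#include <stdint.h>
#include <stdio.h>

/*
 * Round x up to the next multiple of align, which must be a power of
 * two.  Same arithmetic as the P2ROUNDUP family used in the code above.
 */
static uint64_t
p2roundup(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

int
main(void)
{
	/* Log record sizes are padded out to 8-byte multiples. */
	for (uint64_t sz = 41; sz <= 48; sz++)
		printf("%ju -> %ju\n", (uintmax_t)sz,
		    (uintmax_t)p2roundup(sz, sizeof (uint64_t)));
	return (0);
}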
- */ - t = &itxg->itxg_itxs->i_async_tree; - ian = avl_find(t, &oid, &where); - if (ian != NULL) - list_move_tail(&clean_list, &ian->ia_list); - mutex_exit(&itxg->itxg_lock); - } - while ((itx = list_head(&clean_list)) != NULL) { - list_remove(&clean_list, itx); - /* commit itxs should never be on the async lists. */ - ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); - zil_itx_destroy(itx); - } - list_destroy(&clean_list); -} - -void -zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) -{ - uint64_t txg; - itxg_t *itxg; - itxs_t *itxs, *clean = NULL; - - /* - * Object ids can be re-instantiated in the next txg so - * remove any async transactions to avoid future leaks. - * This can happen if a fsync occurs on the re-instantiated - * object for a WR_INDIRECT or WR_NEED_COPY write, which gets - * the new file data and flushes a write record for the old object. - */ - if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) - zil_remove_async(zilog, itx->itx_oid); - - /* - * Ensure the data of a renamed file is committed before the rename. - */ - if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) - zil_async_to_sync(zilog, itx->itx_oid); - - if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) - txg = ZILTEST_TXG; - else - txg = dmu_tx_get_txg(tx); - - itxg = &zilog->zl_itxg[txg & TXG_MASK]; - mutex_enter(&itxg->itxg_lock); - itxs = itxg->itxg_itxs; - if (itxg->itxg_txg != txg) { - if (itxs != NULL) { - /* - * The zil_clean callback hasn't got around to cleaning - * this itxg. Save the itxs for release below. - * This should be rare. - */ - zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " - "txg %llu", itxg->itxg_txg); - clean = itxg->itxg_itxs; - } - itxg->itxg_txg = txg; - itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); - - list_create(&itxs->i_sync_list, sizeof (itx_t), - offsetof(itx_t, itx_node)); - avl_create(&itxs->i_async_tree, zil_aitx_compare, - sizeof (itx_async_node_t), - offsetof(itx_async_node_t, ia_node)); - } - if (itx->itx_sync) { - list_insert_tail(&itxs->i_sync_list, itx); - } else { - avl_tree_t *t = &itxs->i_async_tree; - uint64_t foid = - LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); - itx_async_node_t *ian; - avl_index_t where; - - ian = avl_find(t, &foid, &where); - if (ian == NULL) { - ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); - list_create(&ian->ia_list, sizeof (itx_t), - offsetof(itx_t, itx_node)); - ian->ia_foid = foid; - avl_insert(t, ian, where); - } - list_insert_tail(&ian->ia_list, itx); - } - - itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); - - /* - * We don't want to dirty the ZIL using ZILTEST_TXG, because - * zil_clean() will never be called using ZILTEST_TXG. Thus, we - * need to be careful to always dirty the ZIL using the "real" - * TXG (not itxg_txg) even when the SPA is frozen. - */ - zilog_dirty(zilog, dmu_tx_get_txg(tx)); - mutex_exit(&itxg->itxg_lock); - - /* Release the old itxs now we've dropped the lock */ - if (clean != NULL) - zil_itxg_clean(clean); -} - -/* - * If there are any in-memory intent log transactions which have now been - * synced then start up a taskq to free them. We should only do this after we - * have written out the uberblocks (i.e. txg has been comitted) so that - * don't inadvertently clean out in-memory log records that would be required - * by zil_commit(). 
- */ -void -zil_clean(zilog_t *zilog, uint64_t synced_txg) -{ - itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; - itxs_t *clean_me; - - ASSERT3U(synced_txg, <, ZILTEST_TXG); - - mutex_enter(&itxg->itxg_lock); - if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { - mutex_exit(&itxg->itxg_lock); - return; - } - ASSERT3U(itxg->itxg_txg, <=, synced_txg); - ASSERT3U(itxg->itxg_txg, !=, 0); - clean_me = itxg->itxg_itxs; - itxg->itxg_itxs = NULL; - itxg->itxg_txg = 0; - mutex_exit(&itxg->itxg_lock); - /* - * Preferably start a task queue to free up the old itxs but - * if taskq_dispatch can't allocate resources to do that then - * free it in-line. This should be rare. Note, using TQ_SLEEP - * created a bad performance problem. - */ - ASSERT3P(zilog->zl_dmu_pool, !=, NULL); - ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); - if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, - (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0) - zil_itxg_clean(clean_me); -} - -/* - * This function will traverse the queue of itxs that need to be - * committed, and move them onto the ZIL's zl_itx_commit_list. - */ -static void -zil_get_commit_list(zilog_t *zilog) -{ - uint64_t otxg, txg; - list_t *commit_list = &zilog->zl_itx_commit_list; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - - if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ - otxg = ZILTEST_TXG; - else - otxg = spa_last_synced_txg(zilog->zl_spa) + 1; - - /* - * This is inherently racy, since there is nothing to prevent - * the last synced txg from changing. That's okay since we'll - * only commit things in the future. - */ - for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { - itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; - - mutex_enter(&itxg->itxg_lock); - if (itxg->itxg_txg != txg) { - mutex_exit(&itxg->itxg_lock); - continue; - } - - /* - * If we're adding itx records to the zl_itx_commit_list, - * then the zil better be dirty in this "txg". We can assert - * that here since we're holding the itxg_lock which will - * prevent spa_sync from cleaning it. Once we add the itxs - * to the zl_itx_commit_list we must commit it to disk even - * if it's unnecessary (i.e. the txg was synced). - */ - ASSERT(zilog_is_dirty_in_txg(zilog, txg) || - spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); - list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); - - mutex_exit(&itxg->itxg_lock); - } -} - -/* - * Move the async itxs for a specified object to commit into sync lists. - */ -void -zil_async_to_sync(zilog_t *zilog, uint64_t foid) -{ - uint64_t otxg, txg; - itx_async_node_t *ian; - avl_tree_t *t; - avl_index_t where; - - if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ - otxg = ZILTEST_TXG; - else - otxg = spa_last_synced_txg(zilog->zl_spa) + 1; - - /* - * This is inherently racy, since there is nothing to prevent - * the last synced txg from changing. - */ - for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { - itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; - - mutex_enter(&itxg->itxg_lock); - if (itxg->itxg_txg != txg) { - mutex_exit(&itxg->itxg_lock); - continue; - } - - /* - * If a foid is specified then find that node and append its - * list. Otherwise walk the tree appending all the lists - * to the sync list. We add to the end rather than the - * beginning to ensure the create has happened. 
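[Editor's sketch] zil_clean() prefers to push the free work onto a taskq and only does it inline when the dispatch fails. The same dispatch-or-fall-back pattern in isolation; try_dispatch() here is a hypothetical stand-in that always fails, not the real taskq_dispatch() API.

#include <stdbool.h>
#include <stdio.h>

typedef void (task_fn_t)(void *);

/*
 * Hypothetical asynchronous dispatcher that can fail when it cannot
 * allocate resources, mimicking taskq_dispatch() with TQ_NOSLEEP.
 */
static bool
try_dispatch(task_fn_t *fn, void *arg)
{
	(void)fn;
	(void)arg;
	return (false);		/* pretend the allocation failed */
}

static void
clean_itxs(void *arg)
{
	printf("cleaning %s\n", (char *)arg);
}

int
main(void)
{
	char batch[] = "itxs for txg 42";

	/* Prefer asynchronous cleanup; fall back to doing it inline. */
	if (!try_dispatch(clean_itxs, batch))
		clean_itxs(batch);
	return (0);
}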
- */ - t = &itxg->itxg_itxs->i_async_tree; - if (foid != 0) { - ian = avl_find(t, &foid, &where); - if (ian != NULL) { - list_move_tail(&itxg->itxg_itxs->i_sync_list, - &ian->ia_list); - } - } else { - void *cookie = NULL; - - while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { - list_move_tail(&itxg->itxg_itxs->i_sync_list, - &ian->ia_list); - list_destroy(&ian->ia_list); - kmem_free(ian, sizeof (itx_async_node_t)); - } - } - mutex_exit(&itxg->itxg_lock); - } -} - -/* - * This function will prune commit itxs that are at the head of the - * commit list (it won't prune past the first non-commit itx), and - * either: a) attach them to the last lwb that's still pending - * completion, or b) skip them altogether. - * - * This is used as a performance optimization to prevent commit itxs - * from generating new lwbs when it's unnecessary to do so. - */ -static void -zil_prune_commit_list(zilog_t *zilog) -{ - itx_t *itx; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - - while (itx = list_head(&zilog->zl_itx_commit_list)) { - lr_t *lrc = &itx->itx_lr; - if (lrc->lrc_txtype != TX_COMMIT) - break; - - mutex_enter(&zilog->zl_lock); - - lwb_t *last_lwb = zilog->zl_last_lwb_opened; - if (last_lwb == NULL || - last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { - /* - * All of the itxs this waiter was waiting on - * must have already completed (or there were - * never any itx's for it to wait on), so it's - * safe to skip this waiter and mark it done. - */ - zil_commit_waiter_skip(itx->itx_private); - } else { - zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); - itx->itx_private = NULL; - } - - mutex_exit(&zilog->zl_lock); - - list_remove(&zilog->zl_itx_commit_list, itx); - zil_itx_destroy(itx); - } - - IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); -} - -static void -zil_commit_writer_stall(zilog_t *zilog) -{ - /* - * When zio_alloc_zil() fails to allocate the next lwb block on - * disk, we must call txg_wait_synced() to ensure all of the - * lwbs in the zilog's zl_lwb_list are synced and then freed (in - * zil_sync()), such that any subsequent ZIL writer (i.e. a call - * to zil_process_commit_list()) will have to call zil_create(), - * and start a new ZIL chain. - * - * Since zil_alloc_zil() failed, the lwb that was previously - * issued does not have a pointer to the "next" lwb on disk. - * Thus, if another ZIL writer thread was to allocate the "next" - * on-disk lwb, that block could be leaked in the event of a - * crash (because the previous lwb on-disk would not point to - * it). - * - * We must hold the zilog's zl_issuer_lock while we do this, to - * ensure no new threads enter zil_process_commit_list() until - * all lwb's in the zl_lwb_list have been synced and freed - * (which is achieved via the txg_wait_synced() call). - */ - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - txg_wait_synced(zilog->zl_dmu_pool, 0); - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); -} - -/* - * This function will traverse the commit list, creating new lwbs as - * needed, and committing the itxs from the commit list to these newly - * created lwbs. Additionally, as a new lwb is created, the previous - * lwb will be issued to the zio layer to be written to disk. - */ -static void -zil_process_commit_list(zilog_t *zilog) -{ - spa_t *spa = zilog->zl_spa; - list_t nolwb_waiters; - lwb_t *lwb; - itx_t *itx; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - - /* - * Return if there's nothing to commit before we dirty the fs by - * calling zil_create(). 
- */ - if (list_head(&zilog->zl_itx_commit_list) == NULL) - return; - - list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), - offsetof(zil_commit_waiter_t, zcw_node)); - - lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) { - lwb = zil_create(zilog); - } else { - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - } - - while (itx = list_head(&zilog->zl_itx_commit_list)) { - lr_t *lrc = &itx->itx_lr; - uint64_t txg = lrc->lrc_txg; - - ASSERT3U(txg, !=, 0); - - if (lrc->lrc_txtype == TX_COMMIT) { - DTRACE_PROBE2(zil__process__commit__itx, - zilog_t *, zilog, itx_t *, itx); - } else { - DTRACE_PROBE2(zil__process__normal__itx, - zilog_t *, zilog, itx_t *, itx); - } - - boolean_t synced = txg <= spa_last_synced_txg(spa); - boolean_t frozen = txg > spa_freeze_txg(spa); - - /* - * If the txg of this itx has already been synced out, then - * we don't need to commit this itx to an lwb. This is - * because the data of this itx will have already been - * written to the main pool. This is inherently racy, and - * it's still ok to commit an itx whose txg has already - * been synced; this will result in a write that's - * unnecessary, but will do no harm. - * - * With that said, we always want to commit TX_COMMIT itxs - * to an lwb, regardless of whether or not that itx's txg - * has been synced out. We do this to ensure any OPENED lwb - * will always have at least one zil_commit_waiter_t linked - * to the lwb. - * - * As a counter-example, if we skipped TX_COMMIT itx's - * whose txg had already been synced, the following - * situation could occur if we happened to be racing with - * spa_sync: - * - * 1. we commit a non-TX_COMMIT itx to an lwb, where the - * itx's txg is 10 and the last synced txg is 9. - * 2. spa_sync finishes syncing out txg 10. - * 3. we move to the next itx in the list, it's a TX_COMMIT - * whose txg is 10, so we skip it rather than committing - * it to the lwb used in (1). - * - * If the itx that is skipped in (3) is the last TX_COMMIT - * itx in the commit list, than it's possible for the lwb - * used in (1) to remain in the OPENED state indefinitely. - * - * To prevent the above scenario from occuring, ensuring - * that once an lwb is OPENED it will transition to ISSUED - * and eventually DONE, we always commit TX_COMMIT itx's to - * an lwb here, even if that itx's txg has already been - * synced. - * - * Finally, if the pool is frozen, we _always_ commit the - * itx. The point of freezing the pool is to prevent data - * from being written to the main pool via spa_sync, and - * instead rely solely on the ZIL to persistently store the - * data; i.e. when the pool is frozen, the last synced txg - * value can't be trusted. - */ - if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { - if (lwb != NULL) { - lwb = zil_lwb_commit(zilog, itx, lwb); - } else if (lrc->lrc_txtype == TX_COMMIT) { - ASSERT3P(lwb, ==, NULL); - zil_commit_waiter_link_nolwb( - itx->itx_private, &nolwb_waiters); - } - } - - list_remove(&zilog->zl_itx_commit_list, itx); - zil_itx_destroy(itx); - } - - if (lwb == NULL) { - /* - * This indicates zio_alloc_zil() failed to allocate the - * "next" lwb on-disk. When this happens, we must stall - * the ZIL write pipeline; see the comment within - * zil_commit_writer_stall() for more details. 
- */ - zil_commit_writer_stall(zilog); - - /* - * Additionally, we have to signal and mark the "nolwb" - * waiters as "done" here, since without an lwb, we - * can't do this via zil_lwb_flush_vdevs_done() like - * normal. - */ - zil_commit_waiter_t *zcw; - while (zcw = list_head(&nolwb_waiters)) { - zil_commit_waiter_skip(zcw); - list_remove(&nolwb_waiters, zcw); - } - } else { - ASSERT(list_is_empty(&nolwb_waiters)); - ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - - /* - * At this point, the ZIL block pointed at by the "lwb" - * variable is in one of the following states: "closed" - * or "open". - * - * If its "closed", then no itxs have been committed to - * it, so there's no point in issuing its zio (i.e. - * it's "empty"). - * - * If its "open" state, then it contains one or more - * itxs that eventually need to be committed to stable - * storage. In this case we intentionally do not issue - * the lwb's zio to disk yet, and instead rely on one of - * the following two mechanisms for issuing the zio: - * - * 1. Ideally, there will be more ZIL activity occuring - * on the system, such that this function will be - * immediately called again (not necessarily by the same - * thread) and this lwb's zio will be issued via - * zil_lwb_commit(). This way, the lwb is guaranteed to - * be "full" when it is issued to disk, and we'll make - * use of the lwb's size the best we can. - * - * 2. If there isn't sufficient ZIL activity occuring on - * the system, such that this lwb's zio isn't issued via - * zil_lwb_commit(), zil_commit_waiter() will issue the - * lwb's zio. If this occurs, the lwb is not guaranteed - * to be "full" by the time its zio is issued, and means - * the size of the lwb was "too large" given the amount - * of ZIL activity occuring on the system at that time. - * - * We do this for a couple of reasons: - * - * 1. To try and reduce the number of IOPs needed to - * write the same number of itxs. If an lwb has space - * available in it's buffer for more itxs, and more itxs - * will be committed relatively soon (relative to the - * latency of performing a write), then it's beneficial - * to wait for these "next" itxs. This way, more itxs - * can be committed to stable storage with fewer writes. - * - * 2. To try and use the largest lwb block size that the - * incoming rate of itxs can support. Again, this is to - * try and pack as many itxs into as few lwbs as - * possible, without significantly impacting the latency - * of each individual itx. - */ - } -} - -/* - * This function is responsible for ensuring the passed in commit waiter - * (and associated commit itx) is committed to an lwb. If the waiter is - * not already committed to an lwb, all itxs in the zilog's queue of - * itxs will be processed. The assumption is the passed in waiter's - * commit itx will found in the queue just like the other non-commit - * itxs, such that when the entire queue is processed, the waiter will - * have been commited to an lwb. - * - * The lwb associated with the passed in waiter is not guaranteed to - * have been issued by the time this function completes. If the lwb is - * not issued, we rely on future calls to zil_commit_writer() to issue - * the lwb, or the timeout mechanism found in zil_commit_waiter(). 
- */ -static void -zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) -{ - ASSERT(!MUTEX_HELD(&zilog->zl_lock)); - ASSERT(spa_writeable(zilog->zl_spa)); - - mutex_enter(&zilog->zl_issuer_lock); - - if (zcw->zcw_lwb != NULL || zcw->zcw_done) { - /* - * It's possible that, while we were waiting to acquire - * the "zl_issuer_lock", another thread committed this - * waiter to an lwb. If that occurs, we bail out early, - * without processing any of the zilog's queue of itxs. - * - * On certain workloads and system configurations, the - * "zl_issuer_lock" can become highly contended. In an - * attempt to reduce this contention, we immediately drop - * the lock if the waiter has already been processed. - * - * We've measured this optimization to reduce CPU spent - * contending on this lock by up to 5%, using a system - * with 32 CPUs, low latency storage (~50 usec writes), - * and 1024 threads performing sync writes. - */ - goto out; - } - - zil_get_commit_list(zilog); - zil_prune_commit_list(zilog); - zil_process_commit_list(zilog); - -out: - mutex_exit(&zilog->zl_issuer_lock); -} - -static void -zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) -{ - ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); - ASSERT3B(zcw->zcw_done, ==, B_FALSE); - - lwb_t *lwb = zcw->zcw_lwb; - ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); - - /* - * If the lwb has already been issued by another thread, we can - * immediately return since there's no work to be done (the - * point of this function is to issue the lwb). Additionally, we - * do this prior to acquiring the zl_issuer_lock, to avoid - * acquiring it when it's not necessary to do so. - */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) - return; - - /* - * In order to call zil_lwb_write_issue() we must hold the - * zilog's "zl_issuer_lock". We can't simply acquire that lock, - * since we're already holding the commit waiter's "zcw_lock", - * and those two locks are aquired in the opposite order - * elsewhere. - */ - mutex_exit(&zcw->zcw_lock); - mutex_enter(&zilog->zl_issuer_lock); - mutex_enter(&zcw->zcw_lock); - - /* - * Since we just dropped and re-acquired the commit waiter's - * lock, we have to re-check to see if the waiter was marked - * "done" during that process. If the waiter was marked "done", - * the "lwb" pointer is no longer valid (it can be free'd after - * the waiter is marked "done"), so without this check we could - * wind up with a use-after-free error below. - */ - if (zcw->zcw_done) - goto out; - - ASSERT3P(lwb, ==, zcw->zcw_lwb); - - /* - * We've already checked this above, but since we hadn't acquired - * the zilog's zl_issuer_lock, we have to perform this check a - * second time while holding the lock. - * - * We don't need to hold the zl_lock since the lwb cannot transition - * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb - * _can_ transition from ISSUED to DONE, but it's OK to race with - * that transition since we treat the lwb the same, whether it's in - * the ISSUED or DONE states. - * - * The important thing, is we treat the lwb differently depending on - * if it's ISSUED or OPENED, and block any other threads that might - * attempt to issue this lwb. For that reason we hold the - * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_issue() if the lwb had already been issued. 
- * - * See the comment above the lwb_state_t structure definition for - * more details on the lwb states, and locking requirements. - */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) - goto out; - - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); - - /* - * As described in the comments above zil_commit_waiter() and - * zil_process_commit_list(), we need to issue this lwb's zio - * since we've reached the commit waiter's timeout and it still - * hasn't been issued. - */ - lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); - - IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); - - /* - * Since the lwb's zio hadn't been issued by the time this thread - * reached its timeout, we reset the zilog's "zl_cur_used" field - * to influence the zil block size selection algorithm. - * - * By having to issue the lwb's zio here, it means the size of the - * lwb was too large, given the incoming throughput of itxs. By - * setting "zl_cur_used" to zero, we communicate this fact to the - * block size selection algorithm, so it can take this informaiton - * into account, and potentially select a smaller size for the - * next lwb block that is allocated. - */ - zilog->zl_cur_used = 0; - - if (nlwb == NULL) { - /* - * When zil_lwb_write_issue() returns NULL, this - * indicates zio_alloc_zil() failed to allocate the - * "next" lwb on-disk. When this occurs, the ZIL write - * pipeline must be stalled; see the comment within the - * zil_commit_writer_stall() function for more details. - * - * We must drop the commit waiter's lock prior to - * calling zil_commit_writer_stall() or else we can wind - * up with the following deadlock: - * - * - This thread is waiting for the txg to sync while - * holding the waiter's lock; txg_wait_synced() is - * used within txg_commit_writer_stall(). - * - * - The txg can't sync because it is waiting for this - * lwb's zio callback to call dmu_tx_commit(). - * - * - The lwb's zio callback can't call dmu_tx_commit() - * because it's blocked trying to acquire the waiter's - * lock, which occurs prior to calling dmu_tx_commit() - */ - mutex_exit(&zcw->zcw_lock); - zil_commit_writer_stall(zilog); - mutex_enter(&zcw->zcw_lock); - } - -out: - mutex_exit(&zilog->zl_issuer_lock); - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); -} - -/* - * This function is responsible for performing the following two tasks: - * - * 1. its primary responsibility is to block until the given "commit - * waiter" is considered "done". - * - * 2. its secondary responsibility is to issue the zio for the lwb that - * the given "commit waiter" is waiting on, if this function has - * waited "long enough" and the lwb is still in the "open" state. - * - * Given a sufficient amount of itxs being generated and written using - * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() - * function. If this does not occur, this secondary responsibility will - * ensure the lwb is issued even if there is not other synchronous - * activity on the system. - * - * For more details, see zil_process_commit_list(); more specifically, - * the comment at the bottom of that function. - */ -static void -zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) -{ - ASSERT(!MUTEX_HELD(&zilog->zl_lock)); - ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT(spa_writeable(zilog->zl_spa)); - - mutex_enter(&zcw->zcw_lock); - - /* - * The timeout is scaled based on the lwb latency to avoid - * significantly impacting the latency of each individual itx. 
- * For more details, see the comment at the bottom of the - * zil_process_commit_list() function. - */ - int pct = MAX(zfs_commit_timeout_pct, 1); -#if defined(illumos) || !defined(_KERNEL) - hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; - hrtime_t wakeup = gethrtime() + sleep; -#else - sbintime_t sleep = nstosbt((zilog->zl_last_lwb_latency * pct) / 100); - sbintime_t wakeup = getsbinuptime() + sleep; -#endif - boolean_t timedout = B_FALSE; - - while (!zcw->zcw_done) { - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); - - lwb_t *lwb = zcw->zcw_lwb; - - /* - * Usually, the waiter will have a non-NULL lwb field here, - * but it's possible for it to be NULL as a result of - * zil_commit() racing with spa_sync(). - * - * When zil_clean() is called, it's possible for the itxg - * list (which may be cleaned via a taskq) to contain - * commit itxs. When this occurs, the commit waiters linked - * off of these commit itxs will not be committed to an - * lwb. Additionally, these commit waiters will not be - * marked done until zil_commit_waiter_skip() is called via - * zil_itxg_clean(). - * - * Thus, it's possible for this commit waiter (i.e. the - * "zcw" variable) to be found in this "in between" state; - * where it's "zcw_lwb" field is NULL, and it hasn't yet - * been skipped, so it's "zcw_done" field is still B_FALSE. - */ - IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); - - if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { - ASSERT3B(timedout, ==, B_FALSE); - - /* - * If the lwb hasn't been issued yet, then we - * need to wait with a timeout, in case this - * function needs to issue the lwb after the - * timeout is reached; responsibility (2) from - * the comment above this function. - */ -#if defined(illumos) || !defined(_KERNEL) - clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv, - &zcw->zcw_lock, wakeup, USEC2NSEC(1), - CALLOUT_FLAG_ABSOLUTE); - - if (timeleft >= 0 || zcw->zcw_done) - continue; -#else - int wait_err = cv_timedwait_sbt(&zcw->zcw_cv, - &zcw->zcw_lock, wakeup, SBT_1NS, C_ABSOLUTE); - if (wait_err != EWOULDBLOCK || zcw->zcw_done) - continue; -#endif - - timedout = B_TRUE; - zil_commit_waiter_timeout(zilog, zcw); - - if (!zcw->zcw_done) { - /* - * If the commit waiter has already been - * marked "done", it's possible for the - * waiter's lwb structure to have already - * been freed. Thus, we can only reliably - * make these assertions if the waiter - * isn't done. - */ - ASSERT3P(lwb, ==, zcw->zcw_lwb); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); - } - } else { - /* - * If the lwb isn't open, then it must have already - * been issued. In that case, there's no need to - * use a timeout when waiting for the lwb to - * complete. - * - * Additionally, if the lwb is NULL, the waiter - * will soon be signalled and marked done via - * zil_clean() and zil_itxg_clean(), so no timeout - * is required. 
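[Editor's sketch] The wakeup deadline computed above is just a percentage of the last observed lwb latency. Below is a sketch of that computation in plain nanoseconds; the default of 5 for the percentage tunable is an assumption here, and the real code converts the result to sbintime_t for cv_timedwait_sbt().

#include <stdint.h>
#include <stdio.h>

/*
 * Percentage of the last lwb latency to wait before issuing the lwb
 * ourselves.  5 is assumed here as the usual default for the tunable.
 */
static int commit_timeout_pct = 5;

static uint64_t
commit_waiter_timeout_ns(uint64_t last_lwb_latency_ns)
{
	int pct = commit_timeout_pct > 1 ? commit_timeout_pct : 1;

	return ((last_lwb_latency_ns * pct) / 100);
}

int
main(void)
{
	/* e.g. the previous lwb took 400us to write and flush */
	uint64_t latency = 400000;

	printf("wait %ju ns before issuing the lwb ourselves\n",
	    (uintmax_t)commit_waiter_timeout_ns(latency));
	return (0);
}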
- */ - - IMPLY(lwb != NULL, - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE); - cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); - } - } - - mutex_exit(&zcw->zcw_lock); -} - -static zil_commit_waiter_t * -zil_alloc_commit_waiter() -{ - zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); - - cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&zcw->zcw_node); - zcw->zcw_lwb = NULL; - zcw->zcw_done = B_FALSE; - zcw->zcw_zio_error = 0; - - return (zcw); -} - -static void -zil_free_commit_waiter(zil_commit_waiter_t *zcw) -{ - ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3B(zcw->zcw_done, ==, B_TRUE); - mutex_destroy(&zcw->zcw_lock); - cv_destroy(&zcw->zcw_cv); - kmem_cache_free(zil_zcw_cache, zcw); -} - -/* - * This function is used to create a TX_COMMIT itx and assign it. This - * way, it will be linked into the ZIL's list of synchronous itxs, and - * then later committed to an lwb (or skipped) when - * zil_process_commit_list() is called. - */ -static void -zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) -{ - dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - - itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); - itx->itx_sync = B_TRUE; - itx->itx_private = zcw; - - zil_itx_assign(zilog, itx, tx); - - dmu_tx_commit(tx); -} - -/* - * Commit ZFS Intent Log transactions (itxs) to stable storage. - * - * When writing ZIL transactions to the on-disk representation of the - * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple - * itxs can be committed to a single lwb. Once a lwb is written and - * committed to stable storage (i.e. the lwb is written, and vdevs have - * been flushed), each itx that was committed to that lwb is also - * considered to be committed to stable storage. - * - * When an itx is committed to an lwb, the log record (lr_t) contained - * by the itx is copied into the lwb's zio buffer, and once this buffer - * is written to disk, it becomes an on-disk ZIL block. - * - * As itxs are generated, they're inserted into the ZIL's queue of - * uncommitted itxs. The semantics of zil_commit() are such that it will - * block until all itxs that were in the queue when it was called, are - * committed to stable storage. - * - * If "foid" is zero, this means all "synchronous" and "asynchronous" - * itxs, for all objects in the dataset, will be committed to stable - * storage prior to zil_commit() returning. If "foid" is non-zero, all - * "synchronous" itxs for all objects, but only "asynchronous" itxs - * that correspond to the foid passed in, will be committed to stable - * storage prior to zil_commit() returning. - * - * Generally speaking, when zil_commit() is called, the consumer doesn't - * actually care about _all_ of the uncommitted itxs. Instead, they're - * simply trying to waiting for a specific itx to be committed to disk, - * but the interface(s) for interacting with the ZIL don't allow such - * fine-grained communication. A better interface would allow a consumer - * to create and assign an itx, and then pass a reference to this itx to - * zil_commit(); such that zil_commit() would return as soon as that - * specific itx was committed to disk (instead of waiting for _all_ - * itxs to be committed). 
- * - * When a thread calls zil_commit() a special "commit itx" will be - * generated, along with a corresponding "waiter" for this commit itx. - * zil_commit() will wait on this waiter's CV, such that when the waiter - * is marked done, and signalled, zil_commit() will return. - * - * This commit itx is inserted into the queue of uncommitted itxs. This - * provides an easy mechanism for determining which itxs were in the - * queue prior to zil_commit() having been called, and which itxs were - * added after zil_commit() was called. - * - * The commit it is special; it doesn't have any on-disk representation. - * When a commit itx is "committed" to an lwb, the waiter associated - * with it is linked onto the lwb's list of waiters. Then, when that lwb - * completes, each waiter on the lwb's list is marked done and signalled - * -- allowing the thread waiting on the waiter to return from zil_commit(). - * - * It's important to point out a few critical factors that allow us - * to make use of the commit itxs, commit waiters, per-lwb lists of - * commit waiters, and zio completion callbacks like we're doing: - * - * 1. The list of waiters for each lwb is traversed, and each commit - * waiter is marked "done" and signalled, in the zio completion - * callback of the lwb's zio[*]. - * - * * Actually, the waiters are signalled in the zio completion - * callback of the root zio for the DKIOCFLUSHWRITECACHE commands - * that are sent to the vdevs upon completion of the lwb zio. - * - * 2. When the itxs are inserted into the ZIL's queue of uncommitted - * itxs, the order in which they are inserted is preserved[*]; as - * itxs are added to the queue, they are added to the tail of - * in-memory linked lists. - * - * When committing the itxs to lwbs (to be written to disk), they - * are committed in the same order in which the itxs were added to - * the uncommitted queue's linked list(s); i.e. the linked list of - * itxs to commit is traversed from head to tail, and each itx is - * committed to an lwb in that order. - * - * * To clarify: - * - * - the order of "sync" itxs is preserved w.r.t. other - * "sync" itxs, regardless of the corresponding objects. - * - the order of "async" itxs is preserved w.r.t. other - * "async" itxs corresponding to the same object. - * - the order of "async" itxs is *not* preserved w.r.t. other - * "async" itxs corresponding to different objects. - * - the order of "sync" itxs w.r.t. "async" itxs (or vice - * versa) is *not* preserved, even for itxs that correspond - * to the same object. - * - * For more details, see: zil_itx_assign(), zil_async_to_sync(), - * zil_get_commit_list(), and zil_process_commit_list(). - * - * 3. The lwbs represent a linked list of blocks on disk. Thus, any - * lwb cannot be considered committed to stable storage, until its - * "previous" lwb is also committed to stable storage. This fact, - * coupled with the fact described above, means that itxs are - * committed in (roughly) the order in which they were generated. - * This is essential because itxs are dependent on prior itxs. - * Thus, we *must not* deem an itx as being committed to stable - * storage, until *all* prior itxs have also been committed to - * stable storage. - * - * To enforce this ordering of lwb zio's, while still leveraging as - * much of the underlying storage performance as possible, we rely - * on two fundamental concepts: - * - * 1. 
The creation and issuance of lwb zio's is protected by - * the zilog's "zl_issuer_lock", which ensures only a single - * thread is creating and/or issuing lwb's at a time - * 2. The "previous" lwb is a child of the "current" lwb - * (leveraging the zio parent-child depenency graph) - * - * By relying on this parent-child zio relationship, we can have - * many lwb zio's concurrently issued to the underlying storage, - * but the order in which they complete will be the same order in - * which they were created. - */ -void -zil_commit(zilog_t *zilog, uint64_t foid) -{ - /* - * We should never attempt to call zil_commit on a snapshot for - * a couple of reasons: - * - * 1. A snapshot may never be modified, thus it cannot have any - * in-flight itxs that would have modified the dataset. - * - * 2. By design, when zil_commit() is called, a commit itx will - * be assigned to this zilog; as a result, the zilog will be - * dirtied. We must not dirty the zilog of a snapshot; there's - * checks in the code that enforce this invariant, and will - * cause a panic if it's not upheld. - */ - ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); - - if (zilog->zl_sync == ZFS_SYNC_DISABLED) - return; - - if (!spa_writeable(zilog->zl_spa)) { - /* - * If the SPA is not writable, there should never be any - * pending itxs waiting to be committed to disk. If that - * weren't true, we'd skip writing those itxs out, and - * would break the sematics of zil_commit(); thus, we're - * verifying that truth before we return to the caller. - */ - ASSERT(list_is_empty(&zilog->zl_lwb_list)); - ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); - for (int i = 0; i < TXG_SIZE; i++) - ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); - return; - } - - /* - * If the ZIL is suspended, we don't want to dirty it by calling - * zil_commit_itx_assign() below, nor can we write out - * lwbs like would be done in zil_commit_write(). Thus, we - * simply rely on txg_wait_synced() to maintain the necessary - * semantics, and avoid calling those functions altogether. - */ - if (zilog->zl_suspend > 0) { - txg_wait_synced(zilog->zl_dmu_pool, 0); - return; - } - - zil_commit_impl(zilog, foid); -} - -void -zil_commit_impl(zilog_t *zilog, uint64_t foid) -{ - /* - * Move the "async" itxs for the specified foid to the "sync" - * queues, such that they will be later committed (or skipped) - * to an lwb when zil_process_commit_list() is called. - * - * Since these "async" itxs must be committed prior to this - * call to zil_commit returning, we must perform this operation - * before we call zil_commit_itx_assign(). - */ - zil_async_to_sync(zilog, foid); - - /* - * We allocate a new "waiter" structure which will initially be - * linked to the commit itx using the itx's "itx_private" field. - * Since the commit itx doesn't represent any on-disk state, - * when it's committed to an lwb, rather than copying the its - * lr_t into the lwb's buffer, the commit itx's "waiter" will be - * added to the lwb's list of waiters. Then, when the lwb is - * committed to stable storage, each waiter in the lwb's list of - * waiters will be marked "done", and signalled. - * - * We must create the waiter and assign the commit itx prior to - * calling zil_commit_writer(), or else our specific commit itx - * is not guaranteed to be committed to an lwb prior to calling - * zil_commit_waiter(). 
- */ - zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); - zil_commit_itx_assign(zilog, zcw); - - zil_commit_writer(zilog, zcw); - zil_commit_waiter(zilog, zcw); - - if (zcw->zcw_zio_error != 0) { - /* - * If there was an error writing out the ZIL blocks that - * this thread is waiting on, then we fallback to - * relying on spa_sync() to write out the data this - * thread is waiting on. Obviously this has performance - * implications, but the expectation is for this to be - * an exceptional case, and shouldn't occur often. - */ - DTRACE_PROBE2(zil__commit__io__error, - zilog_t *, zilog, zil_commit_waiter_t *, zcw); - txg_wait_synced(zilog->zl_dmu_pool, 0); - } - - zil_free_commit_waiter(zcw); -} - -/* - * Called in syncing context to free committed log blocks and update log header. - */ -void -zil_sync(zilog_t *zilog, dmu_tx_t *tx) -{ - zil_header_t *zh = zil_header_in_syncing_context(zilog); - uint64_t txg = dmu_tx_get_txg(tx); - spa_t *spa = zilog->zl_spa; - uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; - lwb_t *lwb; - - /* - * We don't zero out zl_destroy_txg, so make sure we don't try - * to destroy it twice. - */ - if (spa_sync_pass(spa) != 1) - return; - - mutex_enter(&zilog->zl_lock); - - ASSERT(zilog->zl_stop_sync == 0); - - if (*replayed_seq != 0) { - ASSERT(zh->zh_replay_seq < *replayed_seq); - zh->zh_replay_seq = *replayed_seq; - *replayed_seq = 0; - } - - if (zilog->zl_destroy_txg == txg) { - blkptr_t blk = zh->zh_log; - - ASSERT(list_head(&zilog->zl_lwb_list) == NULL); - - bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); - - if (zilog->zl_keep_first) { - /* - * If this block was part of log chain that couldn't - * be claimed because a device was missing during - * zil_claim(), but that device later returns, - * then this block could erroneously appear valid. - * To guard against this, assign a new GUID to the new - * log chain so it doesn't matter what blk points to. - */ - zil_init_log_chain(zilog, &blk); - zh->zh_log = blk; - } - } - - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - zh->zh_log = lwb->lwb_blk; - if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) - break; - list_remove(&zilog->zl_lwb_list, lwb); - zio_free(spa, txg, &lwb->lwb_blk); - zil_free_lwb(zilog, lwb); - - /* - * If we don't have anything left in the lwb list then - * we've had an allocation failure and we need to zero - * out the zil_header blkptr so that we don't end - * up freeing the same block twice. 
- */ - if (list_head(&zilog->zl_lwb_list) == NULL) - BP_ZERO(&zh->zh_log); - } - mutex_exit(&zilog->zl_lock); -} - -/* ARGSUSED */ -static int -zil_lwb_cons(void *vbuf, void *unused, int kmflag) -{ - lwb_t *lwb = vbuf; - list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), - offsetof(zil_commit_waiter_t, zcw_node)); - avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, - sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); - mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); - return (0); -} - -/* ARGSUSED */ -static void -zil_lwb_dest(void *vbuf, void *unused) -{ - lwb_t *lwb = vbuf; - mutex_destroy(&lwb->lwb_vdev_lock); - avl_destroy(&lwb->lwb_vdev_tree); - list_destroy(&lwb->lwb_waiters); -} - -void -zil_init(void) -{ - zil_lwb_cache = kmem_cache_create("zil_lwb_cache", - sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); - - zil_zcw_cache = kmem_cache_create("zil_zcw_cache", - sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -zil_fini(void) -{ - kmem_cache_destroy(zil_zcw_cache); - kmem_cache_destroy(zil_lwb_cache); -} - -void -zil_set_sync(zilog_t *zilog, uint64_t sync) -{ - zilog->zl_sync = sync; -} - -void -zil_set_logbias(zilog_t *zilog, uint64_t logbias) -{ - zilog->zl_logbias = logbias; -} - -zilog_t * -zil_alloc(objset_t *os, zil_header_t *zh_phys) -{ - zilog_t *zilog; - - zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); - - zilog->zl_header = zh_phys; - zilog->zl_os = os; - zilog->zl_spa = dmu_objset_spa(os); - zilog->zl_dmu_pool = dmu_objset_pool(os); - zilog->zl_destroy_txg = TXG_INITIAL - 1; - zilog->zl_logbias = dmu_objset_logbias(os); - zilog->zl_sync = dmu_objset_syncprop(os); - zilog->zl_dirty_max_txg = 0; - zilog->zl_last_lwb_opened = NULL; - zilog->zl_last_lwb_latency = 0; - zilog->zl_max_block_size = zil_maxblocksize; - - mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); - - for (int i = 0; i < TXG_SIZE; i++) { - mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, - MUTEX_DEFAULT, NULL); - } - - list_create(&zilog->zl_lwb_list, sizeof (lwb_t), - offsetof(lwb_t, lwb_node)); - - list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), - offsetof(itx_t, itx_node)); - - cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); - - return (zilog); -} - -void -zil_free(zilog_t *zilog) -{ - zilog->zl_stop_sync = 1; - - ASSERT0(zilog->zl_suspend); - ASSERT0(zilog->zl_suspending); - - ASSERT(list_is_empty(&zilog->zl_lwb_list)); - list_destroy(&zilog->zl_lwb_list); - - ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); - list_destroy(&zilog->zl_itx_commit_list); - - for (int i = 0; i < TXG_SIZE; i++) { - /* - * It's possible for an itx to be generated that doesn't dirty - * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() - * callback to remove the entry. We remove those here. - * - * Also free up the ziltest itxs. - */ - if (zilog->zl_itxg[i].itxg_itxs) - zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); - mutex_destroy(&zilog->zl_itxg[i].itxg_lock); - } - - mutex_destroy(&zilog->zl_issuer_lock); - mutex_destroy(&zilog->zl_lock); - - cv_destroy(&zilog->zl_cv_suspend); - - kmem_free(zilog, sizeof (zilog_t)); -} - -/* - * Open an intent log. 
- */ -zilog_t * -zil_open(objset_t *os, zil_get_data_t *get_data) -{ - zilog_t *zilog = dmu_objset_zil(os); - - ASSERT3P(zilog->zl_get_data, ==, NULL); - ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); - ASSERT(list_is_empty(&zilog->zl_lwb_list)); - - zilog->zl_get_data = get_data; - - return (zilog); -} - -/* - * Close an intent log. - */ -void -zil_close(zilog_t *zilog) -{ - lwb_t *lwb; - uint64_t txg; - - if (!dmu_objset_is_snapshot(zilog->zl_os)) { - zil_commit(zilog, 0); - } else { - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); - ASSERT0(zilog->zl_dirty_max_txg); - ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); - } - - mutex_enter(&zilog->zl_lock); - lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) - txg = zilog->zl_dirty_max_txg; - else - txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); - mutex_exit(&zilog->zl_lock); - - /* - * We need to use txg_wait_synced() to wait long enough for the - * ZIL to be clean, and to wait for all pending lwbs to be - * written out. - */ - if (txg) - txg_wait_synced(zilog->zl_dmu_pool, txg); - - if (zilog_is_dirty(zilog)) - zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); - if (txg < spa_freeze_txg(zilog->zl_spa)) - VERIFY(!zilog_is_dirty(zilog)); - - zilog->zl_get_data = NULL; - - /* - * We should have only one lwb left on the list; remove it now. - */ - mutex_enter(&zilog->zl_lock); - lwb = list_head(&zilog->zl_lwb_list); - if (lwb != NULL) { - ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - list_remove(&zilog->zl_lwb_list, lwb); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zil_free_lwb(zilog, lwb); - } - mutex_exit(&zilog->zl_lock); -} - -static char *suspend_tag = "zil suspending"; - -/* - * Suspend an intent log. While in suspended mode, we still honor - * synchronous semantics, but we rely on txg_wait_synced() to do it. - * On old version pools, we suspend the log briefly when taking a - * snapshot so that it will have an empty intent log. - * - * Long holds are not really intended to be used the way we do here -- - * held for such a short time. A concurrent caller of dsl_dataset_long_held() - * could fail. Therefore we take pains to only put a long hold if it is - * actually necessary. Fortunately, it will only be necessary if the - * objset is currently mounted (or the ZVOL equivalent). In that case it - * will already have a long hold, so we are not really making things any worse. - * - * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or - * zvol_state_t), and use their mechanism to prevent their hold from being - * dropped (e.g. VFS_HOLD()). However, that would be even more pain for - * very little gain. - * - * if cookiep == NULL, this does both the suspend & resume. - * Otherwise, it returns with the dataset "long held", and the cookie - * should be passed into zil_resume(). - */ -int -zil_suspend(const char *osname, void **cookiep) -{ - objset_t *os; - zilog_t *zilog; - const zil_header_t *zh; - int error; - - error = dmu_objset_hold(osname, suspend_tag, &os); - if (error != 0) - return (error); - zilog = dmu_objset_zil(os); - - mutex_enter(&zilog->zl_lock); - zh = zilog->zl_header; - - if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ - mutex_exit(&zilog->zl_lock); - dmu_objset_rele(os, suspend_tag); - return (SET_ERROR(EBUSY)); - } - - /* - * Don't put a long hold in the cases where we can avoid it. This - * is when there is no cookie so we are doing a suspend & resume - * (i.e. 
called from zil_vdev_offline()), and there's nothing to do - * for the suspend because it's already suspended, or there's no ZIL. - */ - if (cookiep == NULL && !zilog->zl_suspending && - (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { - mutex_exit(&zilog->zl_lock); - dmu_objset_rele(os, suspend_tag); - return (0); - } - - dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); - dsl_pool_rele(dmu_objset_pool(os), suspend_tag); - - zilog->zl_suspend++; - - if (zilog->zl_suspend > 1) { - /* - * Someone else is already suspending it. - * Just wait for them to finish. - */ - - while (zilog->zl_suspending) - cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); - mutex_exit(&zilog->zl_lock); - - if (cookiep == NULL) - zil_resume(os); - else - *cookiep = os; - return (0); - } - - /* - * If there is no pointer to an on-disk block, this ZIL must not - * be active (e.g. filesystem not mounted), so there's nothing - * to clean up. - */ - if (BP_IS_HOLE(&zh->zh_log)) { - ASSERT(cookiep != NULL); /* fast path already handled */ - - *cookiep = os; - mutex_exit(&zilog->zl_lock); - return (0); - } - - zilog->zl_suspending = B_TRUE; - mutex_exit(&zilog->zl_lock); - - /* - * We need to use zil_commit_impl to ensure we wait for all - * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed - * to disk before proceeding. If we used zil_commit instead, it - * would just call txg_wait_synced(), because zl_suspend is set. - * txg_wait_synced() doesn't wait for these lwb's to be - * LWB_STATE_FLUSH_DONE before returning. - */ - zil_commit_impl(zilog, 0); - - /* - * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we - * use txg_wait_synced() to ensure the data from the zilog has - * migrated to the main pool before calling zil_destroy(). - */ - txg_wait_synced(zilog->zl_dmu_pool, 0); - - zil_destroy(zilog, B_FALSE); - - mutex_enter(&zilog->zl_lock); - zilog->zl_suspending = B_FALSE; - cv_broadcast(&zilog->zl_cv_suspend); - mutex_exit(&zilog->zl_lock); - - if (cookiep == NULL) - zil_resume(os); - else - *cookiep = os; - return (0); -} - -void -zil_resume(void *cookie) -{ - objset_t *os = cookie; - zilog_t *zilog = dmu_objset_zil(os); - - mutex_enter(&zilog->zl_lock); - ASSERT(zilog->zl_suspend != 0); - zilog->zl_suspend--; - mutex_exit(&zilog->zl_lock); - dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); - dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); -} - -typedef struct zil_replay_arg { - zil_replay_func_t **zr_replay; - void *zr_arg; - boolean_t zr_byteswap; - char *zr_lr; -} zil_replay_arg_t; - -static int -zil_replay_error(zilog_t *zilog, lr_t *lr, int error) -{ - char name[ZFS_MAX_DATASET_NAME_LEN]; - - zilog->zl_replaying_seq--; /* didn't actually replay this one */ - - dmu_objset_name(zilog->zl_os, name); - - cmn_err(CE_WARN, "ZFS replay transaction error %d, " - "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, - (u_longlong_t)lr->lrc_seq, - (u_longlong_t)(lr->lrc_txtype & ~TX_CI), - (lr->lrc_txtype & TX_CI) ? 
"CI" : ""); - - return (error); -} - -static int -zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) -{ - zil_replay_arg_t *zr = zra; - const zil_header_t *zh = zilog->zl_header; - uint64_t reclen = lr->lrc_reclen; - uint64_t txtype = lr->lrc_txtype; - int error = 0; - - zilog->zl_replaying_seq = lr->lrc_seq; - - if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ - return (0); - - if (lr->lrc_txg < claim_txg) /* already committed */ - return (0); - - /* Strip case-insensitive bit, still present in log record */ - txtype &= ~TX_CI; - - if (txtype == 0 || txtype >= TX_MAX_TYPE) - return (zil_replay_error(zilog, lr, EINVAL)); - - /* - * If this record type can be logged out of order, the object - * (lr_foid) may no longer exist. That's legitimate, not an error. - */ - if (TX_OOO(txtype)) { - error = dmu_object_info(zilog->zl_os, - LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); - if (error == ENOENT || error == EEXIST) - return (0); - } - - /* - * Make a copy of the data so we can revise and extend it. - */ - bcopy(lr, zr->zr_lr, reclen); - - /* - * If this is a TX_WRITE with a blkptr, suck in the data. - */ - if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { - error = zil_read_log_data(zilog, (lr_write_t *)lr, - zr->zr_lr + reclen); - if (error != 0) - return (zil_replay_error(zilog, lr, error)); - } - - /* - * The log block containing this lr may have been byteswapped - * so that we can easily examine common fields like lrc_txtype. - * However, the log is a mix of different record types, and only the - * replay vectors know how to byteswap their records. Therefore, if - * the lr was byteswapped, undo it before invoking the replay vector. - */ - if (zr->zr_byteswap) - byteswap_uint64_array(zr->zr_lr, reclen); - - /* - * We must now do two things atomically: replay this log record, - * and update the log header sequence number to reflect the fact that - * we did so. At the end of each replay function the sequence number - * is updated if we are in replay mode. - */ - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); - if (error != 0) { - /* - * The DMU's dnode layer doesn't see removes until the txg - * commits, so a subsequent claim can spuriously fail with - * EEXIST. So if we receive any error we try syncing out - * any removes then retry the transaction. Note that we - * specify B_FALSE for byteswap now, so we don't do it twice. - */ - txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); - if (error != 0) - return (zil_replay_error(zilog, lr, error)); - } - return (0); -} - -/* ARGSUSED */ -static int -zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - zilog->zl_replay_blks++; - - return (0); -} - -/* - * If this dataset has a non-empty intent log, replay it and destroy it. - */ -void -zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) -{ - zilog_t *zilog = dmu_objset_zil(os); - const zil_header_t *zh = zilog->zl_header; - zil_replay_arg_t zr; - - if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { - zil_destroy(zilog, B_TRUE); - return; - } - - zr.zr_replay = replay_func; - zr.zr_arg = arg; - zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); - zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); - - /* - * Wait for in-progress removes to sync before starting replay. 
- */ - txg_wait_synced(zilog->zl_dmu_pool, 0); - - zilog->zl_replay = B_TRUE; - zilog->zl_replay_time = ddi_get_lbolt(); - ASSERT(zilog->zl_replay_blks == 0); - (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, - zh->zh_claim_txg); - kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); - - zil_destroy(zilog, B_FALSE); - txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - zilog->zl_replay = B_FALSE; -} - -boolean_t -zil_replaying(zilog_t *zilog, dmu_tx_t *tx) -{ - if (zilog->zl_sync == ZFS_SYNC_DISABLED) - return (B_TRUE); - - if (zilog->zl_replay) { - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = - zilog->zl_replaying_seq; - return (B_TRUE); - } - - return (B_FALSE); -} - -/* ARGSUSED */ -int -zil_reset(const char *osname, void *arg) -{ - int error; - - error = zil_suspend(osname, NULL); - if (error != 0) - return (SET_ERROR(EEXIST)); - return (0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ /dev/null @@ -1,4386 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2017, Intel Corporation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS ZIO"); -#if defined(__amd64__) -static int zio_use_uma = 1; -#else -static int zio_use_uma = 0; -#endif -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, - "Use uma(9) for ZIO allocations"); -static int zio_exclude_metadata = 0; -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, - "Exclude metadata buffers from dumps as well"); - -zio_trim_stats_t zio_trim_stats = { - { "bytes", KSTAT_DATA_UINT64, - "Number of bytes successfully TRIMmed" }, - { "success", KSTAT_DATA_UINT64, - "Number of successful TRIM requests" }, - { "unsupported", KSTAT_DATA_UINT64, - "Number of TRIM requests that failed because TRIM is not supported" }, - { "failed", KSTAT_DATA_UINT64, - "Number of TRIM requests that failed for reasons other than not supported" }, -}; - -static kstat_t *zio_trim_ksp; - -/* - * ========================================================================== - * I/O type descriptions - * ========================================================================== - */ -const char *zio_type_name[ZIO_TYPES] = { - "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", - "zio_ioctl" -}; - -boolean_t zio_dva_throttle_enabled = B_TRUE; -SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RWTUN, - &zio_dva_throttle_enabled, 0, "Enable allocation throttling"); - -/* - * ========================================================================== - * I/O kmem caches - * ========================================================================== - */ -kmem_cache_t *zio_cache; -kmem_cache_t *zio_link_cache; -kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; -kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; - -#ifdef _KERNEL -extern vmem_t *zio_alloc_arena; -#endif - -#define BP_SPANB(indblkshift, level) \ - (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) -#define COMPARE_META_LEVEL 0x80000000ul -/* - * The following actions directly effect the spa's sync-to-convergence logic. - * The values below define the sync pass when we start performing the action. - * Care should be taken when changing these values as they directly impact - * spa_sync() performance. Tuning these values may introduce subtle performance - * pathologies and should only be done in the context of performance analysis. - * These tunables will eventually be removed and replaced with #defines once - * enough analysis has been done to determine optimal values. - * - * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that - * regular blocks are not deferred. 
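/*
 * Worked example for the BP_SPANB() macro defined above, assuming
 * SPA_BLKPTRSHIFT is 7 (128-byte block pointers) and a 128K indirect
 * block (indblkshift == 17): each indirect level then covers
 * 2^(17 - 7) = 1024 times as many blocks as the level below it, i.e.
 * BP_SPANB(17, 0) == 1, BP_SPANB(17, 1) == 1024, BP_SPANB(17, 2) ==
 * 1048576.  example_bp_spanb() is a hypothetical helper used only to
 * illustrate the arithmetic; the value of SPA_BLKPTRSHIFT is an
 * assumption here.
 */
static uint64_t
example_bp_spanb(void)
{
	/* 1 << (2 * (17 - 7)) == 1 << 20 == 1048576 */
	return (BP_SPANB(17, 2));
}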
- */ -int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ -SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, - &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); -int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ -SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, - &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); -int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ -SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, - &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); - -/* - * An allocating zio is one that either currently has the DVA allocate - * stage set or will have it later in its lifetime. - */ -#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) - -boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; - -#ifdef illumos -#ifdef ZFS_DEBUG -int zio_buf_debug_limit = 16384; -#else -int zio_buf_debug_limit = 0; -#endif -#endif - -static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); - -void -zio_init(void) -{ - size_t c; - zio_cache = kmem_cache_create("zio_cache", - sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - zio_link_cache = kmem_cache_create("zio_link_cache", - sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - if (!zio_use_uma) - goto out; - - /* - * For small buffers, we want a cache for each multiple of - * SPA_MINBLOCKSIZE. For larger buffers, we want a cache - * for each quarter-power of 2. - */ - for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { - size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t p2 = size; - size_t align = 0; - int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0; - - while (!ISP2(p2)) - p2 &= p2 - 1; - -#ifdef illumos -#ifndef _KERNEL - /* - * If we are using watchpoints, put each buffer on its own page, - * to eliminate the performance overhead of trapping to the - * kernel when modifying a non-watched buffer that shares the - * page with a watched buffer. - */ - if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) - continue; -#endif -#endif /* illumos */ - if (size <= 4 * SPA_MINBLOCKSIZE) { - align = SPA_MINBLOCKSIZE; - } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = MIN(p2 >> 2, PAGESIZE); - } - - if (align != 0) { - char name[36]; - (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, cflags); - - /* - * Since zio_data bufs do not appear in crash dumps, we - * pass KMC_NOTOUCH so that no allocator metadata is - * stored with the buffers. 
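/*
 * Sketch of the size-to-cache mapping used by zio_buf_alloc() and
 * zio_data_buf_alloc() below, assuming SPA_MINBLOCKSHIFT is 9
 * (SPA_MINBLOCKSIZE == 512).  Slot c serves sizes ((c << 9) + 1)
 * through ((c + 1) << 9); slots whose exact size got no dedicated
 * cache are backfilled with the next larger cache by the loop at the
 * end of zio_init().  example_zio_cache_slot() is a hypothetical
 * helper, not part of the ZFS sources.
 */
static size_t
example_zio_cache_slot(size_t size)
{
	/* size 512 -> slot 0, 4096 -> slot 7, 4097 -> slot 8 */
	return ((size - 1) >> 9);
}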
- */ - (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); - zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, - cflags | KMC_NOTOUCH | KMC_NODEBUG); - } - } - - while (--c != 0) { - ASSERT(zio_buf_cache[c] != NULL); - if (zio_buf_cache[c - 1] == NULL) - zio_buf_cache[c - 1] = zio_buf_cache[c]; - - ASSERT(zio_data_buf_cache[c] != NULL); - if (zio_data_buf_cache[c - 1] == NULL) - zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; - } -out: - - zio_inject_init(); - - zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", - KSTAT_TYPE_NAMED, - sizeof(zio_trim_stats) / sizeof(kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (zio_trim_ksp != NULL) { - zio_trim_ksp->ks_data = &zio_trim_stats; - kstat_install(zio_trim_ksp); - } -} - -void -zio_fini(void) -{ - size_t c; - kmem_cache_t *last_cache = NULL; - kmem_cache_t *last_data_cache = NULL; - - for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { - if (zio_buf_cache[c] != last_cache) { - last_cache = zio_buf_cache[c]; - kmem_cache_destroy(zio_buf_cache[c]); - } - zio_buf_cache[c] = NULL; - - if (zio_data_buf_cache[c] != last_data_cache) { - last_data_cache = zio_data_buf_cache[c]; - kmem_cache_destroy(zio_data_buf_cache[c]); - } - zio_data_buf_cache[c] = NULL; - } - - kmem_cache_destroy(zio_link_cache); - kmem_cache_destroy(zio_cache); - - zio_inject_fini(); - - if (zio_trim_ksp != NULL) { - kstat_delete(zio_trim_ksp); - zio_trim_ksp = NULL; - } -} - -/* - * ========================================================================== - * Allocate and free I/O buffers - * ========================================================================== - */ - -/* - * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a - * crashdump if the kernel panics, so use it judiciously. Obviously, it's - * useful to inspect ZFS metadata, but if possible, we should avoid keeping - * excess / transient data in-core during a crashdump. - */ -void * -zio_buf_alloc(size_t size) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - int flags = zio_exclude_metadata ? KM_NODEBUG : 0; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - if (zio_use_uma) - return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); - else - return (kmem_alloc(size, KM_SLEEP|flags)); -} - -/* - * Use zio_data_buf_alloc to allocate data. The data will not appear in a - * crashdump if the kernel panics. This exists so that we will limit the amount - * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount - * of kernel heap dumped to disk when the kernel panics) - */ -void * -zio_data_buf_alloc(size_t size) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - if (zio_use_uma) - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); - else - return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); -} - -void -zio_buf_free(void *buf, size_t size) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - if (zio_use_uma) - kmem_cache_free(zio_buf_cache[c], buf); - else - kmem_free(buf, size); -} - -void -zio_data_buf_free(void *buf, size_t size) -{ - size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; - - VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - - if (zio_use_uma) - kmem_cache_free(zio_data_buf_cache[c], buf); - else - kmem_free(buf, size); -} - -/* - * ========================================================================== - * Push and pop I/O transform buffers - * ========================================================================== - */ -void -zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, - zio_transform_func_t *transform) -{ - zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - - /* - * Ensure that anyone expecting this zio to contain a linear ABD isn't - * going to get a nasty surprise when they try to access the data. - */ -#ifdef illumos - IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); -#else - IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd), - abd_is_linear(data)); -#endif - - zt->zt_orig_abd = zio->io_abd; - zt->zt_orig_size = zio->io_size; - zt->zt_bufsize = bufsize; - zt->zt_transform = transform; - - zt->zt_next = zio->io_transform_stack; - zio->io_transform_stack = zt; - - zio->io_abd = data; - zio->io_size = size; -} - -void -zio_pop_transforms(zio_t *zio) -{ - zio_transform_t *zt; - - while ((zt = zio->io_transform_stack) != NULL) { - if (zt->zt_transform != NULL) - zt->zt_transform(zio, - zt->zt_orig_abd, zt->zt_orig_size); - - if (zt->zt_bufsize != 0) - abd_free(zio->io_abd); - - zio->io_abd = zt->zt_orig_abd; - zio->io_size = zt->zt_orig_size; - zio->io_transform_stack = zt->zt_next; - - kmem_free(zt, sizeof (zio_transform_t)); - } -} - -/* - * ========================================================================== - * I/O transform callbacks for subblocks and decompression - * ========================================================================== - */ -static void -zio_subblock(zio_t *zio, abd_t *data, uint64_t size) -{ - ASSERT(zio->io_size > size); - - if (zio->io_type == ZIO_TYPE_READ) - abd_copy(data, zio->io_abd, size); -} - -static void -zio_decompress(zio_t *zio, abd_t *data, uint64_t size) -{ - if (zio->io_error == 0) { - void *tmp = abd_borrow_buf(data, size); - int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_abd, tmp, zio->io_size, size); - abd_return_buf_copy(data, tmp, size); - - if (ret != 0) - zio->io_error = SET_ERROR(EIO); - } -} - -/* - * ========================================================================== - * I/O parent/child relationships and pipeline interlocks - * ========================================================================== - */ -zio_t * -zio_walk_parents(zio_t *cio, zio_link_t **zl) -{ - list_t *pl = &cio->io_parent_list; - - *zl = (*zl == NULL) ? 
list_head(pl) : list_next(pl, *zl); - if (*zl == NULL) - return (NULL); - - ASSERT((*zl)->zl_child == cio); - return ((*zl)->zl_parent); -} - -zio_t * -zio_walk_children(zio_t *pio, zio_link_t **zl) -{ - list_t *cl = &pio->io_child_list; - - ASSERT(MUTEX_HELD(&pio->io_lock)); - - *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); - if (*zl == NULL) - return (NULL); - - ASSERT((*zl)->zl_parent == pio); - return ((*zl)->zl_child); -} - -zio_t * -zio_unique_parent(zio_t *cio) -{ - zio_link_t *zl = NULL; - zio_t *pio = zio_walk_parents(cio, &zl); - - VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); - return (pio); -} - -void -zio_add_child(zio_t *pio, zio_t *cio) -{ - zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); - - /* - * Logical I/Os can have logical, gang, or vdev children. - * Gang I/Os can have gang or vdev children. - * Vdev I/Os can only have vdev children. - * The following ASSERT captures all of these constraints. - */ - ASSERT3S(cio->io_child_type, <=, pio->io_child_type); - - zl->zl_parent = pio; - zl->zl_child = cio; - - mutex_enter(&pio->io_lock); - mutex_enter(&cio->io_lock); - - ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); - - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; - - list_insert_head(&pio->io_child_list, zl); - list_insert_head(&cio->io_parent_list, zl); - - pio->io_child_count++; - cio->io_parent_count++; - - mutex_exit(&cio->io_lock); - mutex_exit(&pio->io_lock); -} - -static void -zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) -{ - ASSERT(zl->zl_parent == pio); - ASSERT(zl->zl_child == cio); - - mutex_enter(&pio->io_lock); - mutex_enter(&cio->io_lock); - - list_remove(&pio->io_child_list, zl); - list_remove(&cio->io_parent_list, zl); - - pio->io_child_count--; - cio->io_parent_count--; - - mutex_exit(&cio->io_lock); - mutex_exit(&pio->io_lock); - kmem_cache_free(zio_link_cache, zl); -} - -static boolean_t -zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) -{ - boolean_t waiting = B_FALSE; - - mutex_enter(&zio->io_lock); - ASSERT(zio->io_stall == NULL); - for (int c = 0; c < ZIO_CHILD_TYPES; c++) { - if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) - continue; - - uint64_t *countp = &zio->io_children[c][wait]; - if (*countp != 0) { - zio->io_stage >>= 1; - ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); - zio->io_stall = countp; - waiting = B_TRUE; - break; - } - } - mutex_exit(&zio->io_lock); - return (waiting); -} - -static void -zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, - zio_t **next_to_executep) -{ - uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; - int *errorp = &pio->io_child_error[zio->io_child_type]; - - mutex_enter(&pio->io_lock); - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) - *errorp = zio_worst_error(*errorp, zio->io_error); - pio->io_reexecute |= zio->io_reexecute; - ASSERT3U(*countp, >, 0); - - (*countp)--; - - if (*countp == 0 && pio->io_stall == countp) { - zio_taskq_type_t type = - pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : - ZIO_TASKQ_INTERRUPT; - pio->io_stall = NULL; - mutex_exit(&pio->io_lock); - - /* - * If we can tell the caller to execute this parent next, do - * so. Otherwise dispatch the parent zio as its own task. - * - * Having the caller execute the parent when possible reduces - * locking on the zio taskq's, reduces context switch - * overhead, and has no recursion penalty. 
Note that one - * read from disk typically causes at least 3 zio's: a - * zio_null(), the logical zio_read(), and then a physical - * zio. When the physical ZIO completes, we are able to call - * zio_done() on all 3 of these zio's from one invocation of - * zio_execute() by returning the parent back to - * zio_execute(). Since the parent isn't executed until this - * thread returns back to zio_execute(), the caller should do - * so promptly. - * - * In other cases, dispatching the parent prevents - * overflowing the stack when we have deeply nested - * parent-child relationships, as we do with the "mega zio" - * of writes for spa_sync(), and the chain of ZIL blocks. - */ - if (next_to_executep != NULL && *next_to_executep == NULL) { - *next_to_executep = pio; - } else { - zio_taskq_dispatch(pio, type, B_FALSE); - } - } else { - mutex_exit(&pio->io_lock); - } -} - -static void -zio_inherit_child_errors(zio_t *zio, enum zio_child c) -{ - if (zio->io_child_error[c] != 0 && zio->io_error == 0) - zio->io_error = zio->io_child_error[c]; -} - -int -zio_bookmark_compare(const void *x1, const void *x2) -{ - const zio_t *z1 = x1; - const zio_t *z2 = x2; - - if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) - return (-1); - if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) - return (1); - - if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) - return (-1); - if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) - return (1); - - if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) - return (-1); - if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) - return (1); - - if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) - return (-1); - if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) - return (1); - - if (z1 < z2) - return (-1); - if (z1 > z2) - return (1); - - return (0); -} - -/* - * ========================================================================== - * Create the various types of I/O (read, write, free, etc) - * ========================================================================== - */ -static zio_t * -zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, - void *private, zio_type_t type, zio_priority_t priority, - enum zio_flag flags, vdev_t *vd, uint64_t offset, - const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) -{ - zio_t *zio; - - IMPLY(type != ZIO_TYPE_FREE, psize <= SPA_MAXBLOCKSIZE); - ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); - ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); - - ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); - ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); - ASSERT(vd || stage == ZIO_STAGE_OPEN); - - IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); - - zio = kmem_cache_alloc(zio_cache, KM_SLEEP); - bzero(zio, sizeof (zio_t)); - - mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); -#if defined(__FreeBSD__) && defined(_KERNEL) - callout_init(&zio->io_timer, 1); -#endif - - list_create(&zio->io_parent_list, sizeof (zio_link_t), - offsetof(zio_link_t, zl_parent_node)); - list_create(&zio->io_child_list, sizeof (zio_link_t), - offsetof(zio_link_t, zl_child_node)); - metaslab_trace_init(&zio->io_alloc_list); - - if (vd != NULL) - zio->io_child_type = ZIO_CHILD_VDEV; - else if (flags & ZIO_FLAG_GANG_CHILD) - zio->io_child_type = ZIO_CHILD_GANG; - else if (flags & ZIO_FLAG_DDT_CHILD) - zio->io_child_type = ZIO_CHILD_DDT; - else - 
zio->io_child_type = ZIO_CHILD_LOGICAL; - - if (bp != NULL) { - zio->io_bp = (blkptr_t *)bp; - zio->io_bp_copy = *bp; - zio->io_bp_orig = *bp; - if (type != ZIO_TYPE_WRITE || - zio->io_child_type == ZIO_CHILD_DDT) - zio->io_bp = &zio->io_bp_copy; /* so caller can free */ - if (zio->io_child_type == ZIO_CHILD_LOGICAL) - zio->io_logical = zio; - if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) - pipeline |= ZIO_GANG_STAGES; - } - - zio->io_spa = spa; - zio->io_txg = txg; - zio->io_done = done; - zio->io_private = private; - zio->io_type = type; - zio->io_priority = priority; - zio->io_vd = vd; - zio->io_offset = offset; - zio->io_orig_abd = zio->io_abd = data; - zio->io_orig_size = zio->io_size = psize; - zio->io_lsize = lsize; - zio->io_orig_flags = zio->io_flags = flags; - zio->io_orig_stage = zio->io_stage = stage; - zio->io_orig_pipeline = zio->io_pipeline = pipeline; - zio->io_pipeline_trace = ZIO_STAGE_OPEN; - - zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); - zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); - - if (zb != NULL) - zio->io_bookmark = *zb; - - if (pio != NULL) { - if (zio->io_metaslab_class == NULL) - zio->io_metaslab_class = pio->io_metaslab_class; - if (zio->io_logical == NULL) - zio->io_logical = pio->io_logical; - if (zio->io_child_type == ZIO_CHILD_GANG) - zio->io_gang_leader = pio->io_gang_leader; - zio_add_child(pio, zio); - } - - return (zio); -} - -static void -zio_destroy(zio_t *zio) -{ -#ifdef __FreeBSD__ - KASSERT(!(callout_active(&zio->io_timer) || - callout_pending(&zio->io_timer)), ("zio_destroy: timer active")); -#endif - metaslab_trace_fini(&zio->io_alloc_list); - list_destroy(&zio->io_parent_list); - list_destroy(&zio->io_child_list); - mutex_destroy(&zio->io_lock); - cv_destroy(&zio->io_cv); - kmem_cache_free(zio_cache, zio); -} - -zio_t * -zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, - void *private, enum zio_flag flags) -{ - zio_t *zio; - - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); - - return (zio); -} - -zio_t * -zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) -{ - return (zio_null(NULL, spa, NULL, done, private, flags)); -} - -void -zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) -{ - if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { - zfs_panic_recover("blkptr at %p has invalid TYPE %llu", - bp, (longlong_t)BP_GET_TYPE(bp)); - } - if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || - BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { - zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", - bp, (longlong_t)BP_GET_CHECKSUM(bp)); - } - if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || - BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { - zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", - bp, (longlong_t)BP_GET_COMPRESS(bp)); - } - if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { - zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", - bp, (longlong_t)BP_GET_LSIZE(bp)); - } - if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { - zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", - bp, (longlong_t)BP_GET_PSIZE(bp)); - } - - if (BP_IS_EMBEDDED(bp)) { - if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { - zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", - bp, (longlong_t)BPE_GET_ETYPE(bp)); - } - } - - /* - * Do not verify individual DVAs if the config is not trusted. This - * will be done once the zio is executed in vdev_mirror_map_alloc. 
- */ - if (!spa->spa_trust_config) - return; - - /* - * Pool-specific checks. - * - * Note: it would be nice to verify that the blk_birth and - * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() - * allows the birth time of log blocks (and dmu_sync()-ed blocks - * that are in the log) to be arbitrarily large. - */ - for (int i = 0; i < BP_GET_NDVAS(bp); i++) { - uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); - if (vdevid >= spa->spa_root_vdev->vdev_children) { - zfs_panic_recover("blkptr at %p DVA %u has invalid " - "VDEV %llu", - bp, i, (longlong_t)vdevid); - continue; - } - vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; - if (vd == NULL) { - zfs_panic_recover("blkptr at %p DVA %u has invalid " - "VDEV %llu", - bp, i, (longlong_t)vdevid); - continue; - } - if (vd->vdev_ops == &vdev_hole_ops) { - zfs_panic_recover("blkptr at %p DVA %u has hole " - "VDEV %llu", - bp, i, (longlong_t)vdevid); - continue; - } - if (vd->vdev_ops == &vdev_missing_ops) { - /* - * "missing" vdevs are valid during import, but we - * don't have their detailed info (e.g. asize), so - * we can't perform any more checks on them. - */ - continue; - } - uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); - uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); - if (BP_IS_GANG(bp)) - asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - if (offset + asize > vd->vdev_asize) { - zfs_panic_recover("blkptr at %p DVA %u has invalid " - "OFFSET %llu", - bp, i, (longlong_t)offset); - } - } -} - -boolean_t -zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) -{ - uint64_t vdevid = DVA_GET_VDEV(dva); - - if (vdevid >= spa->spa_root_vdev->vdev_children) - return (B_FALSE); - - vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; - if (vd == NULL) - return (B_FALSE); - - if (vd->vdev_ops == &vdev_hole_ops) - return (B_FALSE); - - if (vd->vdev_ops == &vdev_missing_ops) { - return (B_FALSE); - } - - uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = DVA_GET_ASIZE(dva); - - if (BP_IS_GANG(bp)) - asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); - if (offset + asize > vd->vdev_asize) - return (B_FALSE); - - return (B_TRUE); -} - -zio_t * -zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - abd_t *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) -{ - zio_t *zio; - - zfs_blkptr_verify(spa, bp); - - zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, - data, size, size, done, private, - ZIO_TYPE_READ, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? - ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); - - return (zio); -} - -zio_t * -zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, - zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *private, zio_priority_t priority, enum zio_flag flags, - const zbookmark_phys_t *zb) -{ - zio_t *zio; - - ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && - zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && - zp->zp_compress >= ZIO_COMPRESS_OFF && - zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - DMU_OT_IS_VALID(zp->zp_type) && - zp->zp_level < 32 && - zp->zp_copies > 0 && - zp->zp_copies <= spa_max_replication(spa)); - - zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, - ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
- ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); - - zio->io_ready = ready; - zio->io_children_ready = children_ready; - zio->io_physdone = physdone; - zio->io_prop = *zp; - - /* - * Data can be NULL if we are going to call zio_write_override() to - * provide the already-allocated BP. But we may need the data to - * verify a dedup hit (if requested). In this case, don't try to - * dedup (just take the already-allocated BP verbatim). - */ - if (data == NULL && zio->io_prop.zp_dedup_verify) { - zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; - } - - return (zio); -} - -zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, - uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) -{ - zio_t *zio; - - zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, - ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, - ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); - - return (zio); -} - -void -zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) -{ - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(zio->io_stage == ZIO_STAGE_OPEN); - ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); - - /* - * We must reset the io_prop to match the values that existed - * when the bp was first written by dmu_sync() keeping in mind - * that nopwrite and dedup are mutually exclusive. - */ - zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; - zio->io_prop.zp_nopwrite = nopwrite; - zio->io_prop.zp_copies = copies; - zio->io_bp_override = bp; -} - -void -zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) -{ - - zfs_blkptr_verify(spa, bp); - - /* - * The check for EMBEDDED is a performance optimization. We - * process the free here (by ignoring it) rather than - * putting it on the list and then processing it in zio_free_sync(). - */ - if (BP_IS_EMBEDDED(bp)) - return; - metaslab_check_free(spa, bp); - - /* - * Frees that are for the currently-syncing txg, are not going to be - * deferred, and which will not need to do a read (i.e. not GANG or - * DEDUP), can be processed immediately. Otherwise, put them on the - * in-memory list for later processing. - */ - if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || - txg != spa->spa_syncing_txg || - spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { - bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); - } else { - VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, - BP_GET_PSIZE(bp), 0))); - } -} - -zio_t * -zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - uint64_t size, enum zio_flag flags) -{ - zio_t *zio; - enum zio_stage stage = ZIO_FREE_PIPELINE; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(spa_syncing_txg(spa) == txg); - ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); - - if (BP_IS_EMBEDDED(bp)) - return (zio_null(pio, spa, NULL, NULL, NULL, 0)); - - metaslab_check_free(spa, bp); - arc_freed(spa, bp); - dsl_scan_freed(spa, bp); - - if (zfs_trim_enabled) - stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | - ZIO_STAGE_VDEV_IO_ASSESS; - /* - * GANG and DEDUP blocks can induce a read (for the gang block header, - * or the DDT), so issue them asynchronously so that this thread is - * not tied up. 
- */ - else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) - stage |= ZIO_STAGE_ISSUE_ASYNC; - - flags |= ZIO_FLAG_DONT_QUEUE; - - zio = zio_create(pio, spa, txg, bp, NULL, size, - size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, - flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); - - return (zio); -} - -zio_t * -zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio_done_func_t *done, void *private, enum zio_flag flags) -{ - zio_t *zio; - - zfs_blkptr_verify(spa, bp); - - if (BP_IS_EMBEDDED(bp)) - return (zio_null(pio, spa, NULL, NULL, NULL, 0)); - - /* - * A claim is an allocation of a specific block. Claims are needed - * to support immediate writes in the intent log. The issue is that - * immediate writes contain committed data, but in a txg that was - * *not* committed. Upon opening the pool after an unclean shutdown, - * the intent log claims all blocks that contain immediate write data - * so that the SPA knows they're in use. - * - * All claims *must* be resolved in the first txg -- before the SPA - * starts allocating blocks -- so that nothing is allocated twice. - * If txg == 0 we just verify that the block is claimable. - */ - ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, - spa_min_claim_txg(spa)); - ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); - ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ - - zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), - BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, - flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); - ASSERT0(zio->io_queued_timestamp); - - return (zio); -} - -zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags) -{ - zio_t *zio; - int c; - - if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - - zio->io_cmd = cmd; - } else { - zio = zio_null(pio, spa, NULL, NULL, NULL, flags); - - for (c = 0; c < vd->vdev_children; c++) - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - offset, size, done, private, priority, flags)); - } - - return (zio); -} - -zio_t * -zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) -{ - zio_t *zio; - - ASSERT(vd->vdev_children == 0); - ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); - ASSERT3U(offset + size, <=, vd->vdev_psize); - - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, - private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, - offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); - - zio->io_prop.zp_checksum = checksum; - - return (zio); -} - -zio_t * -zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) -{ - zio_t *zio; - - ASSERT(vd->vdev_children == 0); - ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); - ASSERT3U(offset + size, <=, vd->vdev_psize); - - zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, - private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, - offset, 
NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); - - zio->io_prop.zp_checksum = checksum; - - if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { - /* - * zec checksums are necessarily destructive -- they modify - * the end of the write buffer to hold the verifier/checksum. - * Therefore, we must make a local copy in case the data is - * being written to multiple places in parallel. - */ - abd_t *wbuf = abd_alloc_sametype(data, size); - abd_copy(wbuf, data, size); - - zio_push_transform(zio, wbuf, size, size, NULL); - } - - return (zio); -} - -/* - * Create a child I/O to do some work for us. - */ -zio_t * -zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - abd_t *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) -{ - enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; - zio_t *zio; - - /* - * vdev child I/Os do not propagate their error to the parent. - * Therefore, for correct operation the caller *must* check for - * and handle the error in the child i/o's done callback. - * The only exceptions are i/os that we don't care about - * (OPTIONAL or REPAIR). - */ - ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || - done != NULL); - - if (type == ZIO_TYPE_READ && bp != NULL) { - /* - * If we have the bp, then the child should perform the - * checksum and the parent need not. This pushes error - * detection as close to the leaves as possible and - * eliminates redundant checksums in the interior nodes. - */ - pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; - pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; - } - - /* Not all IO types require vdev io done stage e.g. free */ - if (type == ZIO_TYPE_FREE && - !(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) - pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; - - if (vd->vdev_ops->vdev_op_leaf) { - ASSERT0(vd->vdev_children); - offset += VDEV_LABEL_START_SIZE; - } - - flags |= ZIO_VDEV_CHILD_FLAGS(pio); - - /* - * If we've decided to do a repair, the write is not speculative -- - * even if the original read was. - */ - if (flags & ZIO_FLAG_IO_REPAIR) - flags &= ~ZIO_FLAG_SPECULATIVE; - - /* - * If we're creating a child I/O that is not associated with a - * top-level vdev, then the child zio is not an allocating I/O. - * If this is a retried I/O then we ignore it since we will - * have already processed the original allocating I/O. 
- */ - if (flags & ZIO_FLAG_IO_ALLOCATING && - (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { - ASSERT(pio->io_metaslab_class != NULL); - ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); - ASSERT(type == ZIO_TYPE_WRITE); - ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); - ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || - pio->io_child_type == ZIO_CHILD_GANG); - - flags &= ~ZIO_FLAG_IO_ALLOCATING; - } - - zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, - done, private, type, priority, flags, vd, offset, &pio->io_bookmark, - ZIO_STAGE_VDEV_IO_START >> 1, pipeline); - ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); - - zio->io_physdone = pio->io_physdone; - if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) - zio->io_logical->io_phys_children++; - - return (zio); -} - -zio_t * -zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, - zio_type_t type, zio_priority_t priority, enum zio_flag flags, - zio_done_func_t *done, void *private) -{ - zio_t *zio; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - zio = zio_create(NULL, vd->vdev_spa, 0, NULL, - data, size, size, done, private, type, priority, - flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, - vd, offset, NULL, - ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); - - return (zio); -} - -void -zio_flush(zio_t *zio, vdev_t *vd) -{ - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, - NULL, NULL, ZIO_PRIORITY_NOW, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); -} - -zio_t * -zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) -{ - - ASSERT(vd->vdev_ops->vdev_op_leaf); - - return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL, - ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, - vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); -} - -void -zio_shrink(zio_t *zio, uint64_t size) -{ - ASSERT3P(zio->io_executor, ==, NULL); - ASSERT3P(zio->io_orig_size, ==, zio->io_size); - ASSERT3U(size, <=, zio->io_size); - - /* - * We don't shrink for raidz because of problems with the - * reconstruction when reading back less than the block size. - * Note, BP_IS_RAIDZ() assumes no compression. - */ - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - if (!BP_IS_RAIDZ(zio->io_bp)) { - /* we are not doing a raw write */ - ASSERT3U(zio->io_size, ==, zio->io_lsize); - zio->io_orig_size = zio->io_size = zio->io_lsize = size; - } -} - -/* - * ========================================================================== - * Prepare to read and write logical blocks - * ========================================================================== - */ - -static zio_t * -zio_read_bp_init(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); - - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && - zio->io_child_type == ZIO_CHILD_LOGICAL && - !(zio->io_flags & ZIO_FLAG_RAW)) { - uint64_t psize = - BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); - zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), - psize, psize, zio_decompress); - } - - if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - int psize = BPE_GET_PSIZE(bp); - void *data = abd_borrow_buf(zio->io_abd, psize); - decode_embedded_bp_compressed(bp, data); - abd_return_buf_copy(zio->io_abd, data, psize); - } else { - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); - } - - if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - - if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - - if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) - zio->io_pipeline = ZIO_DDT_READ_PIPELINE; - - return (zio); -} - -static zio_t * -zio_write_bp_init(zio_t *zio) -{ - if (!IO_IS_ALLOCATING(zio)) - return (zio); - - ASSERT(zio->io_child_type != ZIO_CHILD_DDT); - - if (zio->io_bp_override) { - blkptr_t *bp = zio->io_bp; - zio_prop_t *zp = &zio->io_prop; - - ASSERT(bp->blk_birth != zio->io_txg); - ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); - - *bp = *zio->io_bp_override; - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - if (BP_IS_EMBEDDED(bp)) - return (zio); - - /* - * If we've been overridden and nopwrite is set then - * set the flag accordingly to indicate that a nopwrite - * has already occurred. - */ - if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { - ASSERT(!zp->zp_dedup); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); - zio->io_flags |= ZIO_FLAG_NOPWRITE; - return (zio); - } - - ASSERT(!zp->zp_nopwrite); - - if (BP_IS_HOLE(bp) || !zp->zp_dedup) - return (zio); - - ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); - - if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { - BP_SET_DEDUP(bp, 1); - zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; - return (zio); - } - - /* - * We were unable to handle this as an override bp, treat - * it as a regular write I/O. - */ - zio->io_bp_override = NULL; - *bp = zio->io_bp_orig; - zio->io_pipeline = zio->io_orig_pipeline; - } - - return (zio); -} - -static zio_t * -zio_write_compress(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - zio_prop_t *zp = &zio->io_prop; - enum zio_compress compress = zp->zp_compress; - blkptr_t *bp = zio->io_bp; - uint64_t lsize = zio->io_lsize; - uint64_t psize = zio->io_size; - int pass = 1; - - EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); - - /* - * If our children haven't all reached the ready stage, - * wait for them and then repeat this pipeline stage. - */ - if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | - ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { - return (NULL); - } - - if (!IO_IS_ALLOCATING(zio)) - return (zio); - - if (zio->io_children_ready != NULL) { - /* - * Now that all our children are ready, run the callback - * associated with this zio in case it wants to modify the - * data to be written. - */ - ASSERT3U(zp->zp_level, >, 0); - zio->io_children_ready(zio); - } - - ASSERT(zio->io_child_type != ZIO_CHILD_DDT); - ASSERT(zio->io_bp_override == NULL); - - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { - /* - * We're rewriting an existing block, which means we're - * working on behalf of spa_sync(). For spa_sync() to - * converge, it must eventually be the case that we don't - * have to allocate new blocks. 
But compression changes - * the blocksize, which forces a reallocate, and makes - * convergence take longer. Therefore, after the first - * few passes, stop compressing to ensure convergence. - */ - pass = spa_sync_pass(spa); - - ASSERT(zio->io_txg == spa_syncing_txg(spa)); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(!BP_GET_DEDUP(bp)); - - if (pass >= zfs_sync_pass_dont_compress) - compress = ZIO_COMPRESS_OFF; - - /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); - } - - /* If it's a compressed write that is not raw, compress the buffer. */ - if (compress != ZIO_COMPRESS_OFF && psize == lsize) { - void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); - if (psize == 0 || psize == lsize) { - compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); - } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && - zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && - spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { - encode_embedded_bp_compressed(bp, - cbuf, compress, lsize, psize); - BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); - BP_SET_TYPE(bp, zio->io_prop.zp_type); - BP_SET_LEVEL(bp, zio->io_prop.zp_level); - zio_buf_free(cbuf, lsize); - bp->blk_birth = zio->io_txg; - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - ASSERT(spa_feature_is_active(spa, - SPA_FEATURE_EMBEDDED_DATA)); - return (zio); - } else { - /* - * Round up compressed size up to the ashift - * of the smallest-ashift device, and zero the tail. - * This ensures that the compressed size of the BP - * (and thus compressratio property) are correct, - * in that we charge for the padding used to fill out - * the last sector. - */ - ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)P2ROUNDUP(psize, - 1ULL << spa->spa_min_ashift); - if (rounded >= lsize) { - compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); - psize = lsize; - } else { - abd_t *cdata = abd_get_from_buf(cbuf, lsize); - abd_take_ownership_of_buf(cdata, B_TRUE); - abd_zero_off(cdata, psize, rounded - psize); - psize = rounded; - zio_push_transform(zio, cdata, - psize, lsize, NULL); - } - } - - /* - * We were unable to handle this as an override bp, treat - * it as a regular write I/O. - */ - zio->io_bp_override = NULL; - *bp = zio->io_bp_orig; - zio->io_pipeline = zio->io_orig_pipeline; - } else { - ASSERT3U(psize, !=, 0); - } - - /* - * The final pass of spa_sync() must be all rewrites, but the first - * few passes offer a trade-off: allocating blocks defers convergence, - * but newly allocated blocks are sequential, so they can be written - * to disk faster. Therefore, we allow the first few passes of - * spa_sync() to allocate new blocks, but force rewrites after that. - * There should only be a handful of blocks after pass 1 in any case. 
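/*
 * Worked example for the "round up to the smallest ashift" step in
 * zio_write_compress() above, assuming P2ROUNDUP(x, align) rounds x up
 * to the next multiple of the power-of-two 'align' and the pool's
 * smallest ashift is 12 (4K sectors).  example_round_to_ashift() is a
 * hypothetical stand-in for that rounding:
 *
 *	lsize 8192, compressed psize 3100 -> rounded 4096 (kept; < lsize)
 *	lsize 4096, compressed psize 3100 -> rounded 4096 == lsize, so
 *	    the compressed copy is discarded and the block stays
 *	    uncompressed, as in the (rounded >= lsize) branch above.
 */
static uint64_t
example_round_to_ashift(uint64_t psize, uint64_t ashift)
{
	uint64_t align = 1ULL << ashift;

	return ((psize + align - 1) & ~(align - 1));
}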
- */ - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && - BP_GET_PSIZE(bp) == psize && - pass >= zfs_sync_pass_rewrite) { - VERIFY3U(psize, !=, 0); - enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; - - zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; - zio->io_flags |= ZIO_FLAG_IO_REWRITE; - } else { - BP_ZERO(bp); - zio->io_pipeline = ZIO_WRITE_PIPELINE; - } - - if (psize == 0) { - if (zio->io_bp_orig.blk_birth != 0 && - spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { - BP_SET_LSIZE(bp, lsize); - BP_SET_TYPE(bp, zp->zp_type); - BP_SET_LEVEL(bp, zp->zp_level); - BP_SET_BIRTH(bp, zio->io_txg, 0); - } - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - } else { - ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); - BP_SET_LSIZE(bp, lsize); - BP_SET_TYPE(bp, zp->zp_type); - BP_SET_LEVEL(bp, zp->zp_level); - BP_SET_PSIZE(bp, psize); - BP_SET_COMPRESS(bp, compress); - BP_SET_CHECKSUM(bp, zp->zp_checksum); - BP_SET_DEDUP(bp, zp->zp_dedup); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - if (zp->zp_dedup) { - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); - zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; - } - if (zp->zp_nopwrite) { - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); - zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; - } - } - return (zio); -} - -static zio_t * -zio_free_bp_init(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - if (zio->io_child_type == ZIO_CHILD_LOGICAL) { - if (BP_GET_DEDUP(bp)) - zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; - } - - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); - - return (zio); -} - -/* - * ========================================================================== - * Execute the I/O pipeline - * ========================================================================== - */ - -static void -zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) -{ - spa_t *spa = zio->io_spa; - zio_type_t t = zio->io_type; - int flags = (cutinline ? TQ_FRONT : 0); - - ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); - - /* - * If we're a config writer or a probe, the normal issue and - * interrupt threads may all be blocked waiting for the config lock. - * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. - */ - if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) - t = ZIO_TYPE_NULL; - - /* - * A similar issue exists for the L2ARC write thread until L2ARC 2.0. - */ - if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) - t = ZIO_TYPE_NULL; - - /* - * If this is a high priority I/O, then use the high priority taskq if - * available. - */ - if ((zio->io_priority == ZIO_PRIORITY_NOW || - zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && - spa->spa_zio_taskq[t][q + 1].stqs_count != 0) - q++; - - ASSERT3U(q, <, ZIO_TASKQ_TYPES); - - /* - * NB: We are assuming that the zio can only be dispatched - * to a single taskq at a time. It would be a grievous error - * to dispatch the zio to another taskq at the same time. 
- */ -#if defined(illumos) || !defined(_KERNEL) - ASSERT(zio->io_tqent.tqent_next == NULL); -#else - ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); -#endif - spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, - flags, &zio->io_tqent); -} - -static boolean_t -zio_taskq_member(zio_t *zio, zio_taskq_type_t q) -{ - kthread_t *executor = zio->io_executor; - spa_t *spa = zio->io_spa; - - for (zio_type_t t = 0; t < ZIO_TYPES; t++) { - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - uint_t i; - for (i = 0; i < tqs->stqs_count; i++) { - if (taskq_member(tqs->stqs_taskq[i], executor)) - return (B_TRUE); - } - } - - return (B_FALSE); -} - -static zio_t * -zio_issue_async(zio_t *zio) -{ - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - - return (NULL); -} - -void -zio_interrupt(zio_t *zio) -{ - zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); -} - -void -zio_delay_interrupt(zio_t *zio) -{ - /* - * The timeout_generic() function isn't defined in userspace, so - * rather than trying to implement the function, the zio delay - * functionality has been disabled for userspace builds. - */ - -#ifdef _KERNEL - /* - * If io_target_timestamp is zero, then no delay has been registered - * for this IO, thus jump to the end of this function and "skip" the - * delay; issuing it directly to the zio layer. - */ - if (zio->io_target_timestamp != 0) { - hrtime_t now = gethrtime(); - - if (now >= zio->io_target_timestamp) { - /* - * This IO has already taken longer than the target - * delay to complete, so we don't want to delay it - * any longer; we "miss" the delay and issue it - * directly to the zio layer. This is likely due to - * the target latency being set to a value less than - * the underlying hardware can satisfy (e.g. delay - * set to 1ms, but the disks take 10ms to complete an - * IO request). - */ - - DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, - hrtime_t, now); - - zio_interrupt(zio); - } else { - hrtime_t diff = zio->io_target_timestamp - now; - - DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, - hrtime_t, now, hrtime_t, diff); - -#ifdef __FreeBSD__ - callout_reset_sbt(&zio->io_timer, nstosbt(diff), 0, - (void (*)(void *))zio_interrupt, zio, C_HARDCLOCK); -#else - (void) timeout_generic(CALLOUT_NORMAL, - (void (*)(void *))zio_interrupt, zio, diff, 1, 0); -#endif - } - - return; - } -#endif - - DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); - zio_interrupt(zio); -} - -/* - * Execute the I/O pipeline until one of the following occurs: - * - * (1) the I/O completes - * (2) the pipeline stalls waiting for dependent child I/Os - * (3) the I/O issues, so we're waiting for an I/O completion interrupt - * (4) the I/O is delegated by vdev-level caching or aggregation - * (5) the I/O is deferred due to vdev-level queueing - * (6) the I/O is handed off to another thread. - * - * In all cases, the pipeline stops whenever there's no CPU work; it never - * burns a thread in cv_wait(). - * - * There's no locking on io_stage because there's no legitimate way - * for multiple threads to be attempting to process the same I/O. 
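/*
 * Sketch of the stage-advance step inside zio_execute() below: io_stage
 * is a one-hot bit and io_pipeline is the bitmask of stages this zio
 * will run, so shifting the current stage left until it lands on a set
 * pipeline bit yields the next stage.  example_next_stage() is a
 * hypothetical helper; it assumes some later bit in 'pipeline' is set,
 * which holds for the real pipelines because they all end with
 * ZIO_STAGE_DONE.
 */
static uint32_t
example_next_stage(uint32_t stage, uint32_t pipeline)
{
	do {
		stage <<= 1;
	} while ((stage & pipeline) == 0);

	return (stage);
}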
- */ -static zio_pipe_stage_t *zio_pipeline[]; - -void -zio_execute(zio_t *zio) -{ - ASSERT3U(zio->io_queued_timestamp, >, 0); - - while (zio->io_stage < ZIO_STAGE_DONE) { - enum zio_stage pipeline = zio->io_pipeline; - enum zio_stage stage = zio->io_stage; - - zio->io_executor = curthread; - - ASSERT(!MUTEX_HELD(&zio->io_lock)); - ASSERT(ISP2(stage)); - ASSERT(zio->io_stall == NULL); - - do { - stage <<= 1; - } while ((stage & pipeline) == 0); - - ASSERT(stage <= ZIO_STAGE_DONE); - - /* - * If we are in interrupt context and this pipeline stage - * will grab a config lock that is held across I/O, - * or may wait for an I/O that needs an interrupt thread - * to complete, issue async to avoid deadlock. - * - * For VDEV_IO_START, we cut in line so that the io will - * be sent to disk promptly. - */ - if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && - zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { - boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? - zio_requeue_io_start_cut_in_line : B_FALSE; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); - return; - } - - zio->io_stage = stage; - zio->io_pipeline_trace |= zio->io_stage; - - /* - * The zio pipeline stage returns the next zio to execute - * (typically the same as this one), or NULL if we should - * stop. - */ - zio = zio_pipeline[highbit64(stage) - 1](zio); - - if (zio == NULL) - return; - } -} - -/* - * ========================================================================== - * Initiate I/O, either sync or async - * ========================================================================== - */ -int -zio_wait(zio_t *zio) -{ - int error; - - ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN); - ASSERT3P(zio->io_executor, ==, NULL); - - zio->io_waiter = curthread; - ASSERT0(zio->io_queued_timestamp); - zio->io_queued_timestamp = gethrtime(); - - zio_execute(zio); - - mutex_enter(&zio->io_lock); - while (zio->io_executor != NULL) - cv_wait(&zio->io_cv, &zio->io_lock); - mutex_exit(&zio->io_lock); - - error = zio->io_error; - zio_destroy(zio); - - return (error); -} - -void -zio_nowait(zio_t *zio) -{ - ASSERT3P(zio->io_executor, ==, NULL); - - if (zio->io_child_type == ZIO_CHILD_LOGICAL && - zio_unique_parent(zio) == NULL) { - /* - * This is a logical async I/O with no parent to wait for it. - * We add it to the spa_async_root_zio "Godfather" I/O which - * will ensure they complete prior to unloading the pool. 
- */ - spa_t *spa = zio->io_spa; - - zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); - } - - ASSERT0(zio->io_queued_timestamp); - zio->io_queued_timestamp = gethrtime(); - zio_execute(zio); -} - -/* - * ========================================================================== - * Reexecute, cancel, or suspend/resume failed I/O - * ========================================================================== - */ - -static void -zio_reexecute(zio_t *pio) -{ - zio_t *cio, *cio_next; - - ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); - ASSERT(pio->io_gang_leader == NULL); - ASSERT(pio->io_gang_tree == NULL); - - pio->io_flags = pio->io_orig_flags; - pio->io_stage = pio->io_orig_stage; - pio->io_pipeline = pio->io_orig_pipeline; - pio->io_reexecute = 0; - pio->io_flags |= ZIO_FLAG_REEXECUTED; - pio->io_pipeline_trace = 0; - pio->io_error = 0; - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_state[w] = 0; - for (int c = 0; c < ZIO_CHILD_TYPES; c++) - pio->io_child_error[c] = 0; - - if (IO_IS_ALLOCATING(pio)) - BP_ZERO(pio->io_bp); - - /* - * As we reexecute pio's children, new children could be created. - * New children go to the head of pio's io_child_list, however, - * so we will (correctly) not reexecute them. The key is that - * the remainder of pio's io_child_list, from 'cio_next' onward, - * cannot be affected by any side effects of reexecuting 'cio'. - */ - zio_link_t *zl = NULL; - mutex_enter(&pio->io_lock); - for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { - cio_next = zio_walk_children(pio, &zl); - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w]++; - mutex_exit(&pio->io_lock); - zio_reexecute(cio); - mutex_enter(&pio->io_lock); - } - mutex_exit(&pio->io_lock); - - /* - * Now that all children have been reexecuted, execute the parent. - * We don't reexecute "The Godfather" I/O here as it's the - * responsibility of the caller to wait on it. - */ - if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { - pio->io_queued_timestamp = gethrtime(); - zio_execute(pio); - } -} - -void -zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) -{ - if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) - fm_panic("Pool '%s' has encountered an uncorrectable I/O " - "failure and the failure mode property for this pool " - "is set to panic.", spa_name(spa)); - - zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); - - mutex_enter(&spa->spa_suspend_lock); - - if (spa->spa_suspend_zio_root == NULL) - spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_GODFATHER); - - spa->spa_suspended = reason; - - if (zio != NULL) { - ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); - ASSERT(zio != spa->spa_suspend_zio_root); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(zio_unique_parent(zio) == NULL); - ASSERT(zio->io_stage == ZIO_STAGE_DONE); - zio_add_child(spa->spa_suspend_zio_root, zio); - } - - mutex_exit(&spa->spa_suspend_lock); -} - -int -zio_resume(spa_t *spa) -{ - zio_t *pio; - - /* - * Reexecute all previously suspended i/o. 
- */ - mutex_enter(&spa->spa_suspend_lock); - spa->spa_suspended = ZIO_SUSPEND_NONE; - cv_broadcast(&spa->spa_suspend_cv); - pio = spa->spa_suspend_zio_root; - spa->spa_suspend_zio_root = NULL; - mutex_exit(&spa->spa_suspend_lock); - - if (pio == NULL) - return (0); - - zio_reexecute(pio); - return (zio_wait(pio)); -} - -void -zio_resume_wait(spa_t *spa) -{ - mutex_enter(&spa->spa_suspend_lock); - while (spa_suspended(spa)) - cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); - mutex_exit(&spa->spa_suspend_lock); -} - -/* - * ========================================================================== - * Gang blocks. - * - * A gang block is a collection of small blocks that looks to the DMU - * like one large block. When zio_dva_allocate() cannot find a block - * of the requested size, due to either severe fragmentation or the pool - * being nearly full, it calls zio_write_gang_block() to construct the - * block from smaller fragments. - * - * A gang block consists of a gang header (zio_gbh_phys_t) and up to - * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like - * an indirect block: it's an array of block pointers. It consumes - * only one sector and hence is allocatable regardless of fragmentation. - * The gang header's bps point to its gang members, which hold the data. - * - * Gang blocks are self-checksumming, using the bp's - * as the verifier to ensure uniqueness of the SHA256 checksum. - * Critically, the gang block bp's blk_cksum is the checksum of the data, - * not the gang header. This ensures that data block signatures (needed for - * deduplication) are independent of how the block is physically stored. - * - * Gang blocks can be nested: a gang member may itself be a gang block. - * Thus every gang block is a tree in which root and all interior nodes are - * gang headers, and the leaves are normal blocks that contain user data. - * The root of the gang tree is called the gang leader. - * - * To perform any operation (read, rewrite, free, claim) on a gang block, - * zio_gang_assemble() first assembles the gang tree (minus data leaves) - * in the io_gang_tree field of the original logical i/o by recursively - * reading the gang leader and all gang headers below it. This yields - * an in-core tree containing the contents of every gang header and the - * bps for every constituent of the gang block. - * - * With the gang tree now assembled, zio_gang_issue() just walks the gang tree - * and invokes a callback on each bp. To free a gang block, zio_gang_issue() - * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. - * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). - * zio_read_gang() is a wrapper around zio_read() that omits reading gang - * headers, since we already have those in io_gang_tree. zio_rewrite_gang() - * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() - * of the gang header plus zio_checksum_compute() of the data to update the - * gang header's blk_cksum as described above. - * - * The two-phase assemble/issue model solves the problem of partial failure -- - * what if you'd freed part of a gang block but then couldn't read the - * gang header for another part? Assembling the entire gang tree first - * ensures that all the necessary gang header I/O has succeeded before - * starting the actual work of free, claim, or write. Once the gang tree - * is assembled, free and claim are in-memory operations that cannot fail. 
- * - * In the event that a gang write fails, zio_dva_unallocate() walks the - * gang tree to immediately free (i.e. insert back into the space map) - * everything we've allocated. This ensures that we don't get ENOSPC - * errors during repeated suspend/resume cycles due to a flaky device. - * - * Gang rewrites only happen during sync-to-convergence. If we can't assemble - * the gang tree, we won't modify the block, so we can safely defer the free - * (knowing that the block is still intact). If we *can* assemble the gang - * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free - * each constituent bp and we can allocate a new block on the next sync pass. - * - * In all cases, the gang tree allows complete recovery from partial failure. - * ========================================================================== - */ - -static void -zio_gang_issue_func_done(zio_t *zio) -{ - abd_put(zio->io_abd); -} - -static zio_t * -zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, - uint64_t offset) -{ - if (gn != NULL) - return (pio); - - return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), - BP_GET_PSIZE(bp), zio_gang_issue_func_done, - NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), - &pio->io_bookmark)); -} - -static zio_t * -zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, - uint64_t offset) -{ - zio_t *zio; - - if (gn != NULL) { - abd_t *gbh_abd = - abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); - zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), - &pio->io_bookmark); - /* - * As we rewrite each gang header, the pipeline will compute - * a new gang block header checksum for it; but no one will - * compute a new data checksum, so we do that here. The one - * exception is the gang leader: the pipeline already computed - * its data checksum because that stage precedes gang assembly. - * (Presently, nothing actually uses interior data checksums; - * this is just good hygiene.) - */ - if (gn != pio->io_gang_leader->io_gang_tree) { - abd_t *buf = abd_get_offset(data, offset); - - zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), - buf, BP_GET_PSIZE(bp)); - - abd_put(buf); - } - /* - * If we are here to damage data for testing purposes, - * leave the GBH alone so that we can detect the damage. - */ - if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) - zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; - } else { - zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - abd_get_offset(data, offset), BP_GET_PSIZE(bp), - zio_gang_issue_func_done, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); - } - - return (zio); -} - -/* ARGSUSED */ -static zio_t * -zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, - uint64_t offset) -{ - return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, - BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), - ZIO_GANG_CHILD_FLAGS(pio))); -} - -/* ARGSUSED */ -static zio_t * -zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, - uint64_t offset) -{ - return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, - NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); -} - -static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { - NULL, - zio_read_gang, - zio_rewrite_gang, - zio_free_gang, - zio_claim_gang, - NULL -}; - -static void zio_gang_tree_assemble_done(zio_t *zio); - -static zio_gang_node_t * -zio_gang_node_alloc(zio_gang_node_t **gnpp) -{ - zio_gang_node_t *gn; - - ASSERT(*gnpp == NULL); - - gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); - gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); - *gnpp = gn; - - return (gn); -} - -static void -zio_gang_node_free(zio_gang_node_t **gnpp) -{ - zio_gang_node_t *gn = *gnpp; - - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) - ASSERT(gn->gn_child[g] == NULL); - - zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); - kmem_free(gn, sizeof (*gn)); - *gnpp = NULL; -} - -static void -zio_gang_tree_free(zio_gang_node_t **gnpp) -{ - zio_gang_node_t *gn = *gnpp; - - if (gn == NULL) - return; - - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) - zio_gang_tree_free(&gn->gn_child[g]); - - zio_gang_node_free(gnpp); -} - -static void -zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) -{ - zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); - - ASSERT(gio->io_gang_leader == gio); - ASSERT(BP_IS_GANG(bp)); - - zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, - zio_gang_tree_assemble_done, gn, gio->io_priority, - ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); -} - -static void -zio_gang_tree_assemble_done(zio_t *zio) -{ - zio_t *gio = zio->io_gang_leader; - zio_gang_node_t *gn = zio->io_private; - blkptr_t *bp = zio->io_bp; - - ASSERT(gio == zio_unique_parent(zio)); - ASSERT(zio->io_child_count == 0); - - if (zio->io_error) - return; - - /* this ABD was created from a linear buf in zio_gang_tree_assemble */ - if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); - - ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); - ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); - - abd_put(zio->io_abd); - - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; - if (!BP_IS_GANG(gbp)) - continue; - zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); - } -} - -static void -zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, - uint64_t offset) -{ - zio_t *gio = pio->io_gang_leader; - zio_t *zio; - - ASSERT(BP_IS_GANG(bp) == !!gn); - ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); - ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); - - /* - * If you're a gang header, your data is in gn->gn_gbh. - * If you're a gang member, your data is in 'data' and gn == NULL. 
- */ - zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); - - if (gn != NULL) { - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); - - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; - if (BP_IS_HOLE(gbp)) - continue; - zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, - offset); - offset += BP_GET_PSIZE(gbp); - } - } - - if (gn == gio->io_gang_tree && gio->io_abd != NULL) - ASSERT3U(gio->io_size, ==, offset); - - if (zio != pio) - zio_nowait(zio); -} - -static zio_t * -zio_gang_assemble(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); - ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - - zio->io_gang_leader = zio; - - zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); - - return (zio); -} - -static zio_t * -zio_gang_issue(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { - return (NULL); - } - - ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); - ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - - if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, - 0); - else - zio_gang_tree_free(&zio->io_gang_tree); - - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - return (zio); -} - -static void -zio_write_gang_member_ready(zio_t *zio) -{ - zio_t *pio = zio_unique_parent(zio); - zio_t *gio = zio->io_gang_leader; - dva_t *cdva = zio->io_bp->blk_dva; - dva_t *pdva = pio->io_bp->blk_dva; - uint64_t asize; - - if (BP_IS_HOLE(zio->io_bp)) - return; - - ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); - - ASSERT(zio->io_child_type == ZIO_CHILD_GANG); - ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); - ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); - ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); - ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); - - mutex_enter(&pio->io_lock); - for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { - ASSERT(DVA_GET_GANG(&pdva[d])); - asize = DVA_GET_ASIZE(&pdva[d]); - asize += DVA_GET_ASIZE(&cdva[d]); - DVA_SET_ASIZE(&pdva[d], asize); - } - mutex_exit(&pio->io_lock); -} - -static void -zio_write_gang_done(zio_t *zio) -{ - /* - * The io_abd field will be NULL for a zio with no data. The io_flags - * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't - * check for it here as it is cleared in zio_ready. - */ - if (zio->io_abd != NULL) - abd_put(zio->io_abd); -} - -static zio_t * -zio_write_gang_block(zio_t *pio) -{ - spa_t *spa = pio->io_spa; - metaslab_class_t *mc = spa_normal_class(spa); - blkptr_t *bp = pio->io_bp; - zio_t *gio = pio->io_gang_leader; - zio_t *zio; - zio_gang_node_t *gn, **gnpp; - zio_gbh_phys_t *gbh; - abd_t *gbh_abd; - uint64_t txg = pio->io_txg; - uint64_t resid = pio->io_size; - uint64_t lsize; - int copies = gio->io_prop.zp_copies; - int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); - zio_prop_t zp; - int error; - boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); - - int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(has_data); - - flags |= METASLAB_ASYNC_ALLOC; - VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator], - pio)); - - /* - * The logical zio has already placed a reservation for - * 'copies' allocation slots but gang blocks may require - * additional copies. These additional copies - * (i.e. 
gbh_copies - copies) are guaranteed to succeed - * since metaslab_class_throttle_reserve() always allows - * additional reservations for gang blocks. - */ - VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, - pio->io_allocator, pio, flags)); - } - - error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, - bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio, pio->io_allocator); - if (error) { - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(has_data); - - /* - * If we failed to allocate the gang block header then - * we remove any additional allocation reservations that - * we placed here. The original reservation will - * be removed when the logical I/O goes to the ready - * stage. - */ - metaslab_class_throttle_unreserve(mc, - gbh_copies - copies, pio->io_allocator, pio); - } - pio->io_error = error; - return (pio); - } - - if (pio == gio) { - gnpp = &gio->io_gang_tree; - } else { - gnpp = pio->io_private; - ASSERT(pio->io_ready == zio_write_gang_member_ready); - } - - gn = zio_gang_node_alloc(gnpp); - gbh = gn->gn_gbh; - bzero(gbh, SPA_GANGBLOCKSIZE); - gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); - - /* - * Create the gang header. - */ - zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, - zio_write_gang_done, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); - - /* - * Create and nowait the gang children. - */ - for (int g = 0; resid != 0; resid -= lsize, g++) { - lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), - SPA_MINBLOCKSIZE); - ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); - - zp.zp_checksum = gio->io_prop.zp_checksum; - zp.zp_compress = ZIO_COMPRESS_OFF; - zp.zp_type = DMU_OT_NONE; - zp.zp_level = 0; - zp.zp_copies = gio->io_prop.zp_copies; - zp.zp_dedup = B_FALSE; - zp.zp_dedup_verify = B_FALSE; - zp.zp_nopwrite = B_FALSE; - - zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - has_data ? abd_get_offset(pio->io_abd, pio->io_size - - resid) : NULL, lsize, lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, - zio_write_gang_done, &gn->gn_child[g], pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); - - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(has_data); - - /* - * Gang children won't throttle but we should - * account for their work, so reserve an allocation - * slot for them here. - */ - VERIFY(metaslab_class_throttle_reserve(mc, - zp.zp_copies, cio->io_allocator, cio, flags)); - } - zio_nowait(cio); - } - - /* - * Set pio's pipeline to just wait for zio to finish. - */ - pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - zio_nowait(zio); - - return (pio); -} - -/* - * The zio_nop_write stage in the pipeline determines if allocating a - * new bp is necessary. The nopwrite feature can handle writes in - * either syncing or open context (i.e. zil writes) and as a result is - * mutually exclusive with dedup. - * - * By leveraging a cryptographically secure checksum, such as SHA256, we - * can compare the checksums of the new data and the old to determine if - * allocating a new block is required. Note that our requirements for - * cryptographic strength are fairly weak: there can't be any accidental - * hash collisions, but we don't need to be secure against intentional - * (malicious) collisions. 
To trigger a nopwrite, you have to be able - * to write the file to begin with, and triggering an incorrect (hash - * collision) nopwrite is no worse than simply writing to the file. - * That said, there are no known attacks against the checksum algorithms - * used for nopwrite, assuming that the salt and the checksums - * themselves remain secret. - */ -static zio_t * -zio_nop_write(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - zio_prop_t *zp = &zio->io_prop; - - ASSERT(BP_GET_LEVEL(bp) == 0); - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); - ASSERT(zp->zp_nopwrite); - ASSERT(!zp->zp_dedup); - ASSERT(zio->io_bp_override == NULL); - ASSERT(IO_IS_ALLOCATING(zio)); - - /* - * Check to see if the original bp and the new bp have matching - * characteristics (i.e. same checksum, compression algorithms, etc). - * If they don't then just continue with the pipeline which will - * allocate a new bp. - */ - if (BP_IS_HOLE(bp_orig) || - !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & - ZCHECKSUM_FLAG_NOPWRITE) || - BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || - BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || - BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || - zp->zp_copies != BP_GET_NDVAS(bp_orig)) - return (zio); - - /* - * If the checksums match then reset the pipeline so that we - * avoid allocating a new bp and issuing any I/O. - */ - if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { - ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & - ZCHECKSUM_FLAG_NOPWRITE); - ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); - ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); - ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); - ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, - sizeof (uint64_t)) == 0); - - *bp = *bp_orig; - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - zio->io_flags |= ZIO_FLAG_NOPWRITE; - } - - return (zio); -} - -/* - * ========================================================================== - * Dedup - * ========================================================================== - */ -static void -zio_ddt_child_read_done(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp; - zio_t *pio = zio_unique_parent(zio); - - mutex_enter(&pio->io_lock); - ddp = ddt_phys_select(dde, bp); - if (zio->io_error == 0) - ddt_phys_clear(ddp); /* this ddp doesn't need repair */ - - if (zio->io_error == 0 && dde->dde_repair_abd == NULL) - dde->dde_repair_abd = zio->io_abd; - else - abd_free(zio->io_abd); - mutex_exit(&pio->io_lock); -} - -static zio_t * -zio_ddt_read_start(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - ASSERT(BP_GET_DEDUP(bp)); - ASSERT(BP_GET_PSIZE(bp) == zio->io_size); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - if (zio->io_child_error[ZIO_CHILD_DDT]) { - ddt_t *ddt = ddt_select(zio->io_spa, bp); - ddt_entry_t *dde = ddt_repair_start(ddt, bp); - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); - blkptr_t blk; - - ASSERT(zio->io_vsd == NULL); - zio->io_vsd = dde; - - if (ddp_self == NULL) - return (zio); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) - continue; - ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, - &blk); - zio_nowait(zio_read(zio, zio->io_spa, &blk, - abd_alloc_for_io(zio->io_size, B_TRUE), - zio->io_size, zio_ddt_child_read_done, dde, - zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | - ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); - } - return (zio); 
- } - - zio_nowait(zio_read(zio, zio->io_spa, bp, - zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); - - return (zio); -} - -static zio_t * -zio_ddt_read_done(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { - return (NULL); - } - - ASSERT(BP_GET_DEDUP(bp)); - ASSERT(BP_GET_PSIZE(bp) == zio->io_size); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - if (zio->io_child_error[ZIO_CHILD_DDT]) { - ddt_t *ddt = ddt_select(zio->io_spa, bp); - ddt_entry_t *dde = zio->io_vsd; - if (ddt == NULL) { - ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); - return (zio); - } - if (dde == NULL) { - zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (NULL); - } - if (dde->dde_repair_abd != NULL) { - abd_copy(zio->io_abd, dde->dde_repair_abd, - zio->io_size); - zio->io_child_error[ZIO_CHILD_DDT] = 0; - } - ddt_repair_done(ddt, dde); - zio->io_vsd = NULL; - } - - ASSERT(zio->io_vsd == NULL); - - return (zio); -} - -static boolean_t -zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) -{ - spa_t *spa = zio->io_spa; - boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW); - - /* We should never get a raw, override zio */ - ASSERT(!(zio->io_bp_override && do_raw)); - - /* - * Note: we compare the original data, not the transformed data, - * because when zio->io_bp is an override bp, we will not have - * pushed the I/O transforms. That's an important optimization - * because otherwise we'd compress/encrypt all dmu_sync() data twice. - */ - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - zio_t *lio = dde->dde_lead_zio[p]; - - if (lio != NULL) { - return (lio->io_orig_size != zio->io_orig_size || - abd_cmp(zio->io_orig_abd, lio->io_orig_abd, - zio->io_orig_size) != 0); - } - } - - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - - if (ddp->ddp_phys_birth != 0) { - arc_buf_t *abuf = NULL; - arc_flags_t aflags = ARC_FLAG_WAIT; - int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; - blkptr_t blk = *zio->io_bp; - int error; - - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); - - ddt_exit(ddt); - - /* - * Intuitively, it would make more sense to compare - * io_abd than io_orig_abd in the raw case since you - * don't want to look at any transformations that have - * happened to the data. However, for raw I/Os the - * data will actually be the same in io_abd and - * io_orig_abd, so all we have to do is issue this as - * a raw ARC read. 
- */ - if (do_raw) { - zio_flags |= ZIO_FLAG_RAW; - ASSERT3U(zio->io_size, ==, zio->io_orig_size); - ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd, - zio->io_size)); - ASSERT3P(zio->io_transform_stack, ==, NULL); - } - - error = arc_read(NULL, spa, &blk, - arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, - zio_flags, &aflags, &zio->io_bookmark); - - if (error == 0) { - if (arc_buf_size(abuf) != zio->io_orig_size || - abd_cmp_buf(zio->io_orig_abd, abuf->b_data, - zio->io_orig_size) != 0) - error = SET_ERROR(EEXIST); - arc_buf_destroy(abuf, &abuf); - } - - ddt_enter(ddt); - return (error != 0); - } - } - - return (B_FALSE); -} - -static void -zio_ddt_child_write_ready(zio_t *zio) -{ - int p = zio->io_prop.zp_copies; - ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); - ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; - zio_t *pio; - - if (zio->io_error) - return; - - ddt_enter(ddt); - - ASSERT(dde->dde_lead_zio[p] == zio); - - ddt_phys_fill(ddp, zio->io_bp); - - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(zio, &zl)) != NULL) - ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); - - ddt_exit(ddt); -} - -static void -zio_ddt_child_write_done(zio_t *zio) -{ - int p = zio->io_prop.zp_copies; - ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); - ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; - - ddt_enter(ddt); - - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_lead_zio[p] == zio); - dde->dde_lead_zio[p] = NULL; - - if (zio->io_error == 0) { - zio_link_t *zl = NULL; - while (zio_walk_parents(zio, &zl) != NULL) - ddt_phys_addref(ddp); - } else { - ddt_phys_clear(ddp); - } - - ddt_exit(ddt); -} - -static void -zio_ddt_ditto_write_done(zio_t *zio) -{ - int p = DDT_PHYS_DITTO; - zio_prop_t *zp = &zio->io_prop; - blkptr_t *bp = zio->io_bp; - ddt_t *ddt = ddt_select(zio->io_spa, bp); - ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; - ddt_key_t *ddk = &dde->dde_key; - - ddt_enter(ddt); - - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_lead_zio[p] == zio); - dde->dde_lead_zio[p] = NULL; - - if (zio->io_error == 0) { - ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); - ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); - ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); - if (ddp->ddp_phys_birth != 0) - ddt_phys_free(ddt, ddk, ddp, zio->io_txg); - ddt_phys_fill(ddp, bp); - } - - ddt_exit(ddt); -} - -static zio_t * -zio_ddt_write(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - uint64_t txg = zio->io_txg; - zio_prop_t *zp = &zio->io_prop; - int p = zp->zp_copies; - int ditto_copies; - zio_t *cio = NULL; - zio_t *dio = NULL; - ddt_t *ddt = ddt_select(spa, bp); - ddt_entry_t *dde; - ddt_phys_t *ddp; - - ASSERT(BP_GET_DEDUP(bp)); - ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); - ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); - ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); - - ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); - ddp = &dde->dde_phys[p]; - - if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { - /* - * If we're using a weak checksum, upgrade to a strong checksum - * and try again. If we're already using a strong checksum, - * we can't resolve it, so just convert to an ordinary write. - * (And automatically e-mail a paper to Nature?) 
- */ - if (!(zio_checksum_table[zp->zp_checksum].ci_flags & - ZCHECKSUM_FLAG_DEDUP)) { - zp->zp_checksum = spa_dedup_checksum(spa); - zio_pop_transforms(zio); - zio->io_stage = ZIO_STAGE_OPEN; - BP_ZERO(bp); - } else { - zp->zp_dedup = B_FALSE; - BP_SET_DEDUP(bp, B_FALSE); - } - ASSERT(!BP_GET_DEDUP(bp)); - zio->io_pipeline = ZIO_WRITE_PIPELINE; - ddt_exit(ddt); - return (zio); - } - - ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); - ASSERT(ditto_copies < SPA_DVAS_PER_BP); - - if (ditto_copies > ddt_ditto_copies_present(dde) && - dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { - zio_prop_t czp = *zp; - - czp.zp_copies = ditto_copies; - - /* - * If we arrived here with an override bp, we won't have run - * the transform stack, so we won't have the data we need to - * generate a child i/o. So, toss the override bp and restart. - * This is safe, because using the override bp is just an - * optimization; and it's rare, so the cost doesn't matter. - */ - if (zio->io_bp_override) { - zio_pop_transforms(zio); - zio->io_stage = ZIO_STAGE_OPEN; - zio->io_pipeline = ZIO_WRITE_PIPELINE; - zio->io_bp_override = NULL; - BP_ZERO(bp); - ddt_exit(ddt); - return (zio); - } - - dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, - NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - - zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; - } - - if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { - if (ddp->ddp_phys_birth != 0) - ddt_bp_fill(ddp, bp, txg); - if (dde->dde_lead_zio[p] != NULL) - zio_add_child(zio, dde->dde_lead_zio[p]); - else - ddt_phys_addref(ddp); - } else if (zio->io_bp_override) { - ASSERT(bp->blk_birth == txg); - ASSERT(BP_EQUAL(bp, zio->io_bp_override)); - ddt_phys_fill(ddp, bp); - ddt_phys_addref(ddp); - } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, NULL, - zio_ddt_child_write_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - - zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_lead_zio[p] = cio; - } - - ddt_exit(ddt); - - if (cio) - zio_nowait(cio); - if (dio) - zio_nowait(dio); - - return (zio); -} - -ddt_entry_t *freedde; /* for debugging */ - -static zio_t * -zio_ddt_free(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - blkptr_t *bp = zio->io_bp; - ddt_t *ddt = ddt_select(spa, bp); - ddt_entry_t *dde; - ddt_phys_t *ddp; - - ASSERT(BP_GET_DEDUP(bp)); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - ddt_enter(ddt); - freedde = dde = ddt_lookup(ddt, bp, B_TRUE); - if (dde) { - ddp = ddt_phys_select(dde, bp); - if (ddp) - ddt_phys_decref(ddp); - } - ddt_exit(ddt); - - return (zio); -} - -/* - * ========================================================================== - * Allocate and free blocks - * ========================================================================== - */ - -static zio_t * -zio_io_to_allocate(spa_t *spa, int allocator) -{ - zio_t *zio; - - ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); - - zio = avl_first(&spa->spa_alloc_trees[allocator]); - if (zio == NULL) - return (NULL); - - ASSERT(IO_IS_ALLOCATING(zio)); - - /* - * Try to place a reservation for this zio. If we're unable to - * reserve then we throttle. 
- */ - ASSERT3U(zio->io_allocator, ==, allocator); - if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, - zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { - return (NULL); - } - - avl_remove(&spa->spa_alloc_trees[allocator], zio); - ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); - - return (zio); -} - -static zio_t * -zio_dva_throttle(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - zio_t *nio; - metaslab_class_t *mc; - - /* locate an appropriate allocation class */ - mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, - zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); - - if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || - !mc->mc_alloc_throttle_enabled || - zio->io_child_type == ZIO_CHILD_GANG || - zio->io_flags & ZIO_FLAG_NODATA) { - return (zio); - } - - ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - - ASSERT3U(zio->io_queued_timestamp, >, 0); - ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - - zbookmark_phys_t *bm = &zio->io_bookmark; - /* - * We want to try to use as many allocators as possible to help improve - * performance, but we also want logically adjacent IOs to be physically - * adjacent to improve sequential read performance. We chunk each object - * into 2^20 block regions, and then hash based on the objset, object, - * level, and region to accomplish both of these goals. - */ - zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, - bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - zio->io_metaslab_class = mc; - avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - nio = zio_io_to_allocate(spa, zio->io_allocator); - mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); - - return (nio); -} - -static void -zio_allocate_dispatch(spa_t *spa, int allocator) -{ - zio_t *zio; - - mutex_enter(&spa->spa_alloc_locks[allocator]); - zio = zio_io_to_allocate(spa, allocator); - mutex_exit(&spa->spa_alloc_locks[allocator]); - if (zio == NULL) - return; - - ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); - ASSERT0(zio->io_error); - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); -} - -static zio_t * -zio_dva_allocate(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - metaslab_class_t *mc; - blkptr_t *bp = zio->io_bp; - int error; - int flags = 0; - - if (zio->io_gang_leader == NULL) { - ASSERT(zio->io_child_type > ZIO_CHILD_GANG); - zio->io_gang_leader = zio; - } - - ASSERT(BP_IS_HOLE(bp)); - ASSERT0(BP_GET_NDVAS(bp)); - ASSERT3U(zio->io_prop.zp_copies, >, 0); - ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); - ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - - if (zio->io_flags & ZIO_FLAG_NODATA) - flags |= METASLAB_DONT_THROTTLE; - if (zio->io_flags & ZIO_FLAG_GANG_CHILD) - flags |= METASLAB_GANG_CHILD; - if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) - flags |= METASLAB_ASYNC_ALLOC; - - /* - * if not already chosen, locate an appropriate allocation class - */ - mc = zio->io_metaslab_class; - if (mc == NULL) { - mc = spa_preferred_class(spa, zio->io_size, - zio->io_prop.zp_type, zio->io_prop.zp_level, - zio->io_prop.zp_zpl_smallblk); - zio->io_metaslab_class = mc; - } - - error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio, zio->io_allocator); - - /* - * Fallback to normal class when an alloc class is full - */ - if (error == ENOSPC && mc != spa_normal_class(spa)) { - /* - * If throttling, transfer reservation over to normal class. 
- * The io_allocator slot can remain the same even though we - * are switching classes. - */ - if (mc->mc_alloc_throttle_enabled && - (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { - metaslab_class_throttle_unreserve(mc, - zio->io_prop.zp_copies, zio->io_allocator, zio); - zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; - - mc = spa_normal_class(spa); - VERIFY(metaslab_class_throttle_reserve(mc, - zio->io_prop.zp_copies, zio->io_allocator, zio, - flags | METASLAB_MUST_RESERVE)); - } else { - mc = spa_normal_class(spa); - } - zio->io_metaslab_class = mc; - - error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio, zio->io_allocator); - } - - if (error != 0) { - zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " - "size %llu, error %d", spa_name(spa), zio, zio->io_size, - error); - if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) - return (zio_write_gang_block(zio)); - zio->io_error = error; - } - - return (zio); -} - -static zio_t * -zio_dva_free(zio_t *zio) -{ - metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); - - return (zio); -} - -static zio_t * -zio_dva_claim(zio_t *zio) -{ - int error; - - error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); - if (error) - zio->io_error = error; - - return (zio); -} - -/* - * Undo an allocation. This is used by zio_done() when an I/O fails - * and we want to give back the block we just allocated. - * This handles both normal blocks and gang blocks. - */ -static void -zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) -{ - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); - ASSERT(zio->io_bp_override == NULL); - - if (!BP_IS_HOLE(bp)) - metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); - - if (gn != NULL) { - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - zio_dva_unallocate(zio, gn->gn_child[g], - &gn->gn_gbh->zg_blkptr[g]); - } - } -} - -/* - * Try to allocate an intent log block. Return 0 on success, errno on failure. - */ -int -zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t size, boolean_t *slog) -{ - int error = 1; - zio_alloc_list_t io_alloc_list; - - ASSERT(txg > spa_syncing_txg(spa)); - - metaslab_trace_init(&io_alloc_list); - - /* - * Block pointer fields are useful to metaslabs for stats and debugging. - * Fill in the obvious ones before calling into metaslab_alloc(). - */ - BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); - BP_SET_PSIZE(new_bp, size); - BP_SET_LEVEL(new_bp, 0); - - /* - * When allocating a zil block, we don't have information about - * the final destination of the block except the objset it's part - * of, so we just hash the objset ID to pick the allocator to get - * some parallelism. - */ - error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL, - cityhash4(0, 0, 0, objset) % spa->spa_alloc_count); - if (error == 0) { - *slog = TRUE; - } else { - error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, - &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) % - spa->spa_alloc_count); - if (error == 0) - *slog = FALSE; - } - metaslab_trace_fini(&io_alloc_list); - - if (error == 0) { - BP_SET_LSIZE(new_bp, size); - BP_SET_PSIZE(new_bp, size); - BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(new_bp, - spa_version(spa) >= SPA_VERSION_SLIM_ZIL - ? 
ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); - BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); - BP_SET_LEVEL(new_bp, 0); - BP_SET_DEDUP(new_bp, 0); - BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); - } else { - zfs_dbgmsg("%s: zil block allocation failure: " - "size %llu, error %d", spa_name(spa), size, error); - } - - return (error); -} - -/* - * ========================================================================== - * Read, write and delete to physical devices - * ========================================================================== - */ - - -/* - * Issue an I/O to the underlying vdev. Typically the issue pipeline - * stops after this stage and will resume upon I/O completion. - * However, there are instances where the vdev layer may need to - * continue the pipeline when an I/O was not issued. Since the I/O - * that was sent to the vdev layer might be different than the one - * currently active in the pipeline (see vdev_queue_io()), we explicitly - * force the underlying vdev layers to call either zio_execute() or - * zio_interrupt() to ensure that the pipeline continues with the correct I/O. - */ -static zio_t * -zio_vdev_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - uint64_t align; - spa_t *spa = zio->io_spa; - int ret; - - ASSERT(zio->io_error == 0); - ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); - - if (vd == NULL) { - if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) - spa_config_enter(spa, SCL_ZIO, zio, RW_READER); - - /* - * The mirror_ops handle multiple DVAs in a single BP. - */ - vdev_mirror_ops.vdev_op_io_start(zio); - return (NULL); - } - - if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && - zio->io_priority == ZIO_PRIORITY_NOW) { - trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); - return (zio); - } - - ASSERT3P(zio->io_logical, !=, zio); - if (zio->io_type == ZIO_TYPE_WRITE) { - ASSERT(spa->spa_trust_config); - - if (zio->io_vd->vdev_removing) { - /* - * Note: the code can handle other kinds of writes, - * but we don't expect them. - */ - ASSERT(zio->io_flags & - (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | - ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); - } - } - - /* - * We keep track of time-sensitive I/Os so that the scan thread - * can quickly react to certain workloads. In particular, we care - * about non-scrubbing, top-level reads and writes with the following - * characteristics: - * - synchronous writes of user data to non-slog devices - * - any reads of user data - * When these conditions are met, adjust the timestamp of spa_last_io - * which allows the scan thread to adjust its workload accordingly. - */ - if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && - vd == vd->vdev_top && !vd->vdev_islog && - zio->io_bookmark.zb_objset != DMU_META_OBJSET && - zio->io_txg != spa_syncing_txg(spa)) { - uint64_t old = spa->spa_last_io; - uint64_t new = ddi_get_lbolt64(); - if (old != new) - (void) atomic_cas_64(&spa->spa_last_io, old, new); - } - align = 1ULL << vd->vdev_top->vdev_ashift; - - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && - P2PHASE(zio->io_size, align) != 0) { - /* Transform logical writes to be a full physical block size. 
*/ - uint64_t asize = P2ROUNDUP(zio->io_size, align); - abd_t *abuf = NULL; - if (zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE) - abuf = abd_alloc_sametype(zio->io_abd, asize); - ASSERT(vd == vd->vdev_top); - if (zio->io_type == ZIO_TYPE_WRITE) { - abd_copy(abuf, zio->io_abd, zio->io_size); - abd_zero_off(abuf, zio->io_size, asize - zio->io_size); - } - zio_push_transform(zio, abuf, asize, abuf ? asize : 0, - zio_subblock); - } - - /* - * If this is not a physical io, make sure that it is properly aligned - * before proceeding. - */ - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { - ASSERT0(P2PHASE(zio->io_offset, align)); - ASSERT0(P2PHASE(zio->io_size, align)); - } else { - /* - * For the physical io we allow alignment - * to a logical block size. - */ - uint64_t log_align = - 1ULL << vd->vdev_top->vdev_logical_ashift; - ASSERT0(P2PHASE(zio->io_offset, log_align)); - ASSERT0(P2PHASE(zio->io_size, log_align)); - } - - VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); - - /* - * If this is a repair I/O, and there's no self-healing involved -- - * that is, we're just resilvering what we expect to resilver -- - * then don't do the I/O unless zio's txg is actually in vd's DTL. - * This prevents spurious resilvering. - * - * There are a few ways that we can end up creating these spurious - * resilver i/os: - * - * 1. A resilver i/o will be issued if any DVA in the BP has a - * dirty DTL. The mirror code will issue resilver writes to - * each DVA, including the one(s) that are not on vdevs with dirty - * DTLs. - * - * 2. With nested replication, which happens when we have a - * "replacing" or "spare" vdev that's a child of a mirror or raidz. - * For example, given mirror(replacing(A+B), C), it's likely that - * only A is out of date (it's the new device). In this case, we'll - * read from C, then use the data to resilver A+B -- but we don't - * actually want to resilver B, just A. The top-level mirror has no - * way to know this, so instead we just discard unnecessary repairs - * as we work our way down the vdev tree. - * - * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. - * The same logic applies to any form of nested replication: ditto - * + mirror, RAID-Z + replacing, etc. - * - * However, indirect vdevs point off to other vdevs which may have - * DTL's, so we never bypass them. The child i/os on concrete vdevs - * will be properly bypassed instead. - */ - if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && - !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && - zio->io_txg != 0 && /* not a delegated i/o */ - vd->vdev_ops != &vdev_indirect_ops && - !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - zio_vdev_io_bypass(zio); - return (zio); - } - - if (vd->vdev_ops->vdev_op_leaf) { - switch (zio->io_type) { - case ZIO_TYPE_READ: - if (vdev_cache_read(zio)) - return (zio); - /* FALLTHROUGH */ - case ZIO_TYPE_WRITE: - case ZIO_TYPE_FREE: - if ((zio = vdev_queue_io(zio)) == NULL) - return (NULL); - - if (!vdev_accessible(vd, zio)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return (NULL); - } - break; - } - /* - * Note that we ignore repair writes for TRIM because they can - * conflict with normal writes. This isn't an issue because, by - * definition, we only repair blocks that aren't freed. 
- */ - if (zio->io_type == ZIO_TYPE_WRITE && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && - !trim_map_write_start(zio)) - return (NULL); - } - - vd->vdev_ops->vdev_op_io_start(zio); - return (NULL); -} - -static zio_t * -zio_vdev_io_done(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; - boolean_t unexpected_error = B_FALSE; - - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (NULL); - } - - ASSERT(zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); - - if (vd != NULL && vd->vdev_ops->vdev_op_leaf && - (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || - zio->io_type == ZIO_TYPE_FREE)) { - - if (zio->io_type == ZIO_TYPE_WRITE && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) - trim_map_write_done(zio); - - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(vd, - zio, EIO); - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_label_injection(zio, EIO); - - if (zio->io_error) { - if (zio->io_error == ENOTSUP && - zio->io_type == ZIO_TYPE_FREE) { - /* Not all devices support TRIM. */ - } else if (!vdev_accessible(vd, zio)) { - zio->io_error = SET_ERROR(ENXIO); - } else { - unexpected_error = B_TRUE; - } - } - } - - ops->vdev_op_io_done(zio); - - if (unexpected_error) - VERIFY(vdev_probe(vd, zio) == NULL); - - return (zio); -} - -/* - * This function is used to change the priority of an existing zio that is - * currently in-flight. This is used by the arc to upgrade priority in the - * event that a demand read is made for a block that is currently queued - * as a scrub or async read IO. Otherwise, the high priority read request - * would end up having to wait for the lower priority IO. - */ -void -zio_change_priority(zio_t *pio, zio_priority_t priority) -{ - zio_t *cio, *cio_next; - zio_link_t *zl = NULL; - - ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - - if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { - vdev_queue_change_io_priority(pio, priority); - } else { - pio->io_priority = priority; - } - - mutex_enter(&pio->io_lock); - for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { - cio_next = zio_walk_children(pio, &zl); - zio_change_priority(cio, priority); - } - mutex_exit(&pio->io_lock); -} - -/* - * For non-raidz ZIOs, we can just copy aside the bad data read from the - * disk, and use that to finish the checksum ereport later. 
- */ -static void -zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, - const void *good_buf) -{ - /* no processing needed */ - zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); -} - -/*ARGSUSED*/ -void -zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) -{ - void *buf = zio_buf_alloc(zio->io_size); - - abd_copy_to_buf(buf, zio->io_abd, zio->io_size); - - zcr->zcr_cbinfo = zio->io_size; - zcr->zcr_cbdata = buf; - zcr->zcr_finish = zio_vsd_default_cksum_finish; - zcr->zcr_free = zio_buf_free; -} - -static zio_t * -zio_vdev_io_assess(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - - if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (NULL); - } - - if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) - spa_config_exit(zio->io_spa, SCL_ZIO, zio); - - if (zio->io_vsd != NULL) { - zio->io_vsd_ops->vsd_free(zio); - zio->io_vsd = NULL; - } - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_fault_injection(zio, EIO); - - if (zio->io_type == ZIO_TYPE_FREE && - zio->io_priority != ZIO_PRIORITY_NOW) { - switch (zio->io_error) { - case 0: - ZIO_TRIM_STAT_INCR(bytes, zio->io_size); - ZIO_TRIM_STAT_BUMP(success); - break; - case EOPNOTSUPP: - ZIO_TRIM_STAT_BUMP(unsupported); - break; - default: - ZIO_TRIM_STAT_BUMP(failed); - break; - } - } - - /* - * If the I/O failed, determine whether we should attempt to retry it. - * - * On retry, we cut in line in the issue queue, since we don't want - * compression/checksumming/etc. work to prevent our (cheap) IO reissue. - */ - if (zio->io_error && vd == NULL && - !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { - ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ - zio->io_error = 0; - zio->io_flags |= ZIO_FLAG_IO_RETRY | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; - zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, - zio_requeue_io_start_cut_in_line); - return (NULL); - } - - /* - * If we got an error on a leaf device, convert it to ENXIO - * if the device is not accessible at all. - */ - if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && - !vdev_accessible(vd, zio)) - zio->io_error = SET_ERROR(ENXIO); - - /* - * If we can't write to an interior vdev (mirror or RAID-Z), - * set vdev_cant_write so that we stop trying to allocate from it. - */ - if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && - vd != NULL && !vd->vdev_ops->vdev_op_leaf) { - vd->vdev_cant_write = B_TRUE; - } - - /* - * If a cache flush returns ENOTSUP or ENOTTY, we know that no future - * attempts will ever succeed. In this case we set a persistent bit so - * that we don't bother with it in the future. 
- */ - if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_IOCTL && - zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) - vd->vdev_nowritecache = B_TRUE; - - if (zio->io_error) - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - if (vd != NULL && vd->vdev_ops->vdev_op_leaf && - zio->io_physdone != NULL) { - ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); - ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); - zio->io_physdone(zio->io_logical); - } - - return (zio); -} - -void -zio_vdev_io_reissue(zio_t *zio) -{ - ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); - ASSERT(zio->io_error == 0); - - zio->io_stage >>= 1; -} - -void -zio_vdev_io_redone(zio_t *zio) -{ - ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); - - zio->io_stage >>= 1; -} - -void -zio_vdev_io_bypass(zio_t *zio) -{ - ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); - ASSERT(zio->io_error == 0); - - zio->io_flags |= ZIO_FLAG_IO_BYPASS; - zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; -} - -/* - * ========================================================================== - * Generate and verify checksums - * ========================================================================== - */ -static zio_t * -zio_checksum_generate(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - enum zio_checksum checksum; - - if (bp == NULL) { - /* - * This is zio_write_phys(). - * We're either generating a label checksum, or none at all. - */ - checksum = zio->io_prop.zp_checksum; - - if (checksum == ZIO_CHECKSUM_OFF) - return (zio); - - ASSERT(checksum == ZIO_CHECKSUM_LABEL); - } else { - if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { - ASSERT(!IO_IS_ALLOCATING(zio)); - checksum = ZIO_CHECKSUM_GANG_HEADER; - } else { - checksum = BP_GET_CHECKSUM(bp); - } - } - - zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); - - return (zio); -} - -static zio_t * -zio_checksum_verify(zio_t *zio) -{ - zio_bad_cksum_t info; - blkptr_t *bp = zio->io_bp; - int error; - - ASSERT(zio->io_vd != NULL); - - if (bp == NULL) { - /* - * This is zio_read_phys(). - * We're either verifying a label checksum, or nothing at all. - */ - if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) - return (zio); - - ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); - } - - if ((error = zio_checksum_error(zio, &info)) != 0) { - zio->io_error = error; - if (error == ECKSUM && - !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - zfs_ereport_start_checksum(zio->io_spa, - zio->io_vd, zio, zio->io_offset, - zio->io_size, NULL, &info); - } - } - - return (zio); -} - -/* - * Called by RAID-Z to ensure we don't compute the checksum twice. - */ -void -zio_checksum_verified(zio_t *zio) -{ - zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; -} - -/* - * ========================================================================== - * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. - * An error of 0 indicates success. ENXIO indicates whole-device failure, - * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO - * indicate errors that are specific to one I/O, and most likely permanent. - * Any other error is presumed to be worse because we weren't expecting it. 
- * ========================================================================== - */ -int -zio_worst_error(int e1, int e2) -{ - static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; - int r1, r2; - - for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) - if (e1 == zio_error_rank[r1]) - break; - - for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) - if (e2 == zio_error_rank[r2]) - break; - - return (r1 > r2 ? e1 : e2); -} - -/* - * ========================================================================== - * I/O completion - * ========================================================================== - */ -static zio_t * -zio_ready(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - zio_t *pio, *pio_next; - zio_link_t *zl = NULL; - - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, - ZIO_WAIT_READY)) { - return (NULL); - } - - if (zio->io_ready) { - ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || - (zio->io_flags & ZIO_FLAG_NOPWRITE)); - ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); - - zio->io_ready(zio); - } - - if (bp != NULL && bp != &zio->io_bp_copy) - zio->io_bp_copy = *bp; - - if (zio->io_error != 0) { - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(zio->io_metaslab_class != NULL); - - /* - * We were unable to allocate anything, unreserve and - * issue the next I/O to allocate. - */ - metaslab_class_throttle_unreserve( - zio->io_metaslab_class, zio->io_prop.zp_copies, - zio->io_allocator, zio); - zio_allocate_dispatch(zio->io_spa, zio->io_allocator); - } - } - - mutex_enter(&zio->io_lock); - zio->io_state[ZIO_WAIT_READY] = 1; - pio = zio_walk_parents(zio, &zl); - mutex_exit(&zio->io_lock); - - /* - * As we notify zio's parents, new parents could be added. - * New parents go to the head of zio's io_parent_list, however, - * so we will (correctly) not notify them. The remainder of zio's - * io_parent_list, from 'pio_next' onward, cannot change because - * all parents must wait for us to be done before they can be done. - */ - for (; pio != NULL; pio = pio_next) { - pio_next = zio_walk_parents(zio, &zl); - zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); - } - - if (zio->io_flags & ZIO_FLAG_NODATA) { - if (BP_IS_GANG(bp)) { - zio->io_flags &= ~ZIO_FLAG_NODATA; - } else { - ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); - zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; - } - } - - if (zio_injection_enabled && - zio->io_spa->spa_syncing_txg == zio->io_txg) - zio_handle_ignored_writes(zio); - - return (zio); -} - -/* - * Update the allocation throttle accounting. 
- */ -static void -zio_dva_throttle_done(zio_t *zio) -{ - zio_t *lio = zio->io_logical; - zio_t *pio = zio_unique_parent(zio); - vdev_t *vd = zio->io_vd; - int flags = METASLAB_ASYNC_ALLOC; - - ASSERT3P(zio->io_bp, !=, NULL); - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); - ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); - ASSERT(vd != NULL); - ASSERT3P(vd, ==, vd->vdev_top); - ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); - ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); - ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); - ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); - - /* - * Parents of gang children can have two flavors -- ones that - * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) - * and ones that allocated the constituent blocks. The allocation - * throttle needs to know the allocating parent zio so we must find - * it here. - */ - if (pio->io_child_type == ZIO_CHILD_GANG) { - /* - * If our parent is a rewrite gang child then our grandparent - * would have been the one that performed the allocation. - */ - if (pio->io_flags & ZIO_FLAG_IO_REWRITE) - pio = zio_unique_parent(pio); - flags |= METASLAB_GANG_CHILD; - } - - ASSERT(IO_IS_ALLOCATING(pio)); - ASSERT3P(zio, !=, zio->io_logical); - ASSERT(zio->io_logical != NULL); - ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); - ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); - ASSERT(zio->io_metaslab_class != NULL); - - mutex_enter(&pio->io_lock); - metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, - pio->io_allocator, B_TRUE); - mutex_exit(&pio->io_lock); - - metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, - pio->io_allocator, pio); - - /* - * Call into the pipeline to see if there is more work that - * needs to be done. If there is work to be done it will be - * dispatched to another taskq thread. - */ - zio_allocate_dispatch(zio->io_spa, pio->io_allocator); -} - -static zio_t * -zio_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - zio_t *lio = zio->io_logical; - blkptr_t *bp = zio->io_bp; - vdev_t *vd = zio->io_vd; - uint64_t psize = zio->io_size; - zio_t *pio, *pio_next; - zio_link_t *zl = NULL; - - /* - * If our children haven't all completed, - * wait for them and then repeat this pipeline stage. - */ - if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { - return (NULL); - } - - /* - * If the allocation throttle is enabled, then update the accounting. - * We only track child I/Os that are part of an allocating async - * write. We must do this since the allocation is performed - * by the logical I/O but the actual write is done by child I/Os. - */ - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && - zio->io_child_type == ZIO_CHILD_VDEV) { - ASSERT(zio->io_metaslab_class != NULL); - ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); - zio_dva_throttle_done(zio); - } - - /* - * If the allocation throttle is enabled, verify that - * we have decremented the refcounts for every I/O that was throttled. 
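The accounting above boils down to a reserve/release pattern: an allocating async write reserves throttle capacity before its DVAs are chosen, and the completing child vdev write releases it and dispatches the next queued allocation. A compact standalone sketch of that pattern, with illustrative names and pthreads in place of the metaslab class interfaces:

    #include <pthread.h>
    #include <stdbool.h>

    typedef struct alloc_throttle {
    	pthread_mutex_t	at_lock;
    	unsigned	at_in_flight;	/* currently reserved slots */
    	unsigned	at_max;		/* maximum concurrent reservations */
    } alloc_throttle_t;

    /* Try to reserve 'n' slots; the caller queues the I/O if this fails. */
    static bool
    throttle_reserve(alloc_throttle_t *at, unsigned n)
    {
    	bool ok;

    	pthread_mutex_lock(&at->at_lock);
    	ok = (at->at_in_flight + n <= at->at_max);
    	if (ok)
    		at->at_in_flight += n;
    	pthread_mutex_unlock(&at->at_lock);
    	return (ok);
    }

    /* Release 'n' slots when the write that held them completes. */
    static void
    throttle_unreserve(alloc_throttle_t *at, unsigned n)
    {
    	pthread_mutex_lock(&at->at_lock);
    	at->at_in_flight -= n;
    	pthread_mutex_unlock(&at->at_lock);
    	/* The real code dispatches the next queued allocation here. */
    }

For gang blocks the release must be charged to the parent that actually performed the allocation, which is why zio_dva_throttle_done() above walks past rewrite gang children to find it.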
- */ - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); - ASSERT(bp != NULL); - - metaslab_group_alloc_verify(spa, zio->io_bp, zio, - zio->io_allocator); - VERIFY(zfs_refcount_not_held( - &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], - zio)); - } - - for (int c = 0; c < ZIO_CHILD_TYPES; c++) - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - ASSERT(zio->io_children[c][w] == 0); - - if (bp != NULL && !BP_IS_EMBEDDED(bp)) { - ASSERT(bp->blk_pad[0] == 0); - ASSERT(bp->blk_pad[1] == 0); - ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || - (bp == zio_unique_parent(zio)->io_bp)); - if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && - zio->io_bp_override == NULL && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { - ASSERT(!BP_SHOULD_BYTESWAP(bp)); - ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); - ASSERT(BP_COUNT_GANG(bp) == 0 || - (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); - } - if (zio->io_flags & ZIO_FLAG_NOPWRITE) - VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); - } - - /* - * If there were child vdev/gang/ddt errors, they apply to us now. - */ - zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); - zio_inherit_child_errors(zio, ZIO_CHILD_GANG); - zio_inherit_child_errors(zio, ZIO_CHILD_DDT); - - /* - * If the I/O on the transformed data was successful, generate any - * checksum reports now while we still have the transformed data. - */ - if (zio->io_error == 0) { - while (zio->io_cksum_report != NULL) { - zio_cksum_report_t *zcr = zio->io_cksum_report; - uint64_t align = zcr->zcr_align; - uint64_t asize = P2ROUNDUP(psize, align); - char *abuf = NULL; - abd_t *adata = zio->io_abd; - - if (asize != psize) { - adata = abd_alloc_linear(asize, B_TRUE); - abd_copy(adata, zio->io_abd, psize); - abd_zero_off(adata, psize, asize - psize); - } - - if (adata != NULL) - abuf = abd_borrow_buf_copy(adata, asize); - - zio->io_cksum_report = zcr->zcr_next; - zcr->zcr_next = NULL; - zcr->zcr_finish(zcr, abuf); - zfs_ereport_free_checksum(zcr); - - if (adata != NULL) - abd_return_buf(adata, abuf, asize); - - if (asize != psize) - abd_free(adata); - } - } - - zio_pop_transforms(zio); /* note: may set zio->io_error */ - - vdev_stat_update(zio, psize); - - if (zio->io_error) { - /* - * If this I/O is attached to a particular vdev, - * generate an error message describing the I/O failure - * at the block level. We ignore these errors if the - * device is currently unavailable. - */ - if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) - zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); - - if ((zio->io_error == EIO || !(zio->io_flags & - (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && - zio == lio) { - /* - * For logical I/O requests, tell the SPA to log the - * error and generate a logical data ereport. - */ - spa_log_error(spa, zio); - zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, - 0, 0); - } - } - - if (zio->io_error && zio == lio) { - /* - * Determine whether zio should be reexecuted. This will - * propagate all the way to the root via zio_notify_parent(). 
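The checksum-report loop above pads the transformed data out to the report's alignment before calling the finish callback. A standalone illustration of that round-up-and-zero-fill step, using malloc instead of ABDs; P2ROUNDUP assumes a power-of-two alignment:

    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    /* Round x up to the next multiple of align (align must be a power of two). */
    #define	P2ROUNDUP(x, align)	(-(-(uint64_t)(x) & -(uint64_t)(align)))

    /* Return a copy of buf zero-padded out to the aligned size. */
    static void *
    pad_for_report(const void *buf, uint64_t psize, uint64_t align,
        uint64_t *asizep)
    {
    	uint64_t asize = P2ROUNDUP(psize, align);
    	void *padded = malloc(asize);

    	if (padded != NULL) {
    		memcpy(padded, buf, psize);
    		memset((char *)padded + psize, 0, asize - psize);
    	}
    	*asizep = asize;
    	return (padded);
    }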
- */ - ASSERT(vd == NULL && bp != NULL); - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - if (IO_IS_ALLOCATING(zio) && - !(zio->io_flags & ZIO_FLAG_CANFAIL)) { - if (zio->io_error != ENOSPC) - zio->io_reexecute |= ZIO_REEXECUTE_NOW; - else - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; - } - - if ((zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_FREE) && - !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && - zio->io_error == ENXIO && - spa_load_state(spa) == SPA_LOAD_NONE && - spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; - - if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; - - /* - * Here is a possibly good place to attempt to do - * either combinatorial reconstruction or error correction - * based on checksums. It also might be a good place - * to send out preliminary ereports before we suspend - * processing. - */ - } - - /* - * If there were logical child errors, they apply to us now. - * We defer this until now to avoid conflating logical child - * errors with errors that happened to the zio itself when - * updating vdev stats and reporting FMA events above. - */ - zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); - - if ((zio->io_error || zio->io_reexecute) && - IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && - !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) - zio_dva_unallocate(zio, zio->io_gang_tree, bp); - - zio_gang_tree_free(&zio->io_gang_tree); - - /* - * Godfather I/Os should never suspend. - */ - if ((zio->io_flags & ZIO_FLAG_GODFATHER) && - (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) - zio->io_reexecute = 0; - - if (zio->io_reexecute) { - /* - * This is a logical I/O that wants to reexecute. - * - * Reexecute is top-down. When an i/o fails, if it's not - * the root, it simply notifies its parent and sticks around. - * The parent, seeing that it still has children in zio_done(), - * does the same. This percolates all the way up to the root. - * The root i/o will reexecute or suspend the entire tree. - * - * This approach ensures that zio_reexecute() honors - * all the original i/o dependency relationships, e.g. - * parents not executing until children are ready. - */ - ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - - zio->io_gang_leader = NULL; - - mutex_enter(&zio->io_lock); - zio->io_state[ZIO_WAIT_DONE] = 1; - mutex_exit(&zio->io_lock); - - /* - * "The Godfather" I/O monitors its children but is - * not a true parent to them. It will track them through - * the pipeline but severs its ties whenever they get into - * trouble (e.g. suspended). This allows "The Godfather" - * I/O to return status without blocking. - */ - zl = NULL; - for (pio = zio_walk_parents(zio, &zl); pio != NULL; - pio = pio_next) { - zio_link_t *remove_zl = zl; - pio_next = zio_walk_parents(zio, &zl); - - if ((pio->io_flags & ZIO_FLAG_GODFATHER) && - (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { - zio_remove_child(pio, zio, remove_zl); - /* - * This is a rare code path, so we don't - * bother with "next_to_execute". - */ - zio_notify_parent(pio, zio, ZIO_WAIT_DONE, - NULL); - } - } - - if ((pio = zio_unique_parent(zio)) != NULL) { - /* - * We're not a root i/o, so there's nothing to do - * but notify our parent. Don't propagate errors - * upward since we haven't permanently failed yet. 
- */ - ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); - zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; - /* - * This is a rare code path, so we don't bother with - * "next_to_execute". - */ - zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); - } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { - /* - * We'd fail again if we reexecuted now, so suspend - * until conditions improve (e.g. device comes online). - */ - zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); - } else { - /* - * Reexecution is potentially a huge amount of work. - * Hand it off to the otherwise-unused claim taskq. - */ -#if defined(illumos) || !defined(_KERNEL) - ASSERT(zio->io_tqent.tqent_next == NULL); -#else - ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); -#endif - spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, - ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, - 0, &zio->io_tqent); - } - return (NULL); - } - - ASSERT(zio->io_child_count == 0); - ASSERT(zio->io_reexecute == 0); - ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); - - /* - * Report any checksum errors, since the I/O is complete. - */ - while (zio->io_cksum_report != NULL) { - zio_cksum_report_t *zcr = zio->io_cksum_report; - zio->io_cksum_report = zcr->zcr_next; - zcr->zcr_next = NULL; - zcr->zcr_finish(zcr, NULL); - zfs_ereport_free_checksum(zcr); - } - - /* - * It is the responsibility of the done callback to ensure that this - * particular zio is no longer discoverable for adoption, and as - * such, cannot acquire any new parents. - */ - if (zio->io_done) - zio->io_done(zio); - - mutex_enter(&zio->io_lock); - zio->io_state[ZIO_WAIT_DONE] = 1; - mutex_exit(&zio->io_lock); - - /* - * We are done executing this zio. We may want to execute a parent - * next. See the comment in zio_notify_parent(). - */ - zio_t *next_to_execute = NULL; - zl = NULL; - for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { - zio_link_t *remove_zl = zl; - pio_next = zio_walk_parents(zio, &zl); - zio_remove_child(pio, zio, remove_zl); - zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); - } - - if (zio->io_waiter != NULL) { - mutex_enter(&zio->io_lock); - zio->io_executor = NULL; - cv_broadcast(&zio->io_cv); - mutex_exit(&zio->io_lock); - } else { - zio_destroy(zio); - } - - return (next_to_execute); -} - -/* - * ========================================================================== - * I/O pipeline definition - * ========================================================================== - */ -static zio_pipe_stage_t *zio_pipeline[] = { - NULL, - zio_read_bp_init, - zio_write_bp_init, - zio_free_bp_init, - zio_issue_async, - zio_write_compress, - zio_checksum_generate, - zio_nop_write, - zio_ddt_read_start, - zio_ddt_read_done, - zio_ddt_write, - zio_ddt_free, - zio_gang_assemble, - zio_gang_issue, - zio_dva_throttle, - zio_dva_allocate, - zio_dva_free, - zio_dva_claim, - zio_ready, - zio_vdev_io_start, - zio_vdev_io_done, - zio_vdev_io_assess, - zio_checksum_verify, - zio_done -}; - - - - -/* - * Compare two zbookmark_phys_t's to see which we would reach first in a - * pre-order traversal of the object tree. - * - * This is simple in every case aside from the meta-dnode object. For all other - * objects, we traverse them in order (object 1 before object 2, and so on). - * However, all of these objects are traversed while traversing object 0, since - * the data it points to is the list of objects. Thus, we need to convert to a - * canonical representation so we can compare meta-dnode bookmarks to - * non-meta-dnode bookmarks. 
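The zio_pipeline[] table above is what drives execution: each entry advances the I/O one stage, and a stage returning NULL tells the executor to stop and wait for an asynchronous event before re-entering at the same point. A stripped-down sketch of that dispatch loop with illustrative stages (the real executor selects stages through the io_pipeline bit mask rather than a plain index, but the control flow is the same):

    #include <stddef.h>
    #include <stdio.h>

    struct io;
    typedef struct io *(*stage_fn_t)(struct io *);

    struct io {
    	size_t	io_stage;	/* index of the next stage to run */
    	int	io_error;
    };

    static struct io *stage_open(struct io *io)  { puts("open");  return (io); }
    static struct io *stage_issue(struct io *io) { puts("issue"); return (io); }
    static struct io *stage_done(struct io *io)  { puts("done");  return (io); }

    static const stage_fn_t pipeline[] = { stage_open, stage_issue, stage_done };
    #define	NSTAGES	(sizeof (pipeline) / sizeof (pipeline[0]))

    /* Run stages in order; a NULL return means "suspended, resume later". */
    static void
    execute(struct io *io)
    {
    	while (io->io_stage < NSTAGES) {
    		size_t cur = io->io_stage++;
    		if (pipeline[cur](io) == NULL)
    			return;	/* a later event re-enters execute() */
    	}
    }

    int
    main(void)
    {
    	struct io io = { 0, 0 };
    	execute(&io);
    	return (0);
    }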
- * - * We do this by calculating "equivalents" for each field of the zbookmark. - * zbookmarks outside of the meta-dnode use their own object and level, and - * calculate the level 0 equivalent (the first L0 blkid that is contained in the - * blocks this bookmark refers to) by multiplying their blkid by their span - * (the number of L0 blocks contained within one block at their level). - * zbookmarks inside the meta-dnode calculate their object equivalent - * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use - * level + 1<<31 (any value larger than a level could ever be) for their level. - * This causes them to always compare before a bookmark in their object - * equivalent, compare appropriately to bookmarks in other objects, and to - * compare appropriately to other bookmarks in the meta-dnode. - */ -int -zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, - const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) -{ - /* - * These variables represent the "equivalent" values for the zbookmark, - * after converting zbookmarks inside the meta dnode to their - * normal-object equivalents. - */ - uint64_t zb1obj, zb2obj; - uint64_t zb1L0, zb2L0; - uint64_t zb1level, zb2level; - - if (zb1->zb_object == zb2->zb_object && - zb1->zb_level == zb2->zb_level && - zb1->zb_blkid == zb2->zb_blkid) - return (0); - - /* - * BP_SPANB calculates the span in blocks. - */ - zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); - zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); - zb1L0 = 0; - zb1level = zb1->zb_level + COMPARE_META_LEVEL; - } else { - zb1obj = zb1->zb_object; - zb1level = zb1->zb_level; - } - - if (zb2->zb_object == DMU_META_DNODE_OBJECT) { - zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); - zb2L0 = 0; - zb2level = zb2->zb_level + COMPARE_META_LEVEL; - } else { - zb2obj = zb2->zb_object; - zb2level = zb2->zb_level; - } - - /* Now that we have a canonical representation, do the comparison. */ - if (zb1obj != zb2obj) - return (zb1obj < zb2obj ? -1 : 1); - else if (zb1L0 != zb2L0) - return (zb1L0 < zb2L0 ? -1 : 1); - else if (zb1level != zb2level) - return (zb1level > zb2level ? -1 : 1); - /* - * This can (theoretically) happen if the bookmarks have the same object - * and level, but different blkids, if the block sizes are not the same. - * There is presently no way to change the indirect block sizes - */ - return (0); -} - -/* - * This function checks the following: given that last_block is the place that - * our traversal stopped last time, does that guarantee that we've visited - * every node under subtree_root? Therefore, we can't just use the raw output - * of zbookmark_compare. We have to pass in a modified version of - * subtree_root; by incrementing the block id, and then checking whether - * last_block is before or equal to that, we can tell whether or not having - * visited last_block implies that all of subtree_root's children have been - * visited. - */ -boolean_t -zbookmark_subtree_completed(const dnode_phys_t *dnp, - const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) -{ - zbookmark_phys_t mod_zb = *subtree_root; - mod_zb.zb_blkid++; - ASSERT(last_block->zb_level == 0); - - /* The objset_phys_t isn't before anything. 
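A sketch of the canonicalization step described above, with span and dnodes_per_block standing in for the BP_SPANB() and datablkszsec/DNODE_SHIFT arithmetic; once both bookmarks are in this form, comparison is a plain lexicographic test on (object, L0, level):

    #include <stdint.h>

    #define	COMPARE_META_LEVEL	(1ULL << 31)	/* larger than any real level */

    typedef struct canon_bookmark {
    	uint64_t cb_obj;	/* object-number equivalent */
    	uint64_t cb_l0;		/* first level-0 blkid covered */
    	uint64_t cb_level;	/* level, biased for the meta-dnode */
    } canon_bookmark_t;

    /*
     * Convert (object, level, blkid) to the canonical form described above.
     * 'span' is the number of L0 blocks covered by one block at this level and
     * 'dnodes_per_block' is how many dnodes fit in one meta-dnode data block;
     * both come from the dataset's block-size parameters in the real code.
     */
    static canon_bookmark_t
    canonicalize(uint64_t object, uint64_t level, uint64_t blkid,
        uint64_t span, uint64_t dnodes_per_block, int is_meta_dnode)
    {
    	canon_bookmark_t cb;
    	uint64_t l0 = blkid * span;

    	if (is_meta_dnode) {
    		cb.cb_obj = l0 * dnodes_per_block;
    		cb.cb_l0 = 0;
    		cb.cb_level = level + COMPARE_META_LEVEL;
    	} else {
    		cb.cb_obj = object;
    		cb.cb_l0 = l0;
    		cb.cb_level = level;
    	}
    	return (cb);
    }

Ordering then compares cb_obj first, then cb_l0, and finally prefers the higher cb_level, exactly as zbookmark_compare() does above.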
*/ - if (dnp == NULL) - return (B_FALSE); - - /* - * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the - * data block size in sectors, because that variable is only used if - * the bookmark refers to a block in the meta-dnode. Since we don't - * know without examining it what object it refers to, and there's no - * harm in passing in this value in other cases, we always pass it in. - * - * We pass in 0 for the indirect block size shift because zb2 must be - * level 0. The indirect block size is only used to calculate the span - * of the bookmark, but since the bookmark must be level 0, the span is - * always 1, so the math works out. - * - * If you make changes to how the zbookmark_compare code works, be sure - * to make sure that this code still works afterwards. - */ - return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, - 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, - last_block) <= 0); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c +++ /dev/null @@ -1,475 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Checksum vectors. - * - * In the SPA, everything is checksummed. We support checksum vectors - * for three distinct reasons: - * - * 1. Different kinds of data need different levels of protection. - * For SPA metadata, we always want a very strong checksum. - * For user data, we let users make the trade-off between speed - * and checksum strength. - * - * 2. Cryptographic hash and MAC algorithms are an area of active research. - * It is likely that in future hash functions will be at least as strong - * as current best-of-breed, and may be substantially faster as well. - * We want the ability to take advantage of these new hashes as soon as - * they become available. - * - * 3. If someone develops hardware that can compute a strong hash quickly, - * we want the ability to take advantage of that hardware. - * - * Of course, we don't want a checksum upgrade to invalidate existing - * data, so we store the checksum *function* in eight bits of the bp. - * This gives us room for up to 256 different checksum functions. - * - * When writing a block, we always checksum it with the latest-and-greatest - * checksum function of the appropriate strength. 
When reading a block, - * we compare the expected checksum against the actual checksum, which we - * compute via the checksum function specified by BP_GET_CHECKSUM(bp). - * - * SALTED CHECKSUMS - * - * To enable the use of less secure hash algorithms with dedup, we - * introduce the notion of salted checksums (MACs, really). A salted - * checksum is fed both a random 256-bit value (the salt) and the data - * to be checksummed. This salt is kept secret (stored on the pool, but - * never shown to the user). Thus even if an attacker knew of collision - * weaknesses in the hash algorithm, they won't be able to mount a known - * plaintext attack on the DDT, since the actual hash value cannot be - * known ahead of time. How the salt is used is algorithm-specific - * (some might simply prefix it to the data block, others might need to - * utilize a full-blown HMAC). On disk the salt is stored in a ZAP - * object in the MOS (DMU_POOL_CHECKSUM_SALT). - * - * CONTEXT TEMPLATES - * - * Some hashing algorithms need to perform a substantial amount of - * initialization work (e.g. salted checksums above may need to pre-hash - * the salt) before being able to process data. Performing this - * redundant work for each block would be wasteful, so we instead allow - * a checksum algorithm to do the work once (the first time it's used) - * and then keep this pre-initialized context as a template inside the - * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains - * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to - * construct and destruct the pre-initialized checksum context. The - * pre-initialized context is then reused during each checksum - * invocation and passed to the checksum function. - */ - -/*ARGSUSED*/ -static void -abd_checksum_off(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); -} - -/*ARGSUSED*/ -void -abd_fletcher_2_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) abd_iterate_func(abd, 0, size, - fletcher_2_incremental_native, zcp); -} - -/*ARGSUSED*/ -void -abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) abd_iterate_func(abd, 0, size, - fletcher_2_incremental_byteswap, zcp); -} - -/*ARGSUSED*/ -void -abd_fletcher_4_native(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) abd_iterate_func(abd, 0, size, - fletcher_4_incremental_native, zcp); -} - -/*ARGSUSED*/ -void -abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) -{ - fletcher_init(zcp); - (void) abd_iterate_func(abd, 0, size, - fletcher_4_incremental_byteswap, zcp); -} - -zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { - {{NULL, NULL}, NULL, NULL, 0, "inherit"}, - {{NULL, NULL}, NULL, NULL, 0, "on"}, - {{abd_checksum_off, abd_checksum_off}, - NULL, NULL, 0, "off"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, - "label"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, - "gang_header"}, - {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, - {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, - NULL, NULL, 0, "fletcher2"}, - {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, - NULL, NULL, 
ZCHECKSUM_FLAG_METADATA, "fletcher4"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | - ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, - {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, - {{abd_checksum_off, abd_checksum_off}, - NULL, NULL, 0, "noparity"}, - {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, - NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | - ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, - {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, - abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, - ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | - ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, -#ifdef illumos - {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, - abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, - ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | - ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, -#endif -}; - -/* - * The flag corresponding to the "verify" in dedup=[checksum,]verify - * must be cleared first, so callers should use ZIO_CHECKSUM_MASK. - */ -spa_feature_t -zio_checksum_to_feature(enum zio_checksum cksum) -{ - VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); - - switch (cksum) { - case ZIO_CHECKSUM_SHA512: - return (SPA_FEATURE_SHA512); - case ZIO_CHECKSUM_SKEIN: - return (SPA_FEATURE_SKEIN); -#ifdef illumos - case ZIO_CHECKSUM_EDONR: - return (SPA_FEATURE_EDONR); -#endif - } - return (SPA_FEATURE_NONE); -} - -enum zio_checksum -zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) -{ - ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); - - if (child == ZIO_CHECKSUM_INHERIT) - return (parent); - - if (child == ZIO_CHECKSUM_ON) - return (ZIO_CHECKSUM_ON_VALUE); - - return (child); -} - -enum zio_checksum -zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, - enum zio_checksum parent) -{ - ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); - ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); - - if (child == ZIO_CHECKSUM_INHERIT) - return (parent); - - if (child == ZIO_CHECKSUM_ON) - return (spa_dedup_checksum(spa)); - - if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) - return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); - - ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags & - ZCHECKSUM_FLAG_DEDUP) || - (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); - - return (child); -} - -/* - * Set the external verifier for a gang block based on , - * a tuple which is guaranteed to be unique for the life of the pool. - */ -static void -zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) -{ - dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = BP_PHYSICAL_BIRTH(bp); - - ASSERT(BP_IS_GANG(bp)); - - ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); -} - -/* - * Set the external verifier for a label block based on its offset. - * The vdev is implicit, and the txg is unknowable at pool open time -- - * hence the logic in vdev_uberblock_load() to find the most recent copy. - */ -static void -zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) -{ - ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); -} - -/* - * Calls the template init function of a checksum which supports context - * templates and installs the template into the spa_t. 
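The selection helpers above implement ordinary property inheritance: "inherit" resolves to the parent's value and "on" resolves to the current default. A minimal sketch with illustrative enum values, not the in-tree zio_checksum enum:

    /* Illustrative stand-ins for the checksum property values. */
    enum cksum_prop { CK_INHERIT, CK_ON, CK_OFF, CK_FLETCHER4, CK_SHA256 };

    #define	CK_ON_VALUE	CK_FLETCHER4	/* what "on" currently means */

    static enum cksum_prop
    cksum_select(enum cksum_prop child, enum cksum_prop parent)
    {
    	if (child == CK_INHERIT)
    		return (parent);
    	if (child == CK_ON)
    		return (CK_ON_VALUE);
    	return (child);
    }

The dedup variant above follows the same shape but maps "on" to the pool's dedup checksum and additionally carries the verify flag through.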
- */ -static void -zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) -{ - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - - if (ci->ci_tmpl_init == NULL) - return; - if (spa->spa_cksum_tmpls[checksum] != NULL) - return; - - VERIFY(ci->ci_tmpl_free != NULL); - mutex_enter(&spa->spa_cksum_tmpls_lock); - if (spa->spa_cksum_tmpls[checksum] == NULL) { - spa->spa_cksum_tmpls[checksum] = - ci->ci_tmpl_init(&spa->spa_cksum_salt); - VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); - } - mutex_exit(&spa->spa_cksum_tmpls_lock); -} - -/* - * Generate the checksum. - */ -void -zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - abd_t *abd, uint64_t size) -{ - blkptr_t *bp = zio->io_bp; - uint64_t offset = zio->io_offset; - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t cksum; - spa_t *spa = zio->io_spa; - - ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); - ASSERT(ci->ci_func[0] != NULL); - - zio_checksum_template_init(checksum, spa); - - if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { - zio_eck_t *eck; - void *data = abd_to_buf(abd); - - if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; - - size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, - uint64_t); - eck = &zilc->zc_eck; - } else { - eck = (zio_eck_t *)((char *)data + size) - 1; - } - if (checksum == ZIO_CHECKSUM_GANG_HEADER) - zio_checksum_gang_verifier(&eck->zec_cksum, bp); - else if (checksum == ZIO_CHECKSUM_LABEL) - zio_checksum_label_verifier(&eck->zec_cksum, offset); - else - bp->blk_cksum = eck->zec_cksum; - eck->zec_magic = ZEC_MAGIC; - ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], - &cksum); - eck->zec_cksum = cksum; - } else { - ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], - &bp->blk_cksum); - } -} - -int -zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, - abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) -{ - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t actual_cksum, expected_cksum; - int byteswap; - - if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) - return (SET_ERROR(EINVAL)); - - zio_checksum_template_init(checksum, spa); - - if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { - zio_eck_t *eck; - zio_cksum_t verifier; - uint64_t data_size = size; - void *data = abd_borrow_buf_copy(abd, data_size); - - if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; - uint64_t nused; - - eck = &zilc->zc_eck; - if (eck->zec_magic == ZEC_MAGIC) { - nused = zilc->zc_nused; - } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { - nused = BSWAP_64(zilc->zc_nused); - } else { - abd_return_buf(abd, data, data_size); - return (SET_ERROR(ECKSUM)); - } - - if (nused > data_size) { - abd_return_buf(abd, data, data_size); - return (SET_ERROR(ECKSUM)); - } - - size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); - } else { - eck = (zio_eck_t *)((char *)data + data_size) - 1; - } - - if (checksum == ZIO_CHECKSUM_GANG_HEADER) - zio_checksum_gang_verifier(&verifier, bp); - else if (checksum == ZIO_CHECKSUM_LABEL) - zio_checksum_label_verifier(&verifier, offset); - else - verifier = bp->blk_cksum; - - byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); - - if (byteswap) - byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); - - size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; - expected_cksum = eck->zec_cksum; - eck->zec_cksum = verifier; - abd_return_buf_copy(abd, data, data_size); - - ci->ci_func[byteswap](abd, size, - 
spa->spa_cksum_tmpls[checksum], &actual_cksum); - abd_copy_from_buf_off(abd, &expected_cksum, - eck_offset, sizeof (zio_cksum_t)); - - if (byteswap) { - byteswap_uint64_array(&expected_cksum, - sizeof (zio_cksum_t)); - } - } else { - byteswap = BP_SHOULD_BYTESWAP(bp); - expected_cksum = bp->blk_cksum; - ci->ci_func[byteswap](abd, size, - spa->spa_cksum_tmpls[checksum], &actual_cksum); - } - - if (info != NULL) { - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; - info->zbc_checksum_name = ci->ci_name; - info->zbc_byteswapped = byteswap; - info->zbc_injected = 0; - info->zbc_has_cksum = 1; - } - - if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) - return (SET_ERROR(ECKSUM)); - - return (0); -} - -int -zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) -{ - blkptr_t *bp = zio->io_bp; - uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); - uint64_t offset = zio->io_offset; - abd_t *data = zio->io_abd; - spa_t *spa = zio->io_spa; - - error = zio_checksum_error_impl(spa, bp, checksum, data, size, - offset, info); - - if (zio_injection_enabled && error == 0 && zio->io_error == 0) { - error = zio_handle_fault_injection(zio, ECKSUM); - if (error != 0) - info->zbc_injected = 1; - } - - return (error); -} - -/* - * Called by a spa_t that's about to be deallocated. This steps through - * all of the checksum context templates and deallocates any that were - * initialized using the algorithm-specific template init function. - */ -void -zio_checksum_templates_free(spa_t *spa) -{ - for (enum zio_checksum checksum = 0; - checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { - if (spa->spa_cksum_tmpls[checksum] != NULL) { - zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - - VERIFY(ci->ci_tmpl_free != NULL); - ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); - spa->spa_cksum_tmpls[checksum] = NULL; - } - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, 2018 by Delphix. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -typedef struct zcomp_stats { - kstat_named_t zcompstat_attempts; - kstat_named_t zcompstat_empty; - kstat_named_t zcompstat_skipped_insufficient_gain; -} zcomp_stats_t; - -static zcomp_stats_t zcomp_stats = { - { "attempts", KSTAT_DATA_UINT64 }, - { "empty", KSTAT_DATA_UINT64 }, - { "skipped_insufficient_gain", KSTAT_DATA_UINT64 } -}; - -#define ZCOMPSTAT_INCR(stat, val) \ - atomic_add_64(&zcomp_stats.stat.value.ui64, (val)); - -#define ZCOMPSTAT_BUMP(stat) ZCOMPSTAT_INCR(stat, 1); - -kstat_t *zcomp_ksp; - -/* - * If nonzero, every 1/X decompression attempts will fail, simulating - * an undetected memory error. - */ -uint64_t zio_decompress_fail_fraction = 0; - -/* - * Compression vectors. - */ -zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {"inherit", 0, NULL, NULL}, - {"on", 0, NULL, NULL}, - {"uncompressed", 0, NULL, NULL}, - {"lzjb", 0, lzjb_compress, lzjb_decompress}, - {"empty", 0, NULL, NULL}, - {"gzip-1", 1, gzip_compress, gzip_decompress}, - {"gzip-2", 2, gzip_compress, gzip_decompress}, - {"gzip-3", 3, gzip_compress, gzip_decompress}, - {"gzip-4", 4, gzip_compress, gzip_decompress}, - {"gzip-5", 5, gzip_compress, gzip_decompress}, - {"gzip-6", 6, gzip_compress, gzip_decompress}, - {"gzip-7", 7, gzip_compress, gzip_decompress}, - {"gzip-8", 8, gzip_compress, gzip_decompress}, - {"gzip-9", 9, gzip_compress, gzip_decompress}, - {"zle", 64, zle_compress, zle_decompress}, - {"lz4", 0, lz4_compress, lz4_decompress} -}; - -enum zio_compress -zio_compress_select(spa_t *spa, enum zio_compress child, - enum zio_compress parent) -{ - enum zio_compress result; - - ASSERT(child < ZIO_COMPRESS_FUNCTIONS); - ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); - ASSERT(parent != ZIO_COMPRESS_INHERIT); - - result = child; - if (result == ZIO_COMPRESS_INHERIT) - result = parent; - - if (result == ZIO_COMPRESS_ON) { - if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS)) - result = ZIO_COMPRESS_LZ4_ON_VALUE; - else - result = ZIO_COMPRESS_LEGACY_ON_VALUE; - } - - return (result); -} - -/*ARGSUSED*/ -static int -zio_compress_zeroed_cb(void *data, size_t len, void *private) -{ - uint64_t *end = (uint64_t *)((char *)data + len); - for (uint64_t *word = (uint64_t *)data; word < end; word++) - if (*word != 0) - return (1); - - return (0); -} - -size_t -zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) -{ - size_t c_len, d_len; - zio_compress_info_t *ci = &zio_compress_table[c]; - - ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); - ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); - - ZCOMPSTAT_BUMP(zcompstat_attempts); - - /* - * If the data is all zeroes, we don't even need to allocate - * a block for it. We indicate this by returning zero size. 
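A plain-buffer version of the two checks that follow: the all-zero scan that lets a block be recorded as a hole, and the minimum-gain target that requires compression to save at least one eighth (12.5%) of the logical size. The in-tree scan walks the ABD in chunks via abd_iterate_func() rather than a flat buffer:

    #include <stdint.h>
    #include <stddef.h>

    /* Return 1 if any 64-bit word in the buffer is nonzero (len is word-aligned). */
    static int
    buf_has_data(const void *data, size_t len)
    {
    	const uint64_t *word = data;
    	const uint64_t *end = (const uint64_t *)((const char *)data + len);

    	for (; word < end; word++)
    		if (*word != 0)
    			return (1);
    	return (0);
    }

    /*
     * Target output size: compressed output is only kept if it is no larger
     * than s_len - s_len/8, i.e. it saves at least one eighth of the input.
     */
    static size_t
    compress_target(size_t s_len)
    {
    	return (s_len - (s_len >> 3));
    }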
- */ - if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) { - ZCOMPSTAT_BUMP(zcompstat_empty); - return (0); - } - - if (c == ZIO_COMPRESS_EMPTY) - return (s_len); - - /* Compress at least 12.5% */ - d_len = s_len - (s_len >> 3); - - /* No compression algorithms can read from ABDs directly */ - void *tmp = abd_borrow_buf_copy(src, s_len); - c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level); - abd_return_buf(src, tmp, s_len); - - if (c_len > d_len) { - ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain); - return (s_len); - } - - ASSERT3U(c_len, <=, d_len); - return (c_len); -} - -int -zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, - size_t s_len, size_t d_len) -{ - zio_compress_info_t *ci = &zio_compress_table[c]; - if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) - return (SET_ERROR(EINVAL)); - - return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); -} - -int -zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len) -{ - void *tmp = abd_borrow_buf_copy(src, s_len); - int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); - abd_return_buf(src, tmp, s_len); - - /* - * Decompression shouldn't fail, because we've already verifyied - * the checksum. However, for extra protection (e.g. against bitflips - * in non-ECC RAM), we handle this error (and test it). - */ - ASSERT0(ret); - if (zio_decompress_fail_fraction != 0 && - spa_get_random(zio_decompress_fail_fraction) == 0) - ret = SET_ERROR(EINVAL); - - return (ret); -} - -void -zio_compress_init(void) -{ - - zcomp_ksp = kstat_create("zfs", 0, "zcompstats", "misc", - KSTAT_TYPE_NAMED, sizeof (zcomp_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (zcomp_ksp != NULL) { - zcomp_ksp->ks_data = &zcomp_stats; - kstat_install(zcomp_ksp); - } -} - -void -zio_compress_fini(void) -{ - if (zcomp_ksp != NULL) { - kstat_delete(zcomp_ksp); - zcomp_ksp = NULL; - } -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c +++ /dev/null @@ -1,755 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. - */ - -/* - * ZFS fault injection - * - * To handle fault injection, we keep track of a series of zinject_record_t - * structures which describe which logical block(s) should be injected with a - * fault. These are kept in a global list. 
Each record corresponds to a given - * spa_t and maintains a special hold on the spa_t so that it cannot be deleted - * or exported while the injection record exists. - * - * Device level injection is done using the 'zi_guid' field. If this is set, it - * means that the error is destined for a particular device, not a piece of - * data. - * - * This is a rather poor data structure and algorithm, but we don't expect more - * than a few faults at any one time, so it should be sufficient for our needs. - */ - -#include -#include -#include -#include -#include -#include - -uint32_t zio_injection_enabled; - -/* - * Data describing each zinject handler registered on the system, and - * contains the list node linking the handler in the global zinject - * handler list. - */ -typedef struct inject_handler { - int zi_id; - spa_t *zi_spa; - zinject_record_t zi_record; - uint64_t *zi_lanes; - int zi_next_lane; - list_node_t zi_link; -} inject_handler_t; - -/* - * List of all zinject handlers registered on the system, protected by - * the inject_lock defined below. - */ -static list_t inject_handlers; - -/* - * This protects insertion into, and traversal of, the inject handler - * list defined above; as well as the inject_delay_count. Any time a - * handler is inserted or removed from the list, this lock should be - * taken as a RW_WRITER; and any time traversal is done over the list - * (without modification to it) this lock should be taken as a RW_READER. - */ -static krwlock_t inject_lock; - -/* - * This holds the number of zinject delay handlers that have been - * registered on the system. It is protected by the inject_lock defined - * above. Thus modifications to this count must be a RW_WRITER of the - * inject_lock, and reads of this count must be (at least) a RW_READER - * of the lock. - */ -static int inject_delay_count = 0; - -/* - * This lock is used only in zio_handle_io_delay(), refer to the comment - * in that function for more details. - */ -static kmutex_t inject_delay_mtx; - -/* - * Used to assign unique identifying numbers to each new zinject handler. - */ -static int inject_next_id = 1; - -/* - * Returns true if the given record matches the I/O in progress. - */ -static boolean_t -zio_match_handler(zbookmark_phys_t *zb, uint64_t type, - zinject_record_t *record, int error) -{ - /* - * Check for a match against the MOS, which is based on type - */ - if (zb->zb_objset == DMU_META_OBJSET && - record->zi_objset == DMU_META_OBJSET && - record->zi_object == DMU_META_DNODE_OBJECT) { - if (record->zi_type == DMU_OT_NONE || - type == record->zi_type) - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); - else - return (B_FALSE); - } - - /* - * Check for an exact match. - */ - if (zb->zb_objset == record->zi_objset && - zb->zb_object == record->zi_object && - zb->zb_level == record->zi_level && - zb->zb_blkid >= record->zi_start && - zb->zb_blkid <= record->zi_end && - error == record->zi_error) - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); - - return (B_FALSE); -} - -/* - * Panic the system when a config change happens in the function - * specified by tag. 
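The matching logic above reduces to an exact bookmark comparison plus an optional firing frequency. A standalone sketch with illustrative names, using rand() where the in-tree code uses spa_get_random():

    #include <stdlib.h>
    #include <stdint.h>

    typedef struct inj_record {
    	uint64_t ir_objset, ir_object;
    	int64_t	 ir_level;
    	uint64_t ir_start, ir_end;	/* inclusive blkid range */
    	int	 ir_error;		/* errno to inject */
    	uint32_t ir_freq;		/* 0 = always, else percent chance */
    } inj_record_t;

    /* Decide whether this record fires for the given logical block. */
    static int
    record_matches(const inj_record_t *r, uint64_t objset, uint64_t object,
        int64_t level, uint64_t blkid, int error)
    {
    	if (objset != r->ir_objset || object != r->ir_object ||
    	    level != r->ir_level || blkid < r->ir_start ||
    	    blkid > r->ir_end || error != r->ir_error)
    		return (0);
    	/* zi_freq-style throttling: fire only a percentage of the time. */
    	return (r->ir_freq == 0 || (rand() % 100) < (int)r->ir_freq);
    }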
- */ -void -zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) -{ - inject_handler_t *handler; - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - if (spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_type == type && - strcmp(tag, handler->zi_record.zi_func) == 0) - panic("Panic requested in function %s\n", tag); - } - - rw_exit(&inject_lock); -} - -/* - * Determine if the I/O in question should return failure. Returns the errno - * to be returned to the caller. - */ -int -zio_handle_fault_injection(zio_t *zio, int error) -{ - int ret = 0; - inject_handler_t *handler; - - /* - * Ignore I/O not associated with any logical data. - */ - if (zio->io_logical == NULL) - return (0); - - /* - * Currently, we only support fault injection on reads. - */ - if (zio->io_type != ZIO_TYPE_READ) - return (0); - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - if (zio->io_spa != handler->zi_spa || - handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) - continue; - - /* If this handler matches, return EIO */ - if (zio_match_handler(&zio->io_logical->io_bookmark, - zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, - &handler->zi_record, error)) { - ret = error; - break; - } - } - - rw_exit(&inject_lock); - - return (ret); -} - -/* - * Determine if the zio is part of a label update and has an injection - * handler associated with that portion of the label. Currently, we - * allow error injection in either the nvlist or the uberblock region of - * of the vdev label. - */ -int -zio_handle_label_injection(zio_t *zio, int error) -{ - inject_handler_t *handler; - vdev_t *vd = zio->io_vd; - uint64_t offset = zio->io_offset; - int label; - int ret = 0; - - if (offset >= VDEV_LABEL_START_SIZE && - offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) - return (0); - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - uint64_t start = handler->zi_record.zi_start; - uint64_t end = handler->zi_record.zi_end; - - if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) - continue; - - /* - * The injection region is the relative offsets within a - * vdev label. We must determine the label which is being - * updated and adjust our region accordingly. - */ - label = vdev_label_number(vd->vdev_psize, offset); - start = vdev_label_offset(vd->vdev_psize, label, start); - end = vdev_label_offset(vd->vdev_psize, label, end); - - if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && - (offset >= start && offset <= end)) { - ret = error; - break; - } - } - rw_exit(&inject_lock); - return (ret); -} - - -int -zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) -{ - inject_handler_t *handler; - int ret = 0; - - /* - * We skip over faults in the labels unless it's during - * device open (i.e. zio == NULL). 
- */ - if (zio != NULL) { - uint64_t offset = zio->io_offset; - - if (offset < VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) - return (0); - } - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) - continue; - - if (vd->vdev_guid == handler->zi_record.zi_guid) { - if (handler->zi_record.zi_failfast && - (zio == NULL || (zio->io_flags & - (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { - continue; - } - - /* Handle type specific I/O failures */ - if (zio != NULL && - handler->zi_record.zi_iotype != ZIO_TYPES && - handler->zi_record.zi_iotype != zio->io_type) - continue; - - if (handler->zi_record.zi_error == error) { - /* - * For a failed open, pretend like the device - * has gone away. - */ - if (error == ENXIO) - vd->vdev_stat.vs_aux = - VDEV_AUX_OPEN_FAILED; - - /* - * Treat these errors as if they had been - * retried so that all the appropriate stats - * and FMA events are generated. - */ - if (!handler->zi_record.zi_failfast && - zio != NULL) - zio->io_flags |= ZIO_FLAG_IO_RETRY; - - ret = error; - break; - } - if (handler->zi_record.zi_error == ENXIO) { - ret = SET_ERROR(EIO); - break; - } - } - } - - rw_exit(&inject_lock); - - return (ret); -} - -/* - * Simulate hardware that ignores cache flushes. For requested number - * of seconds nix the actual writing to disk. - */ -void -zio_handle_ignored_writes(zio_t *zio) -{ - inject_handler_t *handler; - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa || - handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) - continue; - - /* - * Positive duration implies # of seconds, negative - * a number of txgs - */ - if (handler->zi_record.zi_timer == 0) { - if (handler->zi_record.zi_duration > 0) - handler->zi_record.zi_timer = ddi_get_lbolt64(); - else - handler->zi_record.zi_timer = zio->io_txg; - } - - /* Have a "problem" writing 60% of the time */ - if (spa_get_random(100) < 60) - zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; - break; - } - - rw_exit(&inject_lock); -} - -void -spa_handle_ignored_writes(spa_t *spa) -{ - inject_handler_t *handler; - - if (zio_injection_enabled == 0) - return; - - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { - - if (spa != handler->zi_spa || - handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) - continue; - - if (handler->zi_record.zi_duration > 0) { - VERIFY(handler->zi_record.zi_timer == 0 || - handler->zi_record.zi_timer + - handler->zi_record.zi_duration * hz > - ddi_get_lbolt64()); - } else { - /* duration is negative so the subtraction here adds */ - VERIFY(handler->zi_record.zi_timer == 0 || - handler->zi_record.zi_timer - - handler->zi_record.zi_duration >= - spa_syncing_txg(spa)); - } - } - - rw_exit(&inject_lock); -} - -hrtime_t -zio_handle_io_delay(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - inject_handler_t *min_handler = NULL; - hrtime_t min_target = 0; - - rw_enter(&inject_lock, RW_READER); - - /* - * inject_delay_count is a subset of zio_injection_enabled that - * is only incremented for delay handlers. 
These checks are - * mainly added to remind the reader why we're not explicitly - * checking zio_injection_enabled like the other functions. - */ - IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); - IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); - - /* - * If there aren't any inject delay handlers registered, then we - * can short circuit and simply return 0 here. A value of zero - * informs zio_delay_interrupt() that this request should not be - * delayed. This short circuit keeps us from acquiring the - * inject_delay_mutex unnecessarily. - */ - if (inject_delay_count == 0) { - rw_exit(&inject_lock); - return (0); - } - - /* - * Each inject handler has a number of "lanes" associated with - * it. Each lane is able to handle requests independently of one - * another, and at a latency defined by the inject handler - * record's zi_timer field. Thus if a handler in configured with - * a single lane with a 10ms latency, it will delay requests - * such that only a single request is completed every 10ms. So, - * if more than one request is attempted per each 10ms interval, - * the average latency of the requests will be greater than - * 10ms; but if only a single request is submitted each 10ms - * interval the average latency will be 10ms. - * - * We need to acquire this mutex to prevent multiple concurrent - * threads being assigned to the same lane of a given inject - * handler. The mutex allows us to perform the following two - * operations atomically: - * - * 1. determine the minimum handler and minimum target - * value of all the possible handlers - * 2. update that minimum handler's lane array - * - * Without atomicity, two (or more) threads could pick the same - * lane in step (1), and then conflict with each other in step - * (2). This could allow a single lane handler to process - * multiple requests simultaneously, which shouldn't be possible. - */ - mutex_enter(&inject_delay_mtx); - - for (inject_handler_t *handler = list_head(&inject_handlers); - handler != NULL; handler = list_next(&inject_handlers, handler)) { - if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) - continue; - - if (vd->vdev_guid != handler->zi_record.zi_guid) - continue; - - /* - * Defensive; should never happen as the array allocation - * occurs prior to inserting this handler on the list. - */ - ASSERT3P(handler->zi_lanes, !=, NULL); - - /* - * This should never happen, the zinject command should - * prevent a user from setting an IO delay with zero lanes. - */ - ASSERT3U(handler->zi_record.zi_nlanes, !=, 0); - - ASSERT3U(handler->zi_record.zi_nlanes, >, - handler->zi_next_lane); - - /* - * We want to issue this IO to the lane that will become - * idle the soonest, so we compare the soonest this - * specific handler can complete the IO with all other - * handlers, to find the lowest value of all possible - * lanes. We then use this lane to submit the request. - * - * Since each handler has a constant value for its - * delay, we can just use the "next" lane for that - * handler; as it will always be the lane with the - * lowest value for that particular handler (i.e. the - * lane that will become idle the soonest). This saves a - * scan of each handler's lanes array. - * - * There's two cases to consider when determining when - * this specific IO request should complete. If this - * lane is idle, we want to "submit" the request now so - * it will complete after zi_timer milliseconds. Thus, - * we set the target to now + zi_timer. 
- * - * If the lane is busy, we want this request to complete - * zi_timer milliseconds after the lane becomes idle. - * Since the 'zi_lanes' array holds the time at which - * each lane will become idle, we use that value to - * determine when this request should complete. - */ - hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); - hrtime_t busy = handler->zi_record.zi_timer + - handler->zi_lanes[handler->zi_next_lane]; - hrtime_t target = MAX(idle, busy); - - if (min_handler == NULL) { - min_handler = handler; - min_target = target; - continue; - } - - ASSERT3P(min_handler, !=, NULL); - ASSERT3U(min_target, !=, 0); - - /* - * We don't yet increment the "next lane" variable since - * we still might find a lower value lane in another - * handler during any remaining iterations. Once we're - * sure we've selected the absolute minimum, we'll claim - * the lane and increment the handler's "next lane" - * field below. - */ - - if (target < min_target) { - min_handler = handler; - min_target = target; - } - } - - /* - * 'min_handler' will be NULL if no IO delays are registered for - * this vdev, otherwise it will point to the handler containing - * the lane that will become idle the soonest. - */ - if (min_handler != NULL) { - ASSERT3U(min_target, !=, 0); - min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; - - /* - * If we've used all possible lanes for this handler, - * loop back and start using the first lane again; - * otherwise, just increment the lane index. - */ - min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % - min_handler->zi_record.zi_nlanes; - } - - mutex_exit(&inject_delay_mtx); - rw_exit(&inject_lock); - - return (min_target); -} - -/* - * Create a new handler for the given record. We add it to the list, adding - * a reference to the spa_t in the process. We increment zio_injection_enabled, - * which is the switch to trigger all fault injection. - */ -int -zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) -{ - inject_handler_t *handler; - int error; - spa_t *spa; - - /* - * If this is pool-wide metadata, make sure we unload the corresponding - * spa_t, so that the next attempt to load it will trigger the fault. - * We call spa_reset() to unload the pool appropriately. - */ - if (flags & ZINJECT_UNLOAD_SPA) - if ((error = spa_reset(name)) != 0) - return (error); - - if (record->zi_cmd == ZINJECT_DELAY_IO) { - /* - * A value of zero for the number of lanes or for the - * delay time doesn't make sense. - */ - if (record->zi_timer == 0 || record->zi_nlanes == 0) - return (SET_ERROR(EINVAL)); - - /* - * The number of lanes is directly mapped to the size of - * an array used by the handler. Thus, to ensure the - * user doesn't trigger an allocation that's "too large" - * we cap the number of lanes here. - */ - if (record->zi_nlanes >= UINT16_MAX) - return (SET_ERROR(EINVAL)); - } - - if (!(flags & ZINJECT_NULL)) { - /* - * spa_inject_ref() will add an injection reference, which will - * prevent the pool from being removed from the namespace while - * still allowing it to be unloaded. 
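The lane bookkeeping described above comes down to a few lines: an idle lane completes the request one delay from now, a busy lane completes it one delay after the lane next frees up, and the chosen lane is then claimed until that target time. A standalone sketch for a single handler; the in-tree code additionally picks the minimum target across all matching handlers before claiming a lane:

    #include <stdint.h>

    typedef int64_t hrtime_t;	/* nanoseconds, in the style of gethrtime() */

    #define	MAX(a, b)	((a) > (b) ? (a) : (b))

    /*
     * One delay handler with a fixed per-request latency and a set of lanes.
     * lanes[i] holds the time at which lane i becomes idle again.
     */
    typedef struct delay_handler {
    	hrtime_t	dh_delay;	/* injected latency per request */
    	hrtime_t	*dh_lanes;
    	unsigned	dh_nlanes;
    	unsigned	dh_next_lane;
    } delay_handler_t;

    /*
     * Compute when a new request on this handler should complete and claim
     * the lane: an idle lane finishes 'delay' from now, a busy lane finishes
     * 'delay' after it next becomes idle.
     */
    static hrtime_t
    delay_target(delay_handler_t *dh, hrtime_t now)
    {
    	hrtime_t idle = now + dh->dh_delay;
    	hrtime_t busy = dh->dh_lanes[dh->dh_next_lane] + dh->dh_delay;
    	hrtime_t target = MAX(idle, busy);

    	dh->dh_lanes[dh->dh_next_lane] = target;
    	dh->dh_next_lane = (dh->dh_next_lane + 1) % dh->dh_nlanes;
    	return (target);
    }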
- */ - if ((spa = spa_inject_addref(name)) == NULL) - return (SET_ERROR(ENOENT)); - - handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - - handler->zi_spa = spa; - handler->zi_record = *record; - - if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { - handler->zi_lanes = kmem_zalloc( - sizeof (*handler->zi_lanes) * - handler->zi_record.zi_nlanes, KM_SLEEP); - handler->zi_next_lane = 0; - } else { - handler->zi_lanes = NULL; - handler->zi_next_lane = 0; - } - - rw_enter(&inject_lock, RW_WRITER); - - /* - * We can't move this increment into the conditional - * above because we need to hold the RW_WRITER lock of - * inject_lock, and we don't want to hold that while - * allocating the handler's zi_lanes array. - */ - if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { - ASSERT3S(inject_delay_count, >=, 0); - inject_delay_count++; - ASSERT3S(inject_delay_count, >, 0); - } - - *id = handler->zi_id = inject_next_id++; - list_insert_tail(&inject_handlers, handler); - atomic_inc_32(&zio_injection_enabled); - - rw_exit(&inject_lock); - } - - /* - * Flush the ARC, so that any attempts to read this data will end up - * going to the ZIO layer. Note that this is a little overkill, but - * we don't have the necessary ARC interfaces to do anything else, and - * fault injection isn't a performance critical path. - */ - if (flags & ZINJECT_FLUSH_ARC) - /* - * We must use FALSE to ensure arc_flush returns, since - * we're not preventing concurrent ARC insertions. - */ - arc_flush(NULL, FALSE); - - return (0); -} - -/* - * Returns the next record with an ID greater than that supplied to the - * function. Used to iterate over all handlers in the system. - */ -int -zio_inject_list_next(int *id, char *name, size_t buflen, - zinject_record_t *record) -{ - inject_handler_t *handler; - int ret; - - mutex_enter(&spa_namespace_lock); - rw_enter(&inject_lock, RW_READER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) - if (handler->zi_id > *id) - break; - - if (handler) { - *record = handler->zi_record; - *id = handler->zi_id; - (void) strncpy(name, spa_name(handler->zi_spa), buflen); - ret = 0; - } else { - ret = SET_ERROR(ENOENT); - } - - rw_exit(&inject_lock); - mutex_exit(&spa_namespace_lock); - - return (ret); -} - -/* - * Clear the fault handler with the given identifier, or return ENOENT if none - * exists. 
- */ -int -zio_clear_fault(int id) -{ - inject_handler_t *handler; - - rw_enter(&inject_lock, RW_WRITER); - - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) - if (handler->zi_id == id) - break; - - if (handler == NULL) { - rw_exit(&inject_lock); - return (SET_ERROR(ENOENT)); - } - - if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { - ASSERT3S(inject_delay_count, >, 0); - inject_delay_count--; - ASSERT3S(inject_delay_count, >=, 0); - } - - list_remove(&inject_handlers, handler); - rw_exit(&inject_lock); - - if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { - ASSERT3P(handler->zi_lanes, !=, NULL); - kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * - handler->zi_record.zi_nlanes); - } else { - ASSERT3P(handler->zi_lanes, ==, NULL); - } - - spa_inject_delref(handler->zi_spa); - kmem_free(handler, sizeof (inject_handler_t)); - atomic_dec_32(&zio_injection_enabled); - - return (0); -} - -void -zio_inject_init(void) -{ - rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&inject_handlers, sizeof (inject_handler_t), - offsetof(inject_handler_t, zi_link)); -} - -void -zio_inject_fini(void) -{ - list_destroy(&inject_handlers); - mutex_destroy(&inject_delay_mtx); - rw_destroy(&inject_lock); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Zero-length encoding. This is a fast and simple algorithm to eliminate - * runs of zeroes. Each chunk of compressed data begins with a length byte, b. - * If b < n (where n is the compression parameter) then the next b + 1 bytes - * are literal values. If b >= n then the next (256 - b + 1) bytes are zero. - */ -#include -#include - -size_t -zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *s_end = src + s_len; - uchar_t *d_end = dst + d_len; - - while (src < s_end && dst < d_end - 1) { - uchar_t *first = src; - uchar_t *len = dst++; - if (src[0] == 0) { - uchar_t *last = src + (256 - n); - while (src < MIN(last, s_end) && src[0] == 0) - src++; - *len = src - first - 1 + n; - } else { - uchar_t *last = src + n; - if (d_end - dst < n) - break; - while (src < MIN(last, s_end) - 1 && (src[0] | src[1])) - *dst++ = *src++; - if (src[0]) - *dst++ = *src++; - *len = src - first - 1; - } - } - return (src == s_end ? 
dst - (uchar_t *)d_start : s_len); -} - -int -zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -{ - uchar_t *src = s_start; - uchar_t *dst = d_start; - uchar_t *s_end = src + s_len; - uchar_t *d_end = dst + d_len; - - while (src < s_end && dst < d_end) { - int len = 1 + *src++; - if (len <= n) { - while (len-- != 0) - *dst++ = *src++; - } else { - len -= n; - while (len-- != 0) - *dst++ = 0; - } - } - return (dst == d_end ? 0 : -1); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2015 by Delphix. All rights reserved. - * Copyright 2016 The MathWorks, Inc. All rights reserved. - */ - -/* - * A Zero Reference Lock (ZRL) is a reference count that can lock out new - * references only when the count is zero and only without waiting if the count - * is not already zero. It is similar to a read-write lock in that it allows - * multiple readers and only a single writer, but it does not allow a writer to - * block while waiting for readers to exit, and therefore the question of - * reader/writer priority is moot (no WRWANT bit). Since the equivalent of - * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it - * is perfectly safe for the same reader to acquire the same lock multiple - * times. The fact that a ZRL is reentrant for readers (through multiple calls - * to zrl_add()) makes it convenient for determining whether something is - * actively referenced without the fuss of flagging lock ownership across - * function calls. - */ -#include - -/* - * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is - * treated as zero references. 
- */ -#define ZRL_LOCKED -1 -#define ZRL_DESTROYED -2 - -void -zrl_init(zrlock_t *zrl) -{ - mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL); - zrl->zr_refcount = 0; - cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL); -#ifdef ZFS_DEBUG - zrl->zr_owner = NULL; - zrl->zr_caller = NULL; -#endif -} - -void -zrl_destroy(zrlock_t *zrl) -{ - ASSERT0(zrl->zr_refcount); - - mutex_destroy(&zrl->zr_mtx); - zrl->zr_refcount = ZRL_DESTROYED; - cv_destroy(&zrl->zr_cv); -} - -void -zrl_add_impl(zrlock_t *zrl, const char *zc) -{ - for (;;) { - uint32_t n = (uint32_t)zrl->zr_refcount; - while (n != ZRL_LOCKED) { - uint32_t cas = atomic_cas_32( - (uint32_t *)&zrl->zr_refcount, n, n + 1); - if (cas == n) { - ASSERT3S((int32_t)n, >=, 0); -#ifdef ZFS_DEBUG - if (zrl->zr_owner == curthread) { - DTRACE_PROBE2(zrlock__reentry, - zrlock_t *, zrl, uint32_t, n); - } - zrl->zr_owner = curthread; - zrl->zr_caller = zc; -#endif - return; - } - n = cas; - } - - mutex_enter(&zrl->zr_mtx); - while (zrl->zr_refcount == ZRL_LOCKED) { - cv_wait(&zrl->zr_cv, &zrl->zr_mtx); - } - mutex_exit(&zrl->zr_mtx); - } -} - -void -zrl_remove(zrlock_t *zrl) -{ - uint32_t n; - -#ifdef ZFS_DEBUG - if (zrl->zr_owner == curthread) { - zrl->zr_owner = NULL; - zrl->zr_caller = NULL; - } -#endif - n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); - ASSERT3S((int32_t)n, >=, 0); -} - -int -zrl_tryenter(zrlock_t *zrl) -{ - uint32_t n = (uint32_t)zrl->zr_refcount; - - if (n == 0) { - uint32_t cas = atomic_cas_32( - (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED); - if (cas == 0) { -#ifdef ZFS_DEBUG - ASSERT3P(zrl->zr_owner, ==, NULL); - zrl->zr_owner = curthread; -#endif - return (1); - } - } - - ASSERT3S((int32_t)n, >, ZRL_DESTROYED); - - return (0); -} - -void -zrl_exit(zrlock_t *zrl) -{ - ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED); - - mutex_enter(&zrl->zr_mtx); -#ifdef ZFS_DEBUG - ASSERT3P(zrl->zr_owner, ==, curthread); - zrl->zr_owner = NULL; - membar_producer(); /* make sure the owner store happens first */ -#endif - zrl->zr_refcount = 0; - cv_broadcast(&zrl->zr_cv); - mutex_exit(&zrl->zr_mtx); -} - -int -zrl_refcount(zrlock_t *zrl) -{ - ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); - - int n = (int)zrl->zr_refcount; - return (n <= 0 ? 0 : n); -} - -int -zrl_is_zero(zrlock_t *zrl) -{ - ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); - - return (zrl->zr_refcount <= 0); -} - -int -zrl_is_locked(zrlock_t *zrl) -{ - ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); - - return (zrl->zr_refcount == ZRL_LOCKED); -} - -#ifdef ZFS_DEBUG -kthread_t * -zrl_owner(zrlock_t *zrl) -{ - return (zrl->zr_owner); -} -#endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c +++ /dev/null @@ -1,431 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2017, 2019 by Delphix. All rights reserved. - */ - -/* - * ZTHR Infrastructure - * =================== - * - * ZTHR threads are used for isolated operations that span multiple txgs - * within a SPA. 
They generally exist from SPA creation/loading and until - * the SPA is exported/destroyed. The ideal requirements for an operation - * to be modeled with a zthr are the following: - * - * 1] The operation needs to run over multiple txgs. - * 2] There is be a single point of reference in memory or on disk that - * indicates whether the operation should run/is running or has - * stopped. - * - * If the operation satisfies the above then the following rules guarantee - * a certain level of correctness: - * - * 1] Any thread EXCEPT the zthr changes the work indicator from stopped - * to running but not the opposite. - * 2] Only the zthr can change the work indicator from running to stopped - * (e.g. when it is done) but not the opposite. - * - * This way a normal zthr cycle should go like this: - * - * 1] An external thread changes the work indicator from stopped to - * running and wakes up the zthr. - * 2] The zthr wakes up, checks the indicator and starts working. - * 3] When the zthr is done, it changes the indicator to stopped, allowing - * a new cycle to start. - * - * Besides being awakened by other threads, a zthr can be configured - * during creation to wakeup on it's own after a specified interval - * [see zthr_create_timer()]. - * - * Note: ZTHR threads are NOT a replacement for generic threads! Please - * ensure that they fit your use-case well before using them. - * - * == ZTHR creation - * - * Every zthr needs three inputs to start running: - * - * 1] A user-defined checker function (checkfunc) that decides whether - * the zthr should start working or go to sleep. The function should - * return TRUE when the zthr needs to work or FALSE to let it sleep, - * and should adhere to the following signature: - * boolean_t checkfunc_name(void *args, zthr_t *t); - * - * 2] A user-defined ZTHR function (func) which the zthr executes when - * it is not sleeping. The function should adhere to the following - * signature type: - * void func_name(void *args, zthr_t *t); - * - * 3] A void args pointer that will be passed to checkfunc and func - * implicitly by the infrastructure. - * - * The reason why the above API needs two different functions, - * instead of one that both checks and does the work, has to do with - * the zthr's internal state lock (zthr_state_lock) and the allowed - * cancellation windows. We want to hold the zthr_state_lock while - * running checkfunc but not while running func. This way the zthr - * can be cancelled while doing work and not while checking for work. - * - * To start a zthr: - * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args); - * or - * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func, - * args, max_sleep); - * - * After that you should be able to wakeup, cancel, and resume the - * zthr from another thread using the zthr_pointer. - * - * NOTE: ZTHR threads could potentially wake up spuriously and the - * user should take this into account when writing a checkfunc. - * [see ZTHR state transitions] - * - * == ZTHR cancellation - * - * ZTHR threads must be cancelled when their SPA is being exported - * or when they need to be paused so they don't interfere with other - * operations. - * - * To cancel a zthr: - * zthr_cancel(zthr_pointer); - * - * To resume it: - * zthr_resume(zthr_pointer); - * - * A zthr will implicitly check if it has received a cancellation - * signal every time func returns and every time it wakes up [see - * ZTHR state transitions below]. - * - * At times, waiting for the zthr's func to finish its job may take - * time. 
This may be very time-consuming for some operations that - * need to cancel the SPA's zthrs (e.g spa_export). For this scenario - * the user can explicitly make their ZTHR function aware of incoming - * cancellation signals using zthr_iscancelled(). A common pattern for - * that looks like this: - * - * int - * func_name(void *args, zthr_t *t) - * { - * ... ... - * while (!work_done && !zthr_iscancelled(t)) { - * ... ... - * } - * } - * - * == ZTHR cleanup - * - * Cancelling a zthr doesn't clean up its metadata (internal locks, - * function pointers to func and checkfunc, etc..). This is because - * we want to keep them around in case we want to resume the execution - * of the zthr later. Similarly for zthrs that exit themselves. - * - * To completely cleanup a zthr, cancel it first to ensure that it - * is not running and then use zthr_destroy(). - * - * == ZTHR state transitions - * - * zthr creation - * + - * | - * | woke up - * | +--------------+ sleep - * | | ^ - * | | | - * | | | FALSE - * | | | - * v v FALSE + - * cancelled? +---------> checkfunc? - * + ^ + - * | | | - * | | | TRUE - * | | | - * | | func returned v - * | +---------------+ func - * | - * | TRUE - * | - * v - * zthr stopped running - * - * == Implementation of ZTHR requests - * - * ZTHR wakeup, cancel, and resume are requests on a zthr to - * change its internal state. Requests on a zthr are serialized - * using the zthr_request_lock, while changes in its internal - * state are protected by the zthr_state_lock. A request will - * first acquire the zthr_request_lock and then immediately - * acquire the zthr_state_lock. We do this so that incoming - * requests are serialized using the request lock, while still - * allowing us to use the state lock for thread communication - * via zthr_cv. - */ - -#include -#include - -struct zthr { - /* running thread doing the work */ - kthread_t *zthr_thread; - - /* lock protecting internal data & invariants */ - kmutex_t zthr_state_lock; - - /* mutex that serializes external requests */ - kmutex_t zthr_request_lock; - - /* notification mechanism for requests */ - kcondvar_t zthr_cv; - - /* flag set to true if we are canceling the zthr */ - boolean_t zthr_cancel; - - /* - * maximum amount of time that the zthr is spent sleeping; - * if this is 0, the thread doesn't wake up until it gets - * signaled. - */ - hrtime_t zthr_wait_time; - - /* consumer-provided callbacks & data */ - zthr_checkfunc_t *zthr_checkfunc; - zthr_func_t *zthr_func; - void *zthr_arg; -}; - -static void -zthr_procedure(void *arg) -{ - zthr_t *t = arg; - - mutex_enter(&t->zthr_state_lock); - ASSERT3P(t->zthr_thread, ==, curthread); - - while (!t->zthr_cancel) { - if (t->zthr_checkfunc(t->zthr_arg, t)) { - mutex_exit(&t->zthr_state_lock); - t->zthr_func(t->zthr_arg, t); - mutex_enter(&t->zthr_state_lock); - } else { - /* go to sleep */ - if (t->zthr_wait_time == 0) { - cv_wait(&t->zthr_cv, &t->zthr_state_lock); - } else { - (void) cv_timedwait_hires(&t->zthr_cv, - &t->zthr_state_lock, t->zthr_wait_time, - MSEC2NSEC(1), 0); - } - } - } - - /* - * Clear out the kernel thread metadata and notify the - * zthr_cancel() thread that we've stopped running. - */ - t->zthr_thread = NULL; - t->zthr_cancel = B_FALSE; - cv_broadcast(&t->zthr_cv); - - mutex_exit(&t->zthr_state_lock); - thread_exit(); -} - -zthr_t * -zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg) -{ - return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0)); -} - -/* - * Create a zthr with specified maximum sleep time. 
If the time - * in sleeping state exceeds max_sleep, a wakeup(do the check and - * start working if required) will be triggered. - */ -zthr_t * -zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func, - void *arg, hrtime_t max_sleep) -{ - zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP); - mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL); - - mutex_enter(&t->zthr_state_lock); - t->zthr_checkfunc = checkfunc; - t->zthr_func = func; - t->zthr_arg = arg; - t->zthr_wait_time = max_sleep; - - t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, - 0, &p0, TS_RUN, minclsyspri); - mutex_exit(&t->zthr_state_lock); - - return (t); -} - -void -zthr_destroy(zthr_t *t) -{ - ASSERT(!MUTEX_HELD(&t->zthr_state_lock)); - ASSERT(!MUTEX_HELD(&t->zthr_request_lock)); - VERIFY3P(t->zthr_thread, ==, NULL); - mutex_destroy(&t->zthr_request_lock); - mutex_destroy(&t->zthr_state_lock); - cv_destroy(&t->zthr_cv); - kmem_free(t, sizeof (*t)); -} - -/* - * Wake up the zthr if it is sleeping. If the thread has been - * cancelled that does nothing. - */ -void -zthr_wakeup(zthr_t *t) -{ - mutex_enter(&t->zthr_request_lock); - mutex_enter(&t->zthr_state_lock); - - /* - * There are 4 states that we can find the zthr when issuing - * this broadcast: - * - * [1] The common case of the thread being asleep, at which - * point the broadcast will wake it up. - * [2] The thread has been cancelled. Waking up a cancelled - * thread is a no-op. Any work that is still left to be - * done should be handled the next time the thread is - * resumed. - * [3] The thread is doing work and is already up, so this - * is basically a no-op. - * [4] The thread was just created/resumed, in which case the - * behavior is similar to [3]. - */ - cv_broadcast(&t->zthr_cv); - - mutex_exit(&t->zthr_state_lock); - mutex_exit(&t->zthr_request_lock); -} - -/* - * Sends a cancel request to the zthr and blocks until the zthr is - * cancelled. If the zthr is not running (e.g. has been cancelled - * already), this is a no-op. - */ -void -zthr_cancel(zthr_t *t) -{ - mutex_enter(&t->zthr_request_lock); - mutex_enter(&t->zthr_state_lock); - - /* - * Since we are holding the zthr_state_lock at this point - * we can find the state in one of the following 4 states: - * - * [1] The thread has already been cancelled, therefore - * there is nothing for us to do. - * [2] The thread is sleeping, so we broadcast the CV first - * to wake it up and then we set the flag and we are - * waiting for it to exit. - * [3] The thread is doing work, in which case we just set - * the flag and wait for it to finish. - * [4] The thread was just created/resumed, in which case - * the behavior is similar to [3]. - * - * Since requests are serialized, by the time that we get - * control back we expect that the zthr is cancelled and - * not running anymore. - */ - if (t->zthr_thread != NULL) { - t->zthr_cancel = B_TRUE; - - /* broadcast in case the zthr is sleeping */ - cv_broadcast(&t->zthr_cv); - - while (t->zthr_thread != NULL) - cv_wait(&t->zthr_cv, &t->zthr_state_lock); - - ASSERT(!t->zthr_cancel); - } - - mutex_exit(&t->zthr_state_lock); - mutex_exit(&t->zthr_request_lock); -} - -/* - * Sends a resume request to the supplied zthr. If the zthr is - * already running this is a no-op. 
- */ -void -zthr_resume(zthr_t *t) -{ - mutex_enter(&t->zthr_request_lock); - mutex_enter(&t->zthr_state_lock); - - ASSERT3P(&t->zthr_checkfunc, !=, NULL); - ASSERT3P(&t->zthr_func, !=, NULL); - ASSERT(!t->zthr_cancel); - - /* - * There are 4 states that we find the zthr in at this point - * given the locks that we hold: - * - * [1] The zthr was cancelled, so we spawn a new thread for - * the zthr (common case). - * [2] The zthr is running at which point this is a no-op. - * [3] The zthr is sleeping at which point this is a no-op. - * [4] The zthr was just spawned at which point this is a - * no-op. - */ - if (t->zthr_thread == NULL) { - t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, - 0, &p0, TS_RUN, minclsyspri); - } - - mutex_exit(&t->zthr_state_lock); - mutex_exit(&t->zthr_request_lock); -} - -/* - * This function is intended to be used by the zthr itself - * (specifically the zthr_func callback provided) to check - * if another thread has signaled it to stop running before - * doing some expensive operation. - * - * returns TRUE if we are in the middle of trying to cancel - * this thread. - * - * returns FALSE otherwise. - */ -boolean_t -zthr_iscancelled(zthr_t *t) -{ - ASSERT3P(t->zthr_thread, ==, curthread); - - /* - * The majority of the functions here grab zthr_request_lock - * first and then zthr_state_lock. This function only grabs - * the zthr_state_lock. That is because this function should - * only be called from the zthr_func to check if someone has - * issued a zthr_cancel() on the thread. If there is a zthr_cancel() - * happening concurrently, attempting to grab the request lock - * here would result in a deadlock. - * - * By grabbing only the zthr_state_lock this function is allowed - * to run concurrently with a zthr_cancel() request. - */ - mutex_enter(&t->zthr_state_lock); - boolean_t cancelled = t->zthr_cancel; - mutex_exit(&t->zthr_state_lock); - return (cancelled); -} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ /dev/null @@ -1,3347 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2006-2010 Pawel Jakub Dawidek - * All rights reserved. - * - * Portions Copyright 2010 Robert Milkowski - * - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
- */ - -/* Portions Copyright 2011 Martin Matuska */ - -/* - * ZFS volume emulation driver. - * - * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. - * Volumes are accessed through the symbolic links named: - * - * /dev/zvol/dsk// - * /dev/zvol/rdsk// - * - * These links are created by the /dev filesystem (sdev_zvolops.c). - * Volumes are persistent through reboot. No user command needs to be - * run before opening and using a device. - * - * FreeBSD notes. - * On FreeBSD ZVOLs are simply GEOM providers like any other storage device - * in the system. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "zfs_namecheck.h" - -#ifndef illumos -struct g_class zfs_zvol_class = { - .name = "ZFS::ZVOL", - .version = G_VERSION, -}; - -DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); - -#endif -void *zfsdev_state; -static char *zvol_tag = "zvol_tag"; - -#define ZVOL_DUMPSIZE "dumpsize" - -/* - * This lock protects the zfsdev_state structure from being modified - * while it's being used, e.g. an open that comes in before a create - * finishes. It also protects temporary opens of the dataset so that, - * e.g., an open doesn't get a spurious EBUSY. - */ -#ifdef illumos -kmutex_t zfsdev_state_lock; -#else -/* - * In FreeBSD we've replaced the upstream zfsdev_state_lock with the - * spa_namespace_lock in the ZVOL code. - */ -#define zfsdev_state_lock spa_namespace_lock -#endif -static uint32_t zvol_minors; - -#ifndef illumos -SYSCTL_DECL(_vfs_zfs); -SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "ZFS VOLUME"); -static int volmode = ZFS_VOLMODE_GEOM; -SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0, - "Expose as GEOM providers (1), device files (2) or neither"); -static boolean_t zpool_on_zvol = B_FALSE; -SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, - "Allow zpools to use zvols as vdevs (DANGEROUS)"); - -#endif -typedef struct zvol_extent { - list_node_t ze_node; - dva_t ze_dva; /* dva associated with this extent */ - uint64_t ze_nblks; /* number of blocks in extent */ -} zvol_extent_t; - -/* - * The in-core state of each volume. - */ -typedef struct zvol_state { -#ifndef illumos - LIST_ENTRY(zvol_state) zv_links; -#endif - char zv_name[MAXPATHLEN]; /* pool/dd name */ - uint64_t zv_volsize; /* amount of space we advertise */ - uint64_t zv_volblocksize; /* volume block size */ -#ifdef illumos - minor_t zv_minor; /* minor number */ -#else - struct cdev *zv_dev; /* non-GEOM device */ - struct g_provider *zv_provider; /* GEOM provider */ -#endif - uint8_t zv_min_bs; /* minimum addressable block shift */ - uint8_t zv_flags; /* readonly, dumpified, etc. 
*/ - objset_t *zv_objset; /* objset handle */ -#ifdef illumos - uint32_t zv_open_count[OTYPCNT]; /* open counts */ -#endif - uint32_t zv_total_opens; /* total open count */ - uint32_t zv_sync_cnt; /* synchronous open count */ - zilog_t *zv_zilog; /* ZIL handle */ - list_t zv_extents; /* List of extents for dump */ - rangelock_t zv_rangelock; - dnode_t *zv_dn; /* dnode hold */ -#ifndef illumos - int zv_state; - int zv_volmode; /* Provide GEOM or cdev */ - struct bio_queue_head zv_queue; - struct mtx zv_queue_mtx; /* zv_queue mutex */ -#endif -} zvol_state_t; - -typedef enum { - ZVOL_ASYNC_CREATE_MINORS, - ZVOL_ASYNC_REMOVE_MINORS, - ZVOL_ASYNC_RENAME_MINORS, - ZVOL_ASYNC_MAX -} zvol_async_op_t; - -typedef struct { - zvol_async_op_t op; - char pool[ZFS_MAX_DATASET_NAME_LEN]; - char name1[ZFS_MAX_DATASET_NAME_LEN]; - char name2[ZFS_MAX_DATASET_NAME_LEN]; -} zvol_task_t; - -#ifndef illumos -static LIST_HEAD(, zvol_state) all_zvols; -#endif -/* - * zvol specific flags - */ -#define ZVOL_RDONLY 0x1 -#define ZVOL_DUMPIFIED 0x2 -#define ZVOL_EXCL 0x4 -#define ZVOL_WCE 0x8 - -/* - * zvol maximum transfer in one DMU tx. - */ -int zvol_maxphys = DMU_MAX_ACCESS/2; - -/* - * Toggle unmap functionality. - */ -boolean_t zvol_unmap_enabled = B_TRUE; - -/* - * If true, unmaps requested as synchronous are executed synchronously, - * otherwise all unmaps are asynchronous. - */ -boolean_t zvol_unmap_sync_enabled = B_FALSE; - -#ifndef illumos -SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, - &zvol_unmap_enabled, 0, - "Enable UNMAP functionality"); - -SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_sync_enabled, CTLFLAG_RWTUN, - &zvol_unmap_sync_enabled, 0, - "UNMAPs requested as sync are executed synchronously"); - -static d_open_t zvol_d_open; -static d_close_t zvol_d_close; -static d_read_t zvol_read; -static d_write_t zvol_write; -static d_ioctl_t zvol_d_ioctl; -static d_strategy_t zvol_strategy; - -static struct cdevsw zvol_cdevsw = { - .d_version = D_VERSION, - .d_open = zvol_d_open, - .d_close = zvol_d_close, - .d_read = zvol_read, - .d_write = zvol_write, - .d_ioctl = zvol_d_ioctl, - .d_strategy = zvol_strategy, - .d_name = "zvol", - .d_flags = D_DISK | D_TRACKCLOSE, -}; - -static void zvol_geom_run(zvol_state_t *zv); -static void zvol_geom_destroy(zvol_state_t *zv); -static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); -static void zvol_geom_start(struct bio *bp); -static void zvol_geom_worker(void *arg); -static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, - uint64_t len, boolean_t sync); -#endif /* !illumos */ - -extern int zfs_set_prop_nvlist(const char *, zprop_source_t, - nvlist_t *, nvlist_t *); -static int zvol_remove_zv(zvol_state_t *); -static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, - struct lwb *lwb, zio_t *zio); -static int zvol_dumpify(zvol_state_t *zv); -static int zvol_dump_fini(zvol_state_t *zv); -static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); - -static void -zvol_size_changed(zvol_state_t *zv, uint64_t volsize) -{ -#ifdef illumos - dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor); - - zv->zv_volsize = volsize; - VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Size", volsize) == DDI_SUCCESS); - VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Nblocks", lbtodb(volsize)) == DDI_SUCCESS); - - /* Notify specfs to invalidate the cached size */ - spec_size_invalidate(dev, VBLK); - spec_size_invalidate(dev, VCHR); -#else /* !illumos */ - zv->zv_volsize = volsize; - if (zv->zv_volmode == 
ZFS_VOLMODE_GEOM) { - struct g_provider *pp; - - pp = zv->zv_provider; - if (pp == NULL) - return; - g_topology_lock(); - - /* - * Do not invoke resize event when initial size was zero. - * ZVOL initializes the size on first open, this is not - * real resizing. - */ - if (pp->mediasize == 0) - pp->mediasize = zv->zv_volsize; - else - g_resize_provider(pp, zv->zv_volsize); - g_topology_unlock(); - } -#endif /* illumos */ -} - -int -zvol_check_volsize(uint64_t volsize, uint64_t blocksize) -{ - if (volsize == 0) - return (SET_ERROR(EINVAL)); - - if (volsize % blocksize != 0) - return (SET_ERROR(EINVAL)); - -#ifdef _ILP32 - if (volsize - 1 > SPEC_MAXOFFSET_T) - return (SET_ERROR(EOVERFLOW)); -#endif - return (0); -} - -int -zvol_check_volblocksize(uint64_t volblocksize) -{ - if (volblocksize < SPA_MINBLOCKSIZE || - volblocksize > SPA_OLD_MAXBLOCKSIZE || - !ISP2(volblocksize)) - return (SET_ERROR(EDOM)); - - return (0); -} - -int -zvol_get_stats(objset_t *os, nvlist_t *nv) -{ - int error; - dmu_object_info_t doi; - uint64_t val; - - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); - if (error) - return (error); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); - - error = dmu_object_info(os, ZVOL_OBJ, &doi); - - if (error == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, - doi.doi_data_block_size); - } - - return (error); -} - -static zvol_state_t * -zvol_minor_lookup(const char *name) -{ -#ifdef illumos - minor_t minor; -#endif - zvol_state_t *zv; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - -#ifdef illumos - for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) { - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - continue; -#else - LIST_FOREACH(zv, &all_zvols, zv_links) { -#endif - if (strcmp(zv->zv_name, name) == 0) - return (zv); - } - - return (NULL); -} - -/* extent mapping arg */ -struct maparg { - zvol_state_t *ma_zv; - uint64_t ma_blks; -}; - -/*ARGSUSED*/ -static int -zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - struct maparg *ma = arg; - zvol_extent_t *ze; - int bs = ma->ma_zv->zv_volblocksize; - - if (bp == NULL || BP_IS_HOLE(bp) || - zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) - return (0); - - VERIFY(!BP_IS_EMBEDDED(bp)); - - VERIFY3U(ma->ma_blks, ==, zb->zb_blkid); - ma->ma_blks++; - - /* Abort immediately if we have encountered gang blocks */ - if (BP_IS_GANG(bp)) - return (SET_ERROR(EFRAGS)); - - /* - * See if the block is at the end of the previous extent. 
- */ - ze = list_tail(&ma->ma_zv->zv_extents); - if (ze && - DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) && - DVA_GET_OFFSET(BP_IDENTITY(bp)) == - DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) { - ze->ze_nblks++; - return (0); - } - - dprintf_bp(bp, "%s", "next blkptr:"); - - /* start a new extent */ - ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP); - ze->ze_dva = bp->blk_dva[0]; /* structure assignment */ - ze->ze_nblks = 1; - list_insert_tail(&ma->ma_zv->zv_extents, ze); - return (0); -} - -static void -zvol_free_extents(zvol_state_t *zv) -{ - zvol_extent_t *ze; - - while (ze = list_head(&zv->zv_extents)) { - list_remove(&zv->zv_extents, ze); - kmem_free(ze, sizeof (zvol_extent_t)); - } -} - -static int -zvol_get_lbas(zvol_state_t *zv) -{ - objset_t *os = zv->zv_objset; - struct maparg ma; - int err; - - ma.ma_zv = zv; - ma.ma_blks = 0; - zvol_free_extents(zv); - - /* commit any in-flight changes before traversing the dataset */ - txg_wait_synced(dmu_objset_pool(os), 0); - err = traverse_dataset(dmu_objset_ds(os), 0, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma); - if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) { - zvol_free_extents(zv); - return (err ? err : EIO); - } - - return (0); -} - -/* ARGSUSED */ -void -zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) -{ - zfs_creat_t *zct = arg; - nvlist_t *nvprops = zct->zct_props; - int error; - uint64_t volblocksize, volsize; - - VERIFY(nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); - if (nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) - volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); - - /* - * These properties must be removed from the list so the generic - * property setting step won't apply to them. - */ - VERIFY(nvlist_remove_all(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); - (void) nvlist_remove_all(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); - - error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, - DMU_OT_NONE, 0, tx); - ASSERT(error == 0); - - error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); - ASSERT(error == 0); -} - -/* - * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we - * implement DKIOCFREE/free-long-range. 
- */ -static int -zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) -{ - zvol_state_t *zv = arg1; - lr_truncate_t *lr = arg2; - uint64_t offset, length; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - offset = lr->lr_offset; - length = lr->lr_length; - - return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); -} - -/* - * Replay a TX_WRITE ZIL transaction that didn't get committed - * after a system failure - */ -static int -zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) -{ - zvol_state_t *zv = arg1; - lr_write_t *lr = arg2; - objset_t *os = zv->zv_objset; - char *data = (char *)(lr + 1); /* data follows lr_write_t */ - uint64_t offset, length; - dmu_tx_t *tx; - int error; - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - offset = lr->lr_offset; - length = lr->lr_length; - - /* If it's a dmu_sync() block, write the whole block */ - if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { - uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); - if (length < blocksize) { - offset -= offset % blocksize; - length = blocksize; - } - } - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, offset, length, data, tx); - dmu_tx_commit(tx); - } - - return (error); -} - -/* ARGSUSED */ -static int -zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) -{ - return (SET_ERROR(ENOTSUP)); -} - -/* - * Callback vectors for replaying records. - * Only TX_WRITE and TX_TRUNCATE are needed for zvol. - */ -zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { - zvol_replay_err, /* 0 no such transaction type */ - zvol_replay_err, /* TX_CREATE */ - zvol_replay_err, /* TX_MKDIR */ - zvol_replay_err, /* TX_MKXATTR */ - zvol_replay_err, /* TX_SYMLINK */ - zvol_replay_err, /* TX_REMOVE */ - zvol_replay_err, /* TX_RMDIR */ - zvol_replay_err, /* TX_LINK */ - zvol_replay_err, /* TX_RENAME */ - zvol_replay_write, /* TX_WRITE */ - zvol_replay_truncate, /* TX_TRUNCATE */ - zvol_replay_err, /* TX_SETATTR */ - zvol_replay_err, /* TX_ACL */ - zvol_replay_err, /* TX_CREATE_ACL */ - zvol_replay_err, /* TX_CREATE_ATTR */ - zvol_replay_err, /* TX_CREATE_ACL_ATTR */ - zvol_replay_err, /* TX_MKDIR_ACL */ - zvol_replay_err, /* TX_MKDIR_ATTR */ - zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ - zvol_replay_err, /* TX_WRITE2 */ -}; - -#ifdef illumos -int -zvol_name2minor(const char *name, minor_t *minor) -{ - zvol_state_t *zv; - - mutex_enter(&zfsdev_state_lock); - zv = zvol_minor_lookup(name); - if (minor && zv) - *minor = zv->zv_minor; - mutex_exit(&zfsdev_state_lock); - return (zv ? 0 : -1); -} -#endif /* illumos */ - -/* - * Create a minor node (plus a whole lot more) for the specified volume. 
- */ -static int -zvol_create_minor(const char *name) -{ - zfs_soft_state_t *zs; - zvol_state_t *zv; - objset_t *os; -#ifdef illumos - dmu_object_info_t doi; - minor_t minor = 0; - char chrbuf[30], blkbuf[30]; -#else - struct g_provider *pp; - struct g_geom *gp; - uint64_t mode; -#endif - int error; - -#ifndef illumos - ZFS_LOG(1, "Creating ZVOL %s...", name); -#endif - - mutex_enter(&zfsdev_state_lock); - - if (zvol_minor_lookup(name) != NULL) { - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(EEXIST)); - } - - /* lie and say we're read-only */ - error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); - - if (error) { - mutex_exit(&zfsdev_state_lock); - return (error); - } - -#ifdef illumos - if ((minor = zfsdev_minor_alloc()) == 0) { - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(ENXIO)); - } - - if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) { - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(EAGAIN)); - } - (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, - (char *)name); - - (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor); - - if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_soft_state_free(zfsdev_state, minor); - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(EAGAIN)); - } - - (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor); - - if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, - minor, DDI_PSEUDO, 0) == DDI_FAILURE) { - ddi_remove_minor_node(zfs_dip, chrbuf); - ddi_soft_state_free(zfsdev_state, minor); - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(EAGAIN)); - } - - zs = ddi_get_soft_state(zfsdev_state, minor); - zs->zss_type = ZSST_ZVOL; - zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); -#else /* !illumos */ - - zv = kmem_zalloc(sizeof(*zv), KM_SLEEP); - zv->zv_state = 0; - error = dsl_prop_get_integer(name, - zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL); - if (error != 0 || mode == ZFS_VOLMODE_DEFAULT) - mode = volmode; - - zv->zv_volmode = mode; - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - g_topology_lock(); - gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); - gp->start = zvol_geom_start; - gp->access = zvol_geom_access; - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); - pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; - pp->sectorsize = DEV_BSIZE; - pp->mediasize = 0; - pp->private = zv; - - zv->zv_provider = pp; - bioq_init(&zv->zv_queue); - mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct make_dev_args args; - - make_dev_args_init(&args); - args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; - args.mda_devsw = &zvol_cdevsw; - args.mda_cr = NULL; - args.mda_uid = UID_ROOT; - args.mda_gid = GID_OPERATOR; - args.mda_mode = 0640; - args.mda_si_drv2 = zv; - error = make_dev_s(&args, &zv->zv_dev, - "%s/%s", ZVOL_DRIVER, name); - if (error != 0) { - kmem_free(zv, sizeof(*zv)); - dmu_objset_disown(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (error); - } - zv->zv_dev->si_iosize_max = MAXPHYS; - } - LIST_INSERT_HEAD(&all_zvols, zv, zv_links); -#endif /* illumos */ - - (void) strlcpy(zv->zv_name, name, MAXPATHLEN); - zv->zv_min_bs = DEV_BSHIFT; -#ifdef illumos - zv->zv_minor = minor; -#endif - zv->zv_objset = os; - if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) - zv->zv_flags |= ZVOL_RDONLY; - 
rangelock_init(&zv->zv_rangelock, NULL, NULL); - list_create(&zv->zv_extents, sizeof (zvol_extent_t), - offsetof(zvol_extent_t, ze_node)); -#ifdef illumos - /* get and cache the blocksize */ - error = dmu_object_info(os, ZVOL_OBJ, &doi); - ASSERT(error == 0); - zv->zv_volblocksize = doi.doi_data_block_size; -#endif - - if (spa_writeable(dmu_objset_spa(os))) { - if (zil_replay_disable) - zil_destroy(dmu_objset_zil(os), B_FALSE); - else - zil_replay(os, zv, zvol_replay_vector); - } - dmu_objset_disown(os, FTAG); - zv->zv_objset = NULL; - - zvol_minors++; - - mutex_exit(&zfsdev_state_lock); -#ifndef illumos - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - zvol_geom_run(zv); - g_topology_unlock(); - } - - ZFS_LOG(1, "ZVOL %s created.", name); -#endif - - return (0); -} - -/* - * Remove minor node for the specified volume. - */ -static int -zvol_remove_zv(zvol_state_t *zv) -{ -#ifdef illumos - char nmbuf[20]; - minor_t minor = zv->zv_minor; -#endif - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - if (zv->zv_total_opens != 0) - return (SET_ERROR(EBUSY)); - -#ifdef illumos - (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor); - ddi_remove_minor_node(zfs_dip, nmbuf); - - (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor); - ddi_remove_minor_node(zfs_dip, nmbuf); -#else - ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); - - LIST_REMOVE(zv, zv_links); - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - g_topology_lock(); - zvol_geom_destroy(zv); - g_topology_unlock(); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - if (zv->zv_dev != NULL) - destroy_dev(zv->zv_dev); - } -#endif - - rangelock_fini(&zv->zv_rangelock); - - kmem_free(zv, sizeof (zvol_state_t)); -#ifdef illumos - ddi_soft_state_free(zfsdev_state, minor); -#endif - zvol_minors--; - return (0); -} - -int -zvol_first_open(zvol_state_t *zv) -{ - dmu_object_info_t doi; - objset_t *os; - uint64_t volsize; - int error; - uint64_t readonly; - - /* lie and say we're read-only */ - error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE, - zvol_tag, &os); - if (error) - return (error); - - zv->zv_objset = os; - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); - if (error) { - ASSERT(error == 0); - dmu_objset_disown(os, zvol_tag); - return (error); - } - - /* get and cache the blocksize */ - error = dmu_object_info(os, ZVOL_OBJ, &doi); - if (error) { - ASSERT(error == 0); - dmu_objset_disown(os, zvol_tag); - return (error); - } - zv->zv_volblocksize = doi.doi_data_block_size; - - error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn); - if (error) { - dmu_objset_disown(os, zvol_tag); - return (error); - } - - zvol_size_changed(zv, volsize); - zv->zv_zilog = zil_open(os, zvol_get_data); - - VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly, - NULL) == 0); - if (readonly || dmu_objset_is_snapshot(os) || - !spa_writeable(dmu_objset_spa(os))) - zv->zv_flags |= ZVOL_RDONLY; - else - zv->zv_flags &= ~ZVOL_RDONLY; - return (error); -} - -void -zvol_last_close(zvol_state_t *zv) -{ - zil_close(zv->zv_zilog); - zv->zv_zilog = NULL; - - dnode_rele(zv->zv_dn, zvol_tag); - zv->zv_dn = NULL; - - /* - * Evict cached data - */ - if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) && - !(zv->zv_flags & ZVOL_RDONLY)) - txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); - dmu_objset_evict_dbufs(zv->zv_objset); - - dmu_objset_disown(zv->zv_objset, zvol_tag); - zv->zv_objset = NULL; -} - -#ifdef illumos -int -zvol_prealloc(zvol_state_t *zv) -{ - objset_t *os = zv->zv_objset; - dmu_tx_t *tx; - uint64_t refd, avail, usedobjs, availobjs; - 
uint64_t resid = zv->zv_volsize; - uint64_t off = 0; - - /* Check the space usage before attempting to allocate the space */ - dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs); - if (avail < zv->zv_volsize) - return (SET_ERROR(ENOSPC)); - - /* Free old extents if they exist */ - zvol_free_extents(zv); - - while (resid != 0) { - int error; - uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE); - - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off); - return (error); - } - dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx); - dmu_tx_commit(tx); - off += bytes; - resid -= bytes; - } - txg_wait_synced(dmu_objset_pool(os), 0); - - return (0); -} -#endif /* illumos */ - -static int -zvol_update_volsize(objset_t *os, uint64_t volsize) -{ - dmu_tx_t *tx; - int error; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - dmu_tx_mark_netfree(tx); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, - &volsize, tx); - dmu_tx_commit(tx); - - if (error == 0) - error = dmu_free_long_range(os, - ZVOL_OBJ, volsize, DMU_OBJECT_END); - return (error); -} - -void -zvol_remove_minors_impl(const char *name) -{ -#ifdef illumos - zvol_state_t *zv; - char *namebuf; - minor_t minor; - - namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP); - (void) strncpy(namebuf, name, strlen(name)); - (void) strcat(namebuf, "/"); - mutex_enter(&zfsdev_state_lock); - for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) { - - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - continue; - if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0) - (void) zvol_remove_zv(zv); - } - kmem_free(namebuf, strlen(name) + 2); - - mutex_exit(&zfsdev_state_lock); -#else /* !illumos */ - zvol_state_t *zv, *tzv; - size_t namelen; - - namelen = strlen(name); - - mutex_enter(&zfsdev_state_lock); - - LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) { - if (strcmp(zv->zv_name, name) == 0 || - (strncmp(zv->zv_name, name, namelen) == 0 && - strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' || - zv->zv_name[namelen] == '@'))) { - (void) zvol_remove_zv(zv); - } - } - - mutex_exit(&zfsdev_state_lock); -#endif /* illumos */ -} - -static int -zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize) -{ - uint64_t old_volsize = 0ULL; - int error = 0; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - - /* - * Reinitialize the dump area to the new size. If we - * failed to resize the dump area then restore it back to - * its original size. We must set the new volsize prior - * to calling dumpvp_resize() to ensure that the devices' - * size(9P) is not visible by the dump subsystem. - */ - old_volsize = zv->zv_volsize; - zvol_size_changed(zv, volsize); - -#ifdef ZVOL_DUMP - if (zv->zv_flags & ZVOL_DUMPIFIED) { - if ((error = zvol_dumpify(zv)) != 0 || - (error = dumpvp_resize()) != 0) { - int dumpify_error; - - (void) zvol_update_volsize(zv->zv_objset, old_volsize); - zvol_size_changed(zv, old_volsize); - dumpify_error = zvol_dumpify(zv); - error = dumpify_error ? dumpify_error : error; - } - } -#endif /* ZVOL_DUMP */ - -#ifdef illumos - /* - * Generate a LUN expansion event. 
- */ - if (error == 0) { - sysevent_id_t eid; - nvlist_t *attr; - char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - - (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV, - zv->zv_minor); - - VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); - - (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, - ESC_DEV_DLE, attr, &eid, DDI_SLEEP); - - nvlist_free(attr); - kmem_free(physpath, MAXPATHLEN); - } -#endif /* illumos */ - return (error); -} - -int -zvol_set_volsize(const char *name, uint64_t volsize) -{ - zvol_state_t *zv = NULL; - objset_t *os; - int error; - dmu_object_info_t doi; - uint64_t readonly; - boolean_t owned = B_FALSE; - - error = dsl_prop_get_integer(name, - zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); - if (error != 0) - return (error); - if (readonly) - return (SET_ERROR(EROFS)); - - mutex_enter(&zfsdev_state_lock); - zv = zvol_minor_lookup(name); - - if (zv == NULL || zv->zv_objset == NULL) { - if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, - FTAG, &os)) != 0) { - mutex_exit(&zfsdev_state_lock); - return (error); - } - owned = B_TRUE; - if (zv != NULL) - zv->zv_objset = os; - } else { - os = zv->zv_objset; - } - - if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || - (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0) - goto out; - - error = zvol_update_volsize(os, volsize); - - if (error == 0 && zv != NULL) - error = zvol_update_live_volsize(zv, volsize); -out: - if (owned) { - dmu_objset_disown(os, FTAG); - if (zv != NULL) - zv->zv_objset = NULL; - } - mutex_exit(&zfsdev_state_lock); - return (error); -} - -/*ARGSUSED*/ -#ifdef illumos -int -zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) -#else -static int -zvol_open(struct g_provider *pp, int flag, int count) -#endif -{ - zvol_state_t *zv; - int err = 0; -#ifdef illumos - - mutex_enter(&zfsdev_state_lock); - - zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL); - if (zv == NULL) { - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(ENXIO)); - } - - if (zv->zv_total_opens == 0) - err = zvol_first_open(zv); - if (err) { - mutex_exit(&zfsdev_state_lock); - return (err); - } -#else /* !illumos */ - boolean_t locked = B_FALSE; - - if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { - /* - * if zfs_geom_probe_vdev_key is set, that means that zfs is - * attempting to probe geom providers while looking for a - * replacement for a missing VDEV. In this case, the - * spa_namespace_lock will not be held, but it is still illegal - * to use a zvol as a vdev. Deadlocks can result if another - * thread has spa_namespace_lock - */ - return (EOPNOTSUPP); - } - /* - * Protect against recursively entering spa_namespace_lock - * when spa_open() is used for a pool on a (local) ZVOL(s). - * This is needed since we replaced upstream zfsdev_state_lock - * with spa_namespace_lock in the ZVOL code. - * We are using the same trick as spa_open(). - * Note that calls in zvol_first_open which need to resolve - * pool name to a spa object will enter spa_open() - * recursively, but that function already has all the - * necessary protection. 
- */ - if (!MUTEX_HELD(&zfsdev_state_lock)) { - mutex_enter(&zfsdev_state_lock); - locked = B_TRUE; - } - - zv = pp->private; - if (zv == NULL) { - if (locked) - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(ENXIO)); - } - - if (zv->zv_total_opens == 0) { - err = zvol_first_open(zv); - if (err) { - if (locked) - mutex_exit(&zfsdev_state_lock); - return (err); - } - pp->mediasize = zv->zv_volsize; - pp->stripeoffset = 0; - pp->stripesize = zv->zv_volblocksize; - } -#endif /* illumos */ - if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - err = SET_ERROR(EROFS); - goto out; - } - if (zv->zv_flags & ZVOL_EXCL) { - err = SET_ERROR(EBUSY); - goto out; - } -#ifdef FEXCL - if (flag & FEXCL) { - if (zv->zv_total_opens != 0) { - err = SET_ERROR(EBUSY); - goto out; - } - zv->zv_flags |= ZVOL_EXCL; - } -#endif - -#ifdef illumos - if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) { - zv->zv_open_count[otyp]++; - zv->zv_total_opens++; - } - mutex_exit(&zfsdev_state_lock); -#else - zv->zv_total_opens += count; - if (locked) - mutex_exit(&zfsdev_state_lock); -#endif - - return (err); -out: - if (zv->zv_total_opens == 0) - zvol_last_close(zv); -#ifdef illumos - mutex_exit(&zfsdev_state_lock); -#else - if (locked) - mutex_exit(&zfsdev_state_lock); -#endif - return (err); -} - -/*ARGSUSED*/ -#ifdef illumos -int -zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) -{ - minor_t minor = getminor(dev); - zvol_state_t *zv; - int error = 0; - - mutex_enter(&zfsdev_state_lock); - - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) { - mutex_exit(&zfsdev_state_lock); -#else /* !illumos */ -static int -zvol_close(struct g_provider *pp, int flag, int count) -{ - zvol_state_t *zv; - int error = 0; - boolean_t locked = B_FALSE; - - /* See comment in zvol_open(). */ - if (!MUTEX_HELD(&zfsdev_state_lock)) { - mutex_enter(&zfsdev_state_lock); - locked = B_TRUE; - } - - zv = pp->private; - if (zv == NULL) { - if (locked) - mutex_exit(&zfsdev_state_lock); -#endif /* illumos */ - return (SET_ERROR(ENXIO)); - } - - if (zv->zv_flags & ZVOL_EXCL) { - ASSERT(zv->zv_total_opens == 1); - zv->zv_flags &= ~ZVOL_EXCL; - } - - /* - * If the open count is zero, this is a spurious close. - * That indicates a bug in the kernel / DDI framework. - */ -#ifdef illumos - ASSERT(zv->zv_open_count[otyp] != 0); -#endif - ASSERT(zv->zv_total_opens != 0); - - /* - * You may get multiple opens, but only one close. - */ -#ifdef illumos - zv->zv_open_count[otyp]--; - zv->zv_total_opens--; -#else - zv->zv_total_opens -= count; -#endif - - if (zv->zv_total_opens == 0) - zvol_last_close(zv); - -#ifdef illumos - mutex_exit(&zfsdev_state_lock); -#else - if (locked) - mutex_exit(&zfsdev_state_lock); -#endif - return (error); -} - -/* ARGSUSED */ -static void -zvol_get_done(zgd_t *zgd, int error) -{ - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); - - rangelock_exit(zgd->zgd_lr); - - kmem_free(zgd, sizeof (zgd_t)); -} - -/* - * Get data to generate a TX_WRITE intent log record. - */ -static int -zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) -{ - zvol_state_t *zv = arg; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; /* length of user data */ - dmu_buf_t *db; - zgd_t *zgd; - int error; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_lwb = lwb; - - /* - * Write records come in two flavors: immediate and indirect. 
- * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_READER); - error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, - DMU_READ_NO_PREFETCH); - } else { /* indirect write */ - /* - * Have to lock the whole block to ensure when it's written out - * and its checksum is being calculated that no one can change - * the data. Contrarily to zfs_get_data we need not re-check - * blocksize after we get the lock because it cannot be changed. - */ - size = zv->zv_volblocksize; - offset = P2ALIGN(offset, size); - zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_READER); - error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zvol_get_done, zgd); - - if (error == 0) - return (0); - } - } - - zvol_get_done(zgd, error); - - return (error); -} - -/* - * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. - * - * We store data in the log buffers if it's small enough. - * Otherwise we will later flush the data out via dmu_sync(). - */ -ssize_t zvol_immediate_write_sz = 32768; -#ifdef _KERNEL -SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN, - &zvol_immediate_write_sz, 0, "Minimal size for indirect log write"); -#endif - -static void -zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, - boolean_t sync) -{ - uint32_t blocksize = zv->zv_volblocksize; - zilog_t *zilog = zv->zv_zilog; - itx_wr_state_t write_state; - - if (zil_replaying(zilog, tx)) - return; - - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - resid >= blocksize && blocksize > zvol_immediate_write_sz) - write_state = WR_INDIRECT; - else if (sync) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; - - while (resid) { - itx_t *itx; - lr_write_t *lr; - itx_wr_state_t wr_state = write_state; - ssize_t len = resid; - - if (wr_state == WR_COPIED && resid > zil_max_copied_data(zilog)) - wr_state = WR_NEED_COPY; - else if (wr_state == WR_INDIRECT) - len = MIN(blocksize - P2PHASE(off, blocksize), resid); - - itx = zil_itx_create(TX_WRITE, sizeof (*lr) + - (wr_state == WR_COPIED ? 
len : 0)); - lr = (lr_write_t *)&itx->itx_lr; - if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, - off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { - zil_itx_destroy(itx); - itx = zil_itx_create(TX_WRITE, sizeof (*lr)); - lr = (lr_write_t *)&itx->itx_lr; - wr_state = WR_NEED_COPY; - } - - itx->itx_wr_state = wr_state; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = len; - lr->lr_blkoff = 0; - BP_ZERO(&lr->lr_blkptr); - - itx->itx_private = zv; - - if (!sync && (zv->zv_sync_cnt == 0)) - itx->itx_sync = B_FALSE; - - zil_itx_assign(zilog, itx, tx); - - off += len; - resid -= len; - } -} - -#ifdef illumos -static int -zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset, - uint64_t size, boolean_t doread, boolean_t isdump) -{ - vdev_disk_t *dvd; - int c; - int numerrors = 0; - - if (vd->vdev_ops == &vdev_mirror_ops || - vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops) { - for (c = 0; c < vd->vdev_children; c++) { - int err = zvol_dumpio_vdev(vd->vdev_child[c], - addr, offset, origoffset, size, doread, isdump); - if (err != 0) { - numerrors++; - } else if (doread) { - break; - } - } - } - - if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops) - return (numerrors < vd->vdev_children ? 0 : EIO); - - if (doread && !vdev_readable(vd)) - return (SET_ERROR(EIO)); - else if (!doread && !vdev_writeable(vd)) - return (SET_ERROR(EIO)); - - if (vd->vdev_ops == &vdev_raidz_ops) { - return (vdev_raidz_physio(vd, - addr, size, offset, origoffset, doread, isdump)); - } - - offset += VDEV_LABEL_START_SIZE; - - if (ddi_in_panic() || isdump) { - ASSERT(!doread); - if (doread) - return (SET_ERROR(EIO)); - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); - return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), - lbtodb(size))); - } else { - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); - return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size, - offset, doread ? B_READ : B_WRITE)); - } -} - -static int -zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, - boolean_t doread, boolean_t isdump) -{ - vdev_t *vd; - int error; - zvol_extent_t *ze; - spa_t *spa = dmu_objset_spa(zv->zv_objset); - - /* Must be sector aligned, and not stradle a block boundary. 
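The P2 macros used for that check expand to plain power-of-two arithmetic; a self-contained equivalent for reference (the _EX names are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Offset of x within its power-of-two sized unit (cf. P2PHASE). */
#define P2PHASE_EX(x, align)	((x) & ((align) - 1))

/*
 * True when [off, off + len) crosses an align-sized boundary
 * (cf. P2BOUNDARY): the first and last byte land in different units.
 */
#define P2BOUNDARY_EX(off, len, align) \
	(((off) ^ ((off) + (len) - 1)) > ((align) - 1))

/* The validity check below, restated: sector aligned, within one block. */
static bool
dumpio_range_ok_ex(uint64_t off, uint64_t size, uint64_t sector, uint64_t block)
{
	return (P2PHASE_EX(off, sector) == 0 &&
	    P2PHASE_EX(size, sector) == 0 &&
	    !P2BOUNDARY_EX(off, size, block));
}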
*/ - if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) || - P2BOUNDARY(offset, size, zv->zv_volblocksize)) { - return (SET_ERROR(EINVAL)); - } - ASSERT(size <= zv->zv_volblocksize); - - /* Locate the extent this belongs to */ - ze = list_head(&zv->zv_extents); - while (offset >= ze->ze_nblks * zv->zv_volblocksize) { - offset -= ze->ze_nblks * zv->zv_volblocksize; - ze = list_next(&zv->zv_extents, ze); - } - - if (ze == NULL) - return (SET_ERROR(EINVAL)); - - if (!ddi_in_panic()) - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); - offset += DVA_GET_OFFSET(&ze->ze_dva); - error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva), - size, doread, isdump); - - if (!ddi_in_panic()) - spa_config_exit(spa, SCL_STATE, FTAG); - - return (error); -} - -int -zvol_strategy(buf_t *bp) -{ - zfs_soft_state_t *zs = NULL; -#else /* !illumos */ -void -zvol_strategy(struct bio *bp) -{ -#endif /* illumos */ - zvol_state_t *zv; - uint64_t off, volsize; - size_t resid; - char *addr; - objset_t *os; - int error = 0; -#ifdef illumos - boolean_t doread = bp->b_flags & B_READ; -#else - boolean_t doread = 0; -#endif - boolean_t is_dumpified; - boolean_t sync; - -#ifdef illumos - if (getminor(bp->b_edev) == 0) { - error = SET_ERROR(EINVAL); - } else { - zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev)); - if (zs == NULL) - error = SET_ERROR(ENXIO); - else if (zs->zss_type != ZSST_ZVOL) - error = SET_ERROR(EINVAL); - } - - if (error) { - bioerror(bp, error); - biodone(bp); - return (0); - } - - zv = zs->zss_data; - - if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) { - bioerror(bp, EROFS); - biodone(bp); - return (0); - } - - off = ldbtob(bp->b_blkno); -#else /* !illumos */ - if (bp->bio_to) - zv = bp->bio_to->private; - else - zv = bp->bio_dev->si_drv2; - - if (zv == NULL) { - error = SET_ERROR(ENXIO); - goto out; - } - - if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { - error = SET_ERROR(EROFS); - goto out; - } - - switch (bp->bio_cmd) { - case BIO_FLUSH: - goto sync; - case BIO_READ: - doread = 1; - case BIO_WRITE: - case BIO_DELETE: - break; - default: - error = EOPNOTSUPP; - goto out; - } - - off = bp->bio_offset; -#endif /* illumos */ - volsize = zv->zv_volsize; - - os = zv->zv_objset; - ASSERT(os != NULL); - -#ifdef illumos - bp_mapin(bp); - addr = bp->b_un.b_addr; - resid = bp->b_bcount; - - if (resid > 0 && (off < 0 || off >= volsize)) { - bioerror(bp, EIO); - biodone(bp); - return (0); - } - - is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED; - sync = ((!(bp->b_flags & B_ASYNC) && - !(zv->zv_flags & ZVOL_WCE)) || - (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) && - !doread && !is_dumpified; -#else /* !illumos */ - addr = bp->bio_data; - resid = bp->bio_length; - - if (resid > 0 && (off < 0 || off >= volsize)) { - error = SET_ERROR(EIO); - goto out; - } - - is_dumpified = B_FALSE; - sync = !doread && !is_dumpified && - zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; -#endif /* illumos */ - - /* - * There must be no buffer changes when doing a dmu_sync() because - * we can't change the data whilst calculating the checksum. - */ - locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid, - doread ? 
RL_READER : RL_WRITER); - -#ifndef illumos - if (bp->bio_cmd == BIO_DELETE) { - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - } else { - zvol_log_truncate(zv, tx, off, resid, sync); - dmu_tx_commit(tx); - error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, - off, resid); - resid = 0; - } - goto unlock; - } -#endif - while (resid != 0 && off < volsize) { - size_t size = MIN(resid, zvol_maxphys); -#ifdef illumos - if (is_dumpified) { - size = MIN(size, P2END(off, zv->zv_volblocksize) - off); - error = zvol_dumpio(zv, addr, off, size, - doread, B_FALSE); - } else if (doread) { -#else - if (doread) { -#endif - error = dmu_read(os, ZVOL_OBJ, off, size, addr, - DMU_READ_PREFETCH); - } else { - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size, sync); - dmu_tx_commit(tx); - } - } - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - off += size; - addr += size; - resid -= size; - } -#ifndef illumos -unlock: -#endif - rangelock_exit(lr); - -#ifdef illumos - if ((bp->b_resid = resid) == bp->b_bcount) - bioerror(bp, off > volsize ? EINVAL : error); - - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - biodone(bp); - - return (0); -#else /* !illumos */ - bp->bio_completed = bp->bio_length - resid; - if (bp->bio_completed < bp->bio_length && off > volsize) - error = EINVAL; - - if (sync) { -sync: - zil_commit(zv->zv_zilog, ZVOL_OBJ); - } -out: - if (bp->bio_to) - g_io_deliver(bp, error); - else - biofinish(bp, NULL, error); -#endif /* illumos */ -} - -#ifdef illumos -/* - * Set the buffer count to the zvol maximum transfer. - * Using our own routine instead of the default minphys() - * means that for larger writes we write bigger buffers on X86 - * (128K instead of 56K) and flush the disk write cache less often - * (every zvol_maxphys - currently 1MB) instead of minphys (currently - * 56K on X86 and 128K on sparc). - */ -void -zvol_minphys(struct buf *bp) -{ - if (bp->b_bcount > zvol_maxphys) - bp->b_bcount = zvol_maxphys; -} - -int -zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks) -{ - minor_t minor = getminor(dev); - zvol_state_t *zv; - int error = 0; - uint64_t size; - uint64_t boff; - uint64_t resid; - - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - return (SET_ERROR(ENXIO)); - - if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0) - return (SET_ERROR(EINVAL)); - - boff = ldbtob(blkno); - resid = ldbtob(nblocks); - - VERIFY3U(boff + resid, <=, zv->zv_volsize); - - while (resid) { - size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff); - error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE); - if (error) - break; - boff += size; - addr += size; - resid -= size; - } - - return (error); -} - -/*ARGSUSED*/ -int -zvol_read(dev_t dev, uio_t *uio, cred_t *cr) -{ - minor_t minor = getminor(dev); -#else /* !illumos */ -int -zvol_read(struct cdev *dev, struct uio *uio, int ioflag) -{ -#endif /* illumos */ - zvol_state_t *zv; - uint64_t volsize; - int error = 0; - -#ifdef illumos - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - return (SET_ERROR(ENXIO)); -#else - zv = dev->si_drv2; -#endif - - volsize = zv->zv_volsize; - /* uio_loffset == volsize isn't an error as its required for EOF processing. 
*/ - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) - return (SET_ERROR(EIO)); - -#ifdef illumos - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_READ, - zvol_minphys, uio); - return (error); - } -#endif - - locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, - uio->uio_loffset, uio->uio_resid, RL_READER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - - /* don't read past the end */ - if (bytes > volsize - uio->uio_loffset) - bytes = volsize - uio->uio_loffset; - - error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - } - rangelock_exit(lr); - - return (error); -} - -#ifdef illumos -/*ARGSUSED*/ -int -zvol_write(dev_t dev, uio_t *uio, cred_t *cr) -{ - minor_t minor = getminor(dev); -#else /* !illumos */ -int -zvol_write(struct cdev *dev, struct uio *uio, int ioflag) -{ -#endif /* illumos */ - zvol_state_t *zv; - uint64_t volsize; - int error = 0; - boolean_t sync; - -#ifdef illumos - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - return (SET_ERROR(ENXIO)); -#else - zv = dev->si_drv2; -#endif - - volsize = zv->zv_volsize; - /* uio_loffset == volsize isn't an error as its required for EOF processing. */ - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) - return (SET_ERROR(EIO)); - -#ifdef illumos - if (zv->zv_flags & ZVOL_DUMPIFIED) { - error = physio(zvol_strategy, NULL, dev, B_WRITE, - zvol_minphys, uio); - return (error); - } - - sync = !(zv->zv_flags & ZVOL_WCE) || -#else - sync = (ioflag & IO_SYNC) || -#endif - (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); - - locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, - uio->uio_loffset, uio->uio_resid, RL_WRITER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - uint64_t off = uio->uio_loffset; - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - - if (bytes > volsize - off) /* don't write past the end */ - bytes = volsize - off; - - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - break; - } - error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); - if (error == 0) - zvol_log_write(zv, tx, off, bytes, sync); - dmu_tx_commit(tx); - - if (error) - break; - } - rangelock_exit(lr); - - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - return (error); -} - -#ifdef illumos -int -zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) -{ - struct uuid uuid = EFI_RESERVED; - efi_gpe_t gpe = { 0 }; - uint32_t crc; - dk_efi_t efi; - int length; - char *ptr; - - if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag)) - return (SET_ERROR(EFAULT)); - ptr = (char *)(uintptr_t)efi.dki_data_64; - length = efi.dki_length; - /* - * Some clients may attempt to request a PMBR for the - * zvol. Currently this interface will return EINVAL to - * such requests. These requests could be supported by - * adding a check for lba == 0 and consing up an appropriate - * PMBR. 
- */ - if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0) - return (SET_ERROR(EINVAL)); - - gpe.efi_gpe_StartingLBA = LE_64(34ULL); - gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1); - UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); - - if (efi.dki_lba == 1) { - efi_gpt_t gpt = { 0 }; - - gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); - gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); - gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); - gpt.efi_gpt_MyLBA = LE_64(1ULL); - gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL); - gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1); - gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL); - gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1); - gpt.efi_gpt_SizeOfPartitionEntry = - LE_32(sizeof (efi_gpe_t)); - CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); - gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); - CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); - gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); - if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length), - flag)) - return (SET_ERROR(EFAULT)); - ptr += sizeof (gpt); - length -= sizeof (gpt); - } - if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe), - length), flag)) - return (SET_ERROR(EFAULT)); - return (0); -} - -/* - * BEGIN entry points to allow external callers access to the volume. - */ -/* - * Return the volume parameters needed for access from an external caller. - * These values are invariant as long as the volume is held open. - */ -int -zvol_get_volume_params(minor_t minor, uint64_t *blksize, - uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, - void **rl_hdl, void **dnode_hdl) -{ - zvol_state_t *zv; - - zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); - if (zv == NULL) - return (SET_ERROR(ENXIO)); - if (zv->zv_flags & ZVOL_DUMPIFIED) - return (SET_ERROR(ENXIO)); - - ASSERT(blksize && max_xfer_len && minor_hdl && - objset_hdl && zil_hdl && rl_hdl && dnode_hdl); - - *blksize = zv->zv_volblocksize; - *max_xfer_len = (uint64_t)zvol_maxphys; - *minor_hdl = zv; - *objset_hdl = zv->zv_objset; - *zil_hdl = zv->zv_zilog; - *rl_hdl = &zv->zv_rangelock; - *dnode_hdl = zv->zv_dn; - return (0); -} - -/* - * Return the current volume size to an external caller. - * The size can change while the volume is open. - */ -uint64_t -zvol_get_volume_size(void *minor_hdl) -{ - zvol_state_t *zv = minor_hdl; - - return (zv->zv_volsize); -} - -/* - * Return the current WCE setting to an external caller. - * The WCE setting can change while the volume is open. - */ -int -zvol_get_volume_wce(void *minor_hdl) -{ - zvol_state_t *zv = minor_hdl; - - return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0); -} - -/* - * Entry point for external callers to zvol_log_write - */ -void -zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid, - boolean_t sync) -{ - zvol_state_t *zv = minor_hdl; - - zvol_log_write(zv, tx, off, resid, sync); -} -/* - * END entry points to allow external callers access to the volume. - */ -#endif /* illumos */ - -/* - * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. 
- */ -static void -zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, - boolean_t sync) -{ - itx_t *itx; - lr_truncate_t *lr; - zilog_t *zilog = zv->zv_zilog; - - if (zil_replaying(zilog, tx)) - return; - - itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); - lr = (lr_truncate_t *)&itx->itx_lr; - lr->lr_foid = ZVOL_OBJ; - lr->lr_offset = off; - lr->lr_length = len; - - itx->itx_sync = (sync || zv->zv_sync_cnt != 0); - zil_itx_assign(zilog, itx, tx); -} - -#ifdef illumos -/* - * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). - * Also a dirtbag dkio ioctl for unmap/free-block functionality. - */ -/*ARGSUSED*/ -int -zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) -{ - zvol_state_t *zv; - struct dk_callback *dkc; - int error = 0; - locked_range_t *lr; - - mutex_enter(&zfsdev_state_lock); - - zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL); - - if (zv == NULL) { - mutex_exit(&zfsdev_state_lock); - return (SET_ERROR(ENXIO)); - } - ASSERT(zv->zv_total_opens > 0); - - switch (cmd) { - - case DKIOCINFO: - { - struct dk_cinfo dki; - - bzero(&dki, sizeof (dki)); - (void) strcpy(dki.dki_cname, "zvol"); - (void) strcpy(dki.dki_dname, "zvol"); - dki.dki_ctype = DKC_UNKNOWN; - dki.dki_unit = getminor(dev); - dki.dki_maxtransfer = - 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs); - mutex_exit(&zfsdev_state_lock); - if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag)) - error = SET_ERROR(EFAULT); - return (error); - } - - case DKIOCGMEDIAINFO: - { - struct dk_minfo dkm; - - bzero(&dkm, sizeof (dkm)); - dkm.dki_lbsize = 1U << zv->zv_min_bs; - dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; - dkm.dki_media_type = DK_UNKNOWN; - mutex_exit(&zfsdev_state_lock); - if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag)) - error = SET_ERROR(EFAULT); - return (error); - } - - case DKIOCGMEDIAINFOEXT: - { - struct dk_minfo_ext dkmext; - - bzero(&dkmext, sizeof (dkmext)); - dkmext.dki_lbsize = 1U << zv->zv_min_bs; - dkmext.dki_pbsize = zv->zv_volblocksize; - dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; - dkmext.dki_media_type = DK_UNKNOWN; - mutex_exit(&zfsdev_state_lock); - if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag)) - error = SET_ERROR(EFAULT); - return (error); - } - - case DKIOCGETEFI: - { - uint64_t vs = zv->zv_volsize; - uint8_t bs = zv->zv_min_bs; - - mutex_exit(&zfsdev_state_lock); - error = zvol_getefi((void *)arg, flag, vs, bs); - return (error); - } - - case DKIOCFLUSHWRITECACHE: - dkc = (struct dk_callback *)arg; - mutex_exit(&zfsdev_state_lock); - zil_commit(zv->zv_zilog, ZVOL_OBJ); - if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { - (*dkc->dkc_callback)(dkc->dkc_cookie, error); - error = 0; - } - return (error); - - case DKIOCGETWCE: - { - int wce = (zv->zv_flags & ZVOL_WCE) ? 
1 : 0; - if (ddi_copyout(&wce, (void *)arg, sizeof (int), - flag)) - error = SET_ERROR(EFAULT); - break; - } - case DKIOCSETWCE: - { - int wce; - if (ddi_copyin((void *)arg, &wce, sizeof (int), - flag)) { - error = SET_ERROR(EFAULT); - break; - } - if (wce) { - zv->zv_flags |= ZVOL_WCE; - mutex_exit(&zfsdev_state_lock); - } else { - zv->zv_flags &= ~ZVOL_WCE; - mutex_exit(&zfsdev_state_lock); - zil_commit(zv->zv_zilog, ZVOL_OBJ); - } - return (0); - } - - case DKIOCGGEOM: - case DKIOCGVTOC: - /* - * commands using these (like prtvtoc) expect ENOTSUP - * since we're emulating an EFI label - */ - error = SET_ERROR(ENOTSUP); - break; - - case DKIOCDUMPINIT: - lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, - RL_WRITER); - error = zvol_dumpify(zv); - rangelock_exit(lr); - break; - - case DKIOCDUMPFINI: - if (!(zv->zv_flags & ZVOL_DUMPIFIED)) - break; - lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, - RL_WRITER); - error = zvol_dump_fini(zv); - rangelock_exit(lr); - break; - - case DKIOCFREE: - { - dkioc_free_list_t *dfl; - dmu_tx_t *tx; - - if (!zvol_unmap_enabled) - break; - - if (!(flag & FKIOCTL)) { - error = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP); - if (error != 0) - break; - } else { - dfl = (dkioc_free_list_t *)arg; - ASSERT3U(dfl->dfl_num_exts, <=, DFL_COPYIN_MAX_EXTS); - if (dfl->dfl_num_exts > DFL_COPYIN_MAX_EXTS) { - error = SET_ERROR(EINVAL); - break; - } - } - - mutex_exit(&zfsdev_state_lock); - - for (int i = 0; i < dfl->dfl_num_exts; i++) { - uint64_t start = dfl->dfl_exts[i].dfle_start, - length = dfl->dfl_exts[i].dfle_length, - end = start + length; - - /* - * Apply Postel's Law to length-checking. If they - * overshoot, just blank out until the end, if there's - * a need to blank out anything. - */ - if (start >= zv->zv_volsize) - continue; /* No need to do anything... */ - if (end > zv->zv_volsize) { - end = DMU_OBJECT_END; - length = end - start; - } - - lr = rangelock_enter(&zv->zv_rangelock, start, length, - RL_WRITER); - tx = dmu_tx_create(zv->zv_objset); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - } else { - zvol_log_truncate(zv, tx, start, length, - B_TRUE); - dmu_tx_commit(tx); - error = dmu_free_long_range(zv->zv_objset, - ZVOL_OBJ, start, length); - } - - rangelock_exit(lr); - - if (error != 0) - break; - } - - /* - * If the write-cache is disabled, 'sync' property - * is set to 'always', or if the caller is asking for - * a synchronous free, commit this operation to the zil. - * This will sync any previous uncommitted writes to the - * zvol object. - * Can be overridden by the zvol_unmap_sync_enabled tunable. 
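Written as a single stand-alone predicate, the rule reads as follows (a sketch with abbreviated names; the real check immediately below also requires that the frees themselves succeeded and operates on zv_flags, os_sync and dfl_flags):

#include <stdbool.h>

/*
 * Sketch: commit the free to the ZIL when unmap syncing is enabled and
 * either the write cache is off, the dataset always syncs, or the
 * caller asked to wait for the free to be durable.
 */
static bool
should_commit_free_ex(bool unmap_sync_enabled, bool write_cache_enabled,
    bool sync_always, bool caller_wants_sync)
{
	return (unmap_sync_enabled &&
	    (!write_cache_enabled || sync_always || caller_wants_sync));
}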
- */ - if ((error == 0) && zvol_unmap_sync_enabled && - (!(zv->zv_flags & ZVOL_WCE) || - (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) || - (dfl->dfl_flags & DF_WAIT_SYNC))) { - zil_commit(zv->zv_zilog, ZVOL_OBJ); - } - - if (!(flag & FKIOCTL)) - dfl_free(dfl); - - return (error); - } - - default: - error = SET_ERROR(ENOTTY); - break; - - } - mutex_exit(&zfsdev_state_lock); - return (error); -} -#endif /* illumos */ - -int -zvol_busy(void) -{ - return (zvol_minors != 0); -} - -void -zvol_init(void) -{ - VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t), - 1) == 0); -#ifdef illumos - mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); -#else - ZFS_LOG(1, "ZVOL Initialized."); -#endif -} - -void -zvol_fini(void) -{ -#ifdef illumos - mutex_destroy(&zfsdev_state_lock); -#endif - ddi_soft_state_fini(&zfsdev_state); - ZFS_LOG(1, "ZVOL Deinitialized."); -} - -#ifdef illumos -/*ARGSUSED*/ -static int -zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP)) - return (1); - return (0); -} - -/*ARGSUSED*/ -static void -zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - - spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx); -} - -static int -zvol_dump_init(zvol_state_t *zv, boolean_t resize) -{ - dmu_tx_t *tx; - int error; - objset_t *os = zv->zv_objset; - spa_t *spa = dmu_objset_spa(os); - vdev_t *vd = spa->spa_root_vdev; - nvlist_t *nv = NULL; - uint64_t version = spa_version(spa); - uint64_t checksum, compress, refresrv, vbs, dedup; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - ASSERT(vd->vdev_ops == &vdev_root_ops); - - error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0, - DMU_OBJECT_END); - if (error != 0) - return (error); - /* wait for dmu_free_long_range to actually free the blocks */ - txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); - - /* - * If the pool on which the dump device is being initialized has more - * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is - * enabled. If so, bump that feature's counter to indicate that the - * feature is active. We also check the vdev type to handle the - * following case: - * # zpool create test raidz disk1 disk2 disk3 - * Now have spa_root_vdev->vdev_children == 1 (the raidz vdev), - * the raidz vdev itself has 3 children. 
- */ - if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) { - if (!spa_feature_is_enabled(spa, - SPA_FEATURE_MULTI_VDEV_CRASH_DUMP)) - return (SET_ERROR(ENOTSUP)); - (void) dsl_sync_task(spa_name(spa), - zfs_mvdev_dump_feature_check, - zfs_mvdev_dump_activate_feature_sync, NULL, - 2, ZFS_SPACE_CHECK_RESERVED); - } - - if (!resize) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); - if (error == 0) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, - NULL); - } - if (error == 0) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), - &refresrv, NULL); - } - if (error == 0) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, - NULL); - } - if (version >= SPA_VERSION_DEDUP && error == 0) { - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL); - } - } - if (error != 0) - return (error); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - dmu_tx_abort(tx); - return (error); - } - - /* - * If we are resizing the dump device then we only need to - * update the refreservation to match the newly updated - * zvolsize. Otherwise, we save off the original state of the - * zvol so that we can restore them if the zvol is ever undumpified. - */ - if (resize) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, - &zv->zv_volsize, tx); - } else { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, - &compress, tx); - if (error == 0) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, - &checksum, tx); - } - if (error == 0) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, - &refresrv, tx); - } - if (error == 0) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, - &vbs, tx); - } - if (error == 0) { - error = dmu_object_set_blocksize( - os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx); - } - if (version >= SPA_VERSION_DEDUP && error == 0) { - error = zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, - &dedup, tx); - } - if (error == 0) - zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE; - } - dmu_tx_commit(tx); - - /* - * We only need update the zvol's property if we are initializing - * the dump area for the first time. - */ - if (error == 0 && !resize) { - /* - * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum - * function. Otherwise, use the old default -- OFF. - */ - checksum = spa_feature_is_active(spa, - SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? 
ZIO_CHECKSUM_NOPARITY : - ZIO_CHECKSUM_OFF; - - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), - ZIO_COMPRESS_OFF) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), - checksum) == 0); - if (version >= SPA_VERSION_DEDUP) { - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_DEDUP), - ZIO_CHECKSUM_OFF) == 0); - } - - error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, - nv, NULL); - nvlist_free(nv); - } - - /* Allocate the space for the dump */ - if (error == 0) - error = zvol_prealloc(zv); - return (error); -} - -static int -zvol_dumpify(zvol_state_t *zv) -{ - int error = 0; - uint64_t dumpsize = 0; - dmu_tx_t *tx; - objset_t *os = zv->zv_objset; - - if (zv->zv_flags & ZVOL_RDONLY) - return (SET_ERROR(EROFS)); - - if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, - 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { - boolean_t resize = (dumpsize > 0); - - if ((error = zvol_dump_init(zv, resize)) != 0) { - (void) zvol_dump_fini(zv); - return (error); - } - } - - /* - * Build up our lba mapping. - */ - error = zvol_get_lbas(zv); - if (error) { - (void) zvol_dump_fini(zv); - return (error); - } - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - (void) zvol_dump_fini(zv); - return (error); - } - - zv->zv_flags |= ZVOL_DUMPIFIED; - error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, - &zv->zv_volsize, tx); - dmu_tx_commit(tx); - - if (error) { - (void) zvol_dump_fini(zv); - return (error); - } - - txg_wait_synced(dmu_objset_pool(os), 0); - return (0); -} - -static int -zvol_dump_fini(zvol_state_t *zv) -{ - dmu_tx_t *tx; - objset_t *os = zv->zv_objset; - nvlist_t *nv; - int error = 0; - uint64_t checksum, compress, refresrv, vbs, dedup; - uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset)); - - /* - * Attempt to restore the zvol back to its pre-dumpified state. - * This is a best-effort attempt as it's possible that not all - * of these properties were initialized during the dumpify process - * (i.e. error during zvol_dump_init). 
- */ - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); - dmu_tx_commit(tx); - - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); - (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs); - - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); - if (version >= SPA_VERSION_DEDUP && - zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) { - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_DEDUP), dedup); - } - (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, - nv, NULL); - nvlist_free(nv); - - zvol_free_extents(zv); - zv->zv_flags &= ~ZVOL_DUMPIFIED; - (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); - /* wait for dmu_free_long_range to actually free the blocks */ - txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0) - zv->zv_volblocksize = vbs; - dmu_tx_commit(tx); - - return (0); -} -#else /* !illumos */ - -static void -zvol_geom_run(zvol_state_t *zv) -{ - struct g_provider *pp; - - pp = zv->zv_provider; - g_error_provider(pp, 0); - - kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, - "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER)); -} - -static void -zvol_geom_destroy(zvol_state_t *zv) -{ - struct g_provider *pp; - - g_topology_assert(); - - mtx_lock(&zv->zv_queue_mtx); - zv->zv_state = 1; - wakeup_one(&zv->zv_queue); - while (zv->zv_state != 2) - msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0); - mtx_destroy(&zv->zv_queue_mtx); - - pp = zv->zv_provider; - zv->zv_provider = NULL; - pp->private = NULL; - g_wither_geom(pp->geom, ENXIO); -} - -static int -zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) -{ - int count, error, flags; - - g_topology_assert(); - - /* - * To make it easier we expect either open or close, but not both - * at the same time. - */ - KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || - (acr <= 0 && acw <= 0 && ace <= 0), - ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", - pp->name, acr, acw, ace)); - - if (pp->private == NULL) { - if (acr <= 0 && acw <= 0 && ace <= 0) - return (0); - return (pp->error); - } - - /* - * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0, - * because GEOM already handles that and handles it a bit differently. - * GEOM allows for multiple read/exclusive consumers and ZFS allows - * only one exclusive consumer, no matter if it is reader or writer. - * I like better the way GEOM works so I'll leave it for GEOM to - * decide what to do. 
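Folding the GEOM access deltas into open/close calls, as the code below does, can be summarized in a few lines (a sketch; the flag constants and function pointers are illustrative stand-ins for FREAD/FWRITE and zvol_open()/zvol_close()):

#include <stddef.h>

#define FREAD_EX	0x0001		/* stand-ins for FREAD/FWRITE */
#define FWRITE_EX	0x0002

/*
 * GEOM hands us deltas for the read, write and exclusive access counts.
 * A positive sum opens the provider that many times, a negative sum
 * closes it; read or exclusive access implies FREAD, write implies FWRITE.
 */
static int
geom_access_to_open_close_ex(int acr, int acw, int ace,
    int (*open_fn)(int flags, int count), int (*close_fn)(int flags, int count))
{
	int count = acr + acw + ace;
	int flags = 0;

	if (count == 0)
		return (0);
	if (acr != 0 || ace != 0)
		flags |= FREAD_EX;
	if (acw != 0)
		flags |= FWRITE_EX;
	return (count > 0 ? open_fn(flags, count) : close_fn(flags, -count));
}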
- */ - - count = acr + acw + ace; - if (count == 0) - return (0); - - flags = 0; - if (acr != 0 || ace != 0) - flags |= FREAD; - if (acw != 0) - flags |= FWRITE; - - g_topology_unlock(); - if (count > 0) - error = zvol_open(pp, flags, count); - else - error = zvol_close(pp, flags, -count); - g_topology_lock(); - return (error); -} - -static void -zvol_geom_start(struct bio *bp) -{ - zvol_state_t *zv; - boolean_t first; - - zv = bp->bio_to->private; - ASSERT(zv != NULL); - switch (bp->bio_cmd) { - case BIO_FLUSH: - if (!THREAD_CAN_SLEEP()) - goto enqueue; - zil_commit(zv->zv_zilog, ZVOL_OBJ); - g_io_deliver(bp, 0); - break; - case BIO_READ: - case BIO_WRITE: - case BIO_DELETE: - if (!THREAD_CAN_SLEEP()) - goto enqueue; - zvol_strategy(bp); - break; - case BIO_GETATTR: { - spa_t *spa = dmu_objset_spa(zv->zv_objset); - uint64_t refd, avail, usedobjs, availobjs, val; - - if (g_handleattr_int(bp, "GEOM::candelete", 1)) - return; - if (strcmp(bp->bio_attribute, "blocksavail") == 0) { - dmu_objset_space(zv->zv_objset, &refd, &avail, - &usedobjs, &availobjs); - if (g_handleattr_off_t(bp, "blocksavail", - avail / DEV_BSIZE)) - return; - } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { - dmu_objset_space(zv->zv_objset, &refd, &avail, - &usedobjs, &availobjs); - if (g_handleattr_off_t(bp, "blocksused", - refd / DEV_BSIZE)) - return; - } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { - avail = metaslab_class_get_space(spa_normal_class(spa)); - avail -= metaslab_class_get_alloc(spa_normal_class(spa)); - if (g_handleattr_off_t(bp, "poolblocksavail", - avail / DEV_BSIZE)) - return; - } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { - refd = metaslab_class_get_alloc(spa_normal_class(spa)); - if (g_handleattr_off_t(bp, "poolblocksused", - refd / DEV_BSIZE)) - return; - } - /* FALLTHROUGH */ - } - default: - g_io_deliver(bp, EOPNOTSUPP); - break; - } - return; - -enqueue: - mtx_lock(&zv->zv_queue_mtx); - first = (bioq_first(&zv->zv_queue) == NULL); - bioq_insert_tail(&zv->zv_queue, bp); - mtx_unlock(&zv->zv_queue_mtx); - if (first) - wakeup_one(&zv->zv_queue); -} - -static void -zvol_geom_worker(void *arg) -{ - zvol_state_t *zv; - struct bio *bp; - - thread_lock(curthread); - sched_prio(curthread, PRIBIO); - thread_unlock(curthread); - - zv = arg; - for (;;) { - mtx_lock(&zv->zv_queue_mtx); - bp = bioq_takefirst(&zv->zv_queue); - if (bp == NULL) { - if (zv->zv_state == 1) { - zv->zv_state = 2; - wakeup(&zv->zv_state); - mtx_unlock(&zv->zv_queue_mtx); - kthread_exit(); - } - msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP, - "zvol:io", 0); - continue; - } - mtx_unlock(&zv->zv_queue_mtx); - switch (bp->bio_cmd) { - case BIO_FLUSH: - zil_commit(zv->zv_zilog, ZVOL_OBJ); - g_io_deliver(bp, 0); - break; - case BIO_READ: - case BIO_WRITE: - case BIO_DELETE: - zvol_strategy(bp); - break; - default: - g_io_deliver(bp, EOPNOTSUPP); - break; - } - } -} - -extern boolean_t dataset_name_hidden(const char *name); - -static int -zvol_create_snapshots(objset_t *os, const char *name) -{ - uint64_t cookie, obj; - char *sname; - int error, len; - - cookie = obj = 0; - sname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - -#if 0 - (void) dmu_objset_find(name, dmu_objset_prefetch, NULL, - DS_FIND_SNAPSHOTS); -#endif - - for (;;) { - len = snprintf(sname, MAXPATHLEN, "%s@", name); - if (len >= MAXPATHLEN) { - dmu_objset_rele(os, FTAG); - error = ENAMETOOLONG; - break; - } - - dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - error = dmu_snapshot_list_next(os, MAXPATHLEN - len, - sname + 
len, &obj, &cookie, NULL); - dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (error != 0) { - if (error == ENOENT) - error = 0; - break; - } - - error = zvol_create_minor(sname); - if (error != 0 && error != EEXIST) { - printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", - sname, error); - break; - } - } - - kmem_free(sname, MAXPATHLEN); - return (error); -} - -int -zvol_create_minors_impl(const char *name) -{ - uint64_t cookie; - objset_t *os; - char *osname, *p; - int error, len; - - if (dataset_name_hidden(name)) - return (0); - - if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { - printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", - name, error); - return (error); - } - if (dmu_objset_type(os) == DMU_OST_ZVOL) { - dsl_dataset_long_hold(os->os_dsl_dataset, FTAG); - dsl_pool_rele(dmu_objset_pool(os), FTAG); - error = zvol_create_minor(name); - if (error == 0 || error == EEXIST) { - error = zvol_create_snapshots(os, name); - } else { - printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", - name, error); - } - dsl_dataset_long_rele(os->os_dsl_dataset, FTAG); - dsl_dataset_rele(os->os_dsl_dataset, FTAG); - return (error); - } - if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); - return (0); - } - - osname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) { - dmu_objset_rele(os, FTAG); - kmem_free(osname, MAXPATHLEN); - return (ENOENT); - } - p = osname + strlen(osname); - len = MAXPATHLEN - (p - osname); - -#if 0 - /* Prefetch the datasets. */ - cookie = 0; - while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { - if (!dataset_name_hidden(osname)) - (void) dmu_objset_prefetch(osname, NULL); - } -#endif - - cookie = 0; - while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL, - &cookie) == 0) { - dmu_objset_rele(os, FTAG); - (void)zvol_create_minors_impl(osname); - if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { - printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n", - name, error); - return (error); - } - } - - dmu_objset_rele(os, FTAG); - kmem_free(osname, MAXPATHLEN); - return (0); -} - -static void -zvol_rename_minor(zvol_state_t *zv, const char *newname) -{ - struct g_geom *gp; - struct g_provider *pp; - struct cdev *dev; - - ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - g_topology_lock(); - pp = zv->zv_provider; - ASSERT(pp != NULL); - gp = pp->geom; - ASSERT(gp != NULL); - - zv->zv_provider = NULL; - g_wither_provider(pp, ENXIO); - - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); - pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; - pp->sectorsize = DEV_BSIZE; - pp->mediasize = zv->zv_volsize; - pp->private = zv; - zv->zv_provider = pp; - g_error_provider(pp, 0); - g_topology_unlock(); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct make_dev_args args; - - if ((dev = zv->zv_dev) != NULL) { - zv->zv_dev = NULL; - destroy_dev(dev); - if (zv->zv_total_opens > 0) { - zv->zv_flags &= ~ZVOL_EXCL; - zv->zv_total_opens = 0; - zvol_last_close(zv); - } - } - - make_dev_args_init(&args); - args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; - args.mda_devsw = &zvol_cdevsw; - args.mda_cr = NULL; - args.mda_uid = UID_ROOT; - args.mda_gid = GID_OPERATOR; - args.mda_mode = 0640; - args.mda_si_drv2 = zv; - if (make_dev_s(&args, &zv->zv_dev, - "%s/%s", ZVOL_DRIVER, newname) == 0) - zv->zv_dev->si_iosize_max = MAXPHYS; - } - strlcpy(zv->zv_name, newname, sizeof(zv->zv_name)); -} - -void 
-zvol_rename_minors_impl(const char *oldname, const char *newname) -{ - char name[MAXPATHLEN]; - struct g_provider *pp; - struct g_geom *gp; - size_t oldnamelen, newnamelen; - zvol_state_t *zv; - char *namebuf; - boolean_t locked = B_FALSE; - - oldnamelen = strlen(oldname); - newnamelen = strlen(newname); - - /* See comment in zvol_open(). */ - if (!MUTEX_HELD(&zfsdev_state_lock)) { - mutex_enter(&zfsdev_state_lock); - locked = B_TRUE; - } - - LIST_FOREACH(zv, &all_zvols, zv_links) { - if (strcmp(zv->zv_name, oldname) == 0) { - zvol_rename_minor(zv, newname); - } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && - (zv->zv_name[oldnamelen] == '/' || - zv->zv_name[oldnamelen] == '@')) { - snprintf(name, sizeof(name), "%s%c%s", newname, - zv->zv_name[oldnamelen], - zv->zv_name + oldnamelen + 1); - zvol_rename_minor(zv, name); - } - } - - if (locked) - mutex_exit(&zfsdev_state_lock); -} - -static zvol_task_t * -zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2) -{ - zvol_task_t *task; - char *delim; - - task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); - task->op = op; - delim = strchr(name1, '/'); - strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); - - strlcpy(task->name1, name1, MAXNAMELEN); - if (name2 != NULL) - strlcpy(task->name2, name2, MAXNAMELEN); - - return (task); -} - -static void -zvol_task_free(zvol_task_t *task) -{ - kmem_free(task, sizeof (zvol_task_t)); -} - -/* - * The worker thread function performed asynchronously. - */ -static void -zvol_task_cb(void *param) -{ - zvol_task_t *task = (zvol_task_t *)param; - - switch (task->op) { - case ZVOL_ASYNC_CREATE_MINORS: - (void) zvol_create_minors_impl(task->name1); - break; - case ZVOL_ASYNC_REMOVE_MINORS: - zvol_remove_minors_impl(task->name1); - break; - case ZVOL_ASYNC_RENAME_MINORS: - zvol_rename_minors_impl(task->name1, task->name2); - break; - default: - VERIFY(0); - break; - } - - zvol_task_free(task); -} - -static void -zvol_minors_helper(spa_t *spa, zvol_async_op_t op, const char *name1, - const char *name2) -{ - zvol_task_t *task; - - if (dataset_name_hidden(name1)) - return; - if (name2 != NULL && dataset_name_hidden(name2)) - return; - task = zvol_task_alloc(op, name1, name2); - (void)taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); -} - -void -zvol_create_minors(spa_t *spa, const char *name) -{ - zvol_minors_helper(spa, ZVOL_ASYNC_CREATE_MINORS, name, NULL); -} - -void -zvol_remove_minors(spa_t *spa, const char *name) -{ - zvol_minors_helper(spa, ZVOL_ASYNC_REMOVE_MINORS, name, NULL); -} - -void -zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname) -{ - zvol_minors_helper(spa, ZVOL_ASYNC_RENAME_MINORS, oldname, newname); -} - -static int -zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td) -{ - zvol_state_t *zv = dev->si_drv2; - int err = 0; - - mutex_enter(&zfsdev_state_lock); - if (zv->zv_total_opens == 0) - err = zvol_first_open(zv); - if (err) { - mutex_exit(&zfsdev_state_lock); - return (err); - } - if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - err = SET_ERROR(EROFS); - goto out; - } - if (zv->zv_flags & ZVOL_EXCL) { - err = SET_ERROR(EBUSY); - goto out; - } -#ifdef FEXCL - if (flags & FEXCL) { - if (zv->zv_total_opens != 0) { - err = SET_ERROR(EBUSY); - goto out; - } - zv->zv_flags |= ZVOL_EXCL; - } -#endif - - zv->zv_total_opens++; - if (flags & (FSYNC | FDSYNC)) { - zv->zv_sync_cnt++; - if (zv->zv_sync_cnt == 1) - zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); - } - 
mutex_exit(&zfsdev_state_lock); - return (err); -out: - if (zv->zv_total_opens == 0) - zvol_last_close(zv); - mutex_exit(&zfsdev_state_lock); - return (err); -} - -static int -zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td) -{ - zvol_state_t *zv = dev->si_drv2; - - mutex_enter(&zfsdev_state_lock); - if (zv->zv_flags & ZVOL_EXCL) { - ASSERT(zv->zv_total_opens == 1); - zv->zv_flags &= ~ZVOL_EXCL; - } - - /* - * If the open count is zero, this is a spurious close. - * That indicates a bug in the kernel / DDI framework. - */ - ASSERT(zv->zv_total_opens != 0); - - /* - * You may get multiple opens, but only one close. - */ - zv->zv_total_opens--; - if (flags & (FSYNC | FDSYNC)) - zv->zv_sync_cnt--; - - if (zv->zv_total_opens == 0) - zvol_last_close(zv); - - mutex_exit(&zfsdev_state_lock); - return (0); -} - -static int -zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) -{ - zvol_state_t *zv; - locked_range_t *lr; - off_t offset, length; - int i, error; - boolean_t sync; - - zv = dev->si_drv2; - - error = 0; - KASSERT(zv->zv_total_opens > 0, - ("Device with zero access count in zvol_d_ioctl")); - - i = IOCPARM_LEN(cmd); - switch (cmd) { - case DIOCGSECTORSIZE: - *(u_int *)data = DEV_BSIZE; - break; - case DIOCGMEDIASIZE: - *(off_t *)data = zv->zv_volsize; - break; - case DIOCGFLUSH: - zil_commit(zv->zv_zilog, ZVOL_OBJ); - break; - case DIOCGDELETE: - if (!zvol_unmap_enabled) - break; - - offset = ((off_t *)data)[0]; - length = ((off_t *)data)[1]; - if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || - offset < 0 || offset >= zv->zv_volsize || - length <= 0) { - printf("%s: offset=%jd length=%jd\n", __func__, offset, - length); - error = EINVAL; - break; - } - - lr = rangelock_enter(&zv->zv_rangelock, offset, length, - RL_WRITER); - dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error != 0) { - sync = FALSE; - dmu_tx_abort(tx); - } else { - sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); - zvol_log_truncate(zv, tx, offset, length, sync); - dmu_tx_commit(tx); - error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, - offset, length); - } - rangelock_exit(lr); - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - break; - case DIOCGSTRIPESIZE: - *(off_t *)data = zv->zv_volblocksize; - break; - case DIOCGSTRIPEOFFSET: - *(off_t *)data = 0; - break; - case DIOCGATTR: { - spa_t *spa = dmu_objset_spa(zv->zv_objset); - struct diocgattr_arg *arg = (struct diocgattr_arg *)data; - uint64_t refd, avail, usedobjs, availobjs; - - if (strcmp(arg->name, "GEOM::candelete") == 0) - arg->value.i = 1; - else if (strcmp(arg->name, "blocksavail") == 0) { - dmu_objset_space(zv->zv_objset, &refd, &avail, - &usedobjs, &availobjs); - arg->value.off = avail / DEV_BSIZE; - } else if (strcmp(arg->name, "blocksused") == 0) { - dmu_objset_space(zv->zv_objset, &refd, &avail, - &usedobjs, &availobjs); - arg->value.off = refd / DEV_BSIZE; - } else if (strcmp(arg->name, "poolblocksavail") == 0) { - avail = metaslab_class_get_space(spa_normal_class(spa)); - avail -= metaslab_class_get_alloc(spa_normal_class(spa)); - arg->value.off = avail / DEV_BSIZE; - } else if (strcmp(arg->name, "poolblocksused") == 0) { - refd = metaslab_class_get_alloc(spa_normal_class(spa)); - arg->value.off = refd / DEV_BSIZE; - } else - error = ENOIOCTL; - break; - } - case FIOSEEKHOLE: - case FIOSEEKDATA: { - off_t *off = (off_t *)data; - uint64_t noff; - boolean_t hole; - - hole = (cmd == FIOSEEKHOLE); - noff = *off; - error = 
dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); - *off = noff; - break; - } - default: - error = ENOIOCTL; - } - - return (error); -} -#endif /* illumos */ Index: sys/cddl/contrib/opensolaris/uts/common/os/callb.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/os/callb.c +++ /dev/null @@ -1,438 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for delay() */ -#include /* For TASKQ_NAMELEN */ -#include - -#define CB_MAXNAME TASKQ_NAMELEN - -/* - * The callb mechanism provides generic event scheduling/echoing. - * A callb function is registered and called on behalf of the event. - */ -typedef struct callb { - struct callb *c_next; /* next in class or on freelist */ - kthread_id_t c_thread; /* ptr to caller's thread struct */ - char c_flag; /* info about the callb state */ - uchar_t c_class; /* this callb's class */ - kcondvar_t c_done_cv; /* signal callb completion */ - boolean_t (*c_func)(); /* cb function: returns true if ok */ - void *c_arg; /* arg to c_func */ - char c_name[CB_MAXNAME+1]; /* debug:max func name length */ -} callb_t; - -/* - * callb c_flag bitmap definitions - */ -#define CALLB_FREE 0x0 -#define CALLB_TAKEN 0x1 -#define CALLB_EXECUTING 0x2 - -/* - * Basic structure for a callb table. - * All callbs are organized into different class groups described - * by ct_class array. - * The callbs within a class are single-linked and normally run by a - * serial execution. - */ -typedef struct callb_table { - kmutex_t ct_lock; /* protect all callb states */ - callb_t *ct_freelist; /* free callb structures */ - int ct_busy; /* != 0 prevents additions */ - kcondvar_t ct_busy_cv; /* to wait for not busy */ - int ct_ncallb; /* num of callbs allocated */ - callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */ -} callb_table_t; - -int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; - -static callb_id_t callb_add_common(boolean_t (*)(void *, int), - void *, int, char *, kthread_id_t); - -static callb_table_t callb_table; /* system level callback table */ -static callb_table_t *ct = &callb_table; -static kmutex_t callb_safe_mutex; -callb_cpr_t callb_cprinfo_safe = { - &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, 0, 0 }; - -/* - * Init all callb tables in the system. 
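The mechanism this file implemented is simple at heart: callbacks are kept in per-class chains and executed serially for a class. A compilable user-space model of that idea, reduced to registration and execution (the _ex names are illustrative, locking is omitted, and the real callb_execute_class() returns the name of the callback that failed rather than a boolean):

#include <stdbool.h>
#include <stdlib.h>

#define NCBCLASS_EX	4		/* illustrative; the real table uses NCBCLASS */

struct callb_ex {
	struct callb_ex	*next;
	bool		(*func)(void *arg, int code);
	void		*arg;
};

static struct callb_ex *cb_class_head_ex[NCBCLASS_EX];

/* Register func() to be called for events of the given class. */
static struct callb_ex *
callb_add_ex(bool (*func)(void *, int), void *arg, int class)
{
	struct callb_ex *cb = calloc(1, sizeof (*cb));

	if (cb == NULL)
		return (NULL);
	cb->func = func;
	cb->arg = arg;
	cb->next = cb_class_head_ex[class];
	cb_class_head_ex[class] = cb;
	return (cb);
}

/* Run every callback of a class serially; false if any of them refused. */
static bool
callb_execute_class_ex(int class, int code)
{
	bool ok = true;

	for (struct callb_ex *cb = cb_class_head_ex[class]; cb != NULL;
	    cb = cb->next)
		ok &= cb->func(cb->arg, code);
	return (ok);
}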
- */ -void -callb_init(void *dummy __unused) -{ - callb_table.ct_busy = 0; /* mark table open for additions */ - mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL); -} - -void -callb_fini(void *dummy __unused) -{ - callb_t *cp; - int i; - - mutex_enter(&ct->ct_lock); - for (i = 0; i < 16; i++) { - while ((cp = ct->ct_freelist) != NULL) { - ct->ct_freelist = cp->c_next; - ct->ct_ncallb--; - kmem_free(cp, sizeof (callb_t)); - } - if (ct->ct_ncallb == 0) - break; - /* Not all callbacks finished, waiting for the rest. */ - mutex_exit(&ct->ct_lock); - tsleep(ct, 0, "callb", hz / 4); - mutex_enter(&ct->ct_lock); - } - if (ct->ct_ncallb > 0) - printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb); - mutex_exit(&ct->ct_lock); - mutex_destroy(&callb_safe_mutex); - mutex_destroy(&callb_table.ct_lock); -} - -/* - * callout_add() is called to register func() be called later. - */ -static callb_id_t -callb_add_common(boolean_t (*func)(void *arg, int code), - void *arg, int class, char *name, kthread_id_t t) -{ - callb_t *cp; - - ASSERT(class < NCBCLASS); - - mutex_enter(&ct->ct_lock); - while (ct->ct_busy) - cv_wait(&ct->ct_busy_cv, &ct->ct_lock); - if ((cp = ct->ct_freelist) == NULL) { - ct->ct_ncallb++; - cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP); - } - ct->ct_freelist = cp->c_next; - cp->c_thread = t; - cp->c_func = func; - cp->c_arg = arg; - cp->c_class = (uchar_t)class; - cp->c_flag |= CALLB_TAKEN; -#ifdef DEBUG - if (strlen(name) > CB_MAXNAME) - cmn_err(CE_WARN, "callb_add: name of callback function '%s' " - "too long -- truncated to %d chars", - name, CB_MAXNAME); -#endif - (void) strncpy(cp->c_name, name, CB_MAXNAME); - cp->c_name[CB_MAXNAME] = '\0'; - - /* - * Insert the new callb at the head of its class list. - */ - cp->c_next = ct->ct_first_cb[class]; - ct->ct_first_cb[class] = cp; - - mutex_exit(&ct->ct_lock); - return ((callb_id_t)cp); -} - -/* - * The default function to add an entry to the callback table. Since - * it uses curthread as the thread identifier to store in the table, - * it should be used for the normal case of a thread which is calling - * to add ITSELF to the table. - */ -callb_id_t -callb_add(boolean_t (*func)(void *arg, int code), - void *arg, int class, char *name) -{ - return (callb_add_common(func, arg, class, name, curthread)); -} - -/* - * A special version of callb_add() above for use by threads which - * might be adding an entry to the table on behalf of some other - * thread (for example, one which is constructed but not yet running). - * In this version the thread id is an argument. - */ -callb_id_t -callb_add_thread(boolean_t (*func)(void *arg, int code), - void *arg, int class, char *name, kthread_id_t t) -{ - return (callb_add_common(func, arg, class, name, t)); -} - -/* - * callout_delete() is called to remove an entry identified by id - * that was originally placed there by a call to callout_add(). - * return -1 if fail to delete a callb entry otherwise return 0. 
- */ -int -callb_delete(callb_id_t id) -{ - callb_t **pp; - callb_t *me = (callb_t *)id; - - mutex_enter(&ct->ct_lock); - - for (;;) { - pp = &ct->ct_first_cb[me->c_class]; - while (*pp != NULL && *pp != me) - pp = &(*pp)->c_next; - -#ifdef DEBUG - if (*pp != me) { - cmn_err(CE_WARN, "callb delete bogus entry 0x%p", - (void *)me); - mutex_exit(&ct->ct_lock); - return (-1); - } -#endif /* DEBUG */ - - /* - * It is not allowed to delete a callb in the middle of - * executing otherwise, the callb_execute() will be confused. - */ - if (!(me->c_flag & CALLB_EXECUTING)) - break; - - cv_wait(&me->c_done_cv, &ct->ct_lock); - } - /* relink the class list */ - *pp = me->c_next; - - /* clean up myself and return the free callb to the head of freelist */ - me->c_flag = CALLB_FREE; - me->c_next = ct->ct_freelist; - ct->ct_freelist = me; - - mutex_exit(&ct->ct_lock); - return (0); -} - -/* - * class: indicates to execute all callbs in the same class; - * code: optional argument for the callb functions. - * return: = 0: success - * != 0: ptr to string supplied when callback was registered - */ -void * -callb_execute_class(int class, int code) -{ - callb_t *cp; - void *ret = NULL; - - ASSERT(class < NCBCLASS); - - mutex_enter(&ct->ct_lock); - - for (cp = ct->ct_first_cb[class]; - cp != NULL && ret == 0; cp = cp->c_next) { - while (cp->c_flag & CALLB_EXECUTING) - cv_wait(&cp->c_done_cv, &ct->ct_lock); - /* - * cont if the callb is deleted while we're sleeping - */ - if (cp->c_flag == CALLB_FREE) - continue; - cp->c_flag |= CALLB_EXECUTING; - -#ifdef CALLB_DEBUG - printf("callb_execute: name=%s func=%p arg=%p\n", - cp->c_name, (void *)cp->c_func, (void *)cp->c_arg); -#endif /* CALLB_DEBUG */ - - mutex_exit(&ct->ct_lock); - /* If callback function fails, pass back client's name */ - if (!(*cp->c_func)(cp->c_arg, code)) - ret = cp->c_name; - mutex_enter(&ct->ct_lock); - - cp->c_flag &= ~CALLB_EXECUTING; - cv_broadcast(&cp->c_done_cv); - } - mutex_exit(&ct->ct_lock); - return (ret); -} - -/* - * callers make sure no recursive entries to this func. - * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure. - * - * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we - * use a cv_timedwait() in case the kernel thread is blocked. - * - * Note that this is a generic callback handler for daemon CPR and - * should NOT be changed to accommodate any specific requirement in a daemon. - * Individual daemons that require changes to the handler shall write - * callback routines in their own daemon modules. - */ -boolean_t -callb_generic_cpr(void *arg, int code) -{ - callb_cpr_t *cp = (callb_cpr_t *)arg; - clock_t ret = 0; /* assume success */ - - mutex_enter(cp->cc_lockp); - - switch (code) { - case CB_CODE_CPR_CHKPT: - cp->cc_events |= CALLB_CPR_START; -#ifdef CPR_NOT_THREAD_SAFE - while (!(cp->cc_events & CALLB_CPR_SAFE)) - /* cv_timedwait() returns -1 if it times out. */ - if ((ret = cv_reltimedwait(&cp->cc_callb_cv, - cp->cc_lockp, (callb_timeout_sec * hz), - TR_CLOCK_TICK)) == -1) - break; -#endif - break; - - case CB_CODE_CPR_RESUME: - cp->cc_events &= ~CALLB_CPR_START; - cv_signal(&cp->cc_stop_cv); - break; - } - mutex_exit(cp->cc_lockp); - return (ret != -1); -} - -/* - * The generic callback function associated with kernel threads which - * are always considered safe. - */ -/* ARGSUSED */ -boolean_t -callb_generic_cpr_safe(void *arg, int code) -{ - return (B_TRUE); -} -/* - * Prevent additions to callback table. 
- */ -void -callb_lock_table(void) -{ - mutex_enter(&ct->ct_lock); - ASSERT(ct->ct_busy == 0); - ct->ct_busy = 1; - mutex_exit(&ct->ct_lock); -} - -/* - * Allow additions to callback table. - */ -void -callb_unlock_table(void) -{ - mutex_enter(&ct->ct_lock); - ASSERT(ct->ct_busy != 0); - ct->ct_busy = 0; - cv_broadcast(&ct->ct_busy_cv); - mutex_exit(&ct->ct_lock); -} - -#ifdef illumos -/* - * Return a boolean value indicating whether a particular kernel thread is - * stopped in accordance with the cpr callback protocol. If returning - * false, also return a pointer to the thread name via the 2nd argument. - */ -boolean_t -callb_is_stopped(kthread_id_t tp, caddr_t *thread_name) -{ - callb_t *cp; - boolean_t ret_val; - - mutex_enter(&ct->ct_lock); - - for (cp = ct->ct_first_cb[CB_CL_CPR_DAEMON]; - cp != NULL && tp != cp->c_thread; cp = cp->c_next) - ; - - ret_val = (cp != NULL); - if (ret_val) { - /* - * We found the thread in the callback table and have - * provisionally set the return value to true. Now - * see if it is marked "safe" and is sleeping or stopped. - */ - callb_cpr_t *ccp = (callb_cpr_t *)cp->c_arg; - - *thread_name = cp->c_name; /* in case not stopped */ - mutex_enter(ccp->cc_lockp); - - if (ccp->cc_events & CALLB_CPR_SAFE) { - int retry; - - mutex_exit(ccp->cc_lockp); - for (retry = 0; retry < CALLB_MAX_RETRY; retry++) { - thread_lock(tp); - if (tp->t_state & (TS_SLEEP | TS_STOPPED)) { - thread_unlock(tp); - break; - } - thread_unlock(tp); - delay(CALLB_THREAD_DELAY); - } - ret_val = retry < CALLB_MAX_RETRY; - } else { - ret_val = - (ccp->cc_events & CALLB_CPR_ALWAYS_SAFE) != 0; - mutex_exit(ccp->cc_lockp); - } - } else { - /* - * Thread not found in callback table. Make the best - * attempt to identify the thread in the error message. - */ - ulong_t offset; - char *sym = kobj_getsymname((uintptr_t)tp->t_startpc, - &offset); - - *thread_name = sym ? sym : "*unknown*"; - } - - mutex_exit(&ct->ct_lock); - return (ret_val); -} -#endif /* illumos */ - -SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL); -SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL); Index: sys/cddl/contrib/opensolaris/uts/common/os/fm.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/os/fm.c +++ /dev/null @@ -1,1399 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -/* - * Fault Management Architecture (FMA) Resource and Protocol Support - * - * The routines contained herein provide services to support kernel subsystems - * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089). 
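A protocol element of the kind described above can also be assembled from userland with libnvpair. The following minimal sketch is for orientation only; the member names and values ("class", "ena", "detector", the demo class string and ENA constant) are illustrative assumptions and are not taken from this patch or from fm.c.

    #include <stdio.h>
    #include <libnvpair.h>

    int
    main(void)
    {
            nvlist_t *ereport, *detector;

            /* NV_UNIQUE_NAME keeps member names unique within the list. */
            if (nvlist_alloc(&ereport, NV_UNIQUE_NAME, 0) != 0 ||
                nvlist_alloc(&detector, NV_UNIQUE_NAME, 0) != 0)
                    return (1);

            /* Hypothetical event class, ENA and detector, for illustration. */
            (void) nvlist_add_string(ereport, "class", "ereport.example.demo");
            (void) nvlist_add_uint64(ereport, "ena", 0x123456789abcdefULL);
            (void) nvlist_add_string(detector, "scheme", "dev");
            (void) nvlist_add_nvlist(ereport, "detector", detector);

            /* Dump the assembled name-value pair list to stdout. */
            nvlist_print(stdout, ereport);

            nvlist_free(detector);
            nvlist_free(ereport);
            return (0);
    }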
- * - * Name-Value Pair Lists - * - * The embodiment of an FMA protocol element (event, fmri or authority) is a - * name-value pair list (nvlist_t). FMA-specific nvlist construtor and - * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used - * to create an nvpair list using custom allocators. Callers may choose to - * allocate either from the kernel memory allocator, or from a preallocated - * buffer, useful in constrained contexts like high-level interrupt routines. - * - * Protocol Event and FMRI Construction - * - * Convenience routines are provided to construct nvlist events according to - * the FMA Event Protocol and Naming Schema specification for ereports and - * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes. - * - * ENA Manipulation - * - * Routines to generate ENA formats 0, 1 and 2 are available as well as - * routines to increment formats 1 and 2. Individual fields within the - * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(), - * fm_ena_format_get() and fm_ena_gen_get(). - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * URL and SUNW-MSG-ID value to display for fm_panic(), defined below. These - * values must be kept in sync with the FMA source code in usr/src/cmd/fm. - */ -static const char *fm_url = "http://www.sun.com/msg"; -static const char *fm_msgid = "SUNOS-8000-0G"; -static char *volatile fm_panicstr = NULL; - -#ifdef illumos -errorq_t *ereport_errorq; -#endif -void *ereport_dumpbuf; -size_t ereport_dumplen; - -static uint_t ereport_chanlen = ERPT_EVCH_MAX; -static evchan_t *ereport_chan = NULL; -static ulong_t ereport_qlen = 0; -static size_t ereport_size = 0; -static int ereport_cols = 80; - -extern void fastreboot_disable_highpil(void); - -/* - * Common fault management kstats to record ereport generation - * failures - */ - -struct erpt_kstat { - kstat_named_t erpt_dropped; /* num erpts dropped on post */ - kstat_named_t erpt_set_failed; /* num erpt set failures */ - kstat_named_t fmri_set_failed; /* num fmri set failures */ - kstat_named_t payload_set_failed; /* num payload set failures */ -}; - -static struct erpt_kstat erpt_kstat_data = { - { "erpt-dropped", KSTAT_DATA_UINT64 }, - { "erpt-set-failed", KSTAT_DATA_UINT64 }, - { "fmri-set-failed", KSTAT_DATA_UINT64 }, - { "payload-set-failed", KSTAT_DATA_UINT64 } -}; - -#ifdef illumos -/*ARGSUSED*/ -static void -fm_drain(void *private, void *data, errorq_elem_t *eep) -{ - nvlist_t *nvl = errorq_elem_nvl(ereport_errorq, eep); - - if (!KERNEL_PANICKED()) - (void) fm_ereport_post(nvl, EVCH_TRYHARD); - else - fm_nvprint(nvl); -} -#endif - -void -fm_init(void) -{ - kstat_t *ksp; - -#ifdef illumos - (void) sysevent_evc_bind(FM_ERROR_CHAN, - &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND); - - (void) sysevent_evc_control(ereport_chan, - EVCH_SET_CHAN_LEN, &ereport_chanlen); -#endif - - if (ereport_qlen == 0) - ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4); - - if (ereport_size == 0) - ereport_size = ERPT_DATA_SZ; - -#ifdef illumos - ereport_errorq = errorq_nvcreate("fm_ereport_queue", - (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size, - FM_ERR_PIL, ERRORQ_VITAL); - if (ereport_errorq == NULL) - panic("failed to create required ereport error queue"); -#endif - - ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP); - ereport_dumplen = ereport_size; - - /* Initialize ereport allocation and generation kstats */ - ksp 
= kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED, - sizeof (struct erpt_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (ksp != NULL) { - ksp->ks_data = &erpt_kstat_data; - kstat_install(ksp); - } else { - cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); - - } -} - -#ifdef illumos -/* - * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of - * output so they aren't split across console lines, and return the end column. - */ -/*PRINTFLIKE4*/ -static int -fm_printf(int depth, int c, int cols, const char *format, ...) -{ - va_list ap; - int width; - char c1; - - va_start(ap, format); - width = vsnprintf(&c1, sizeof (c1), format, ap); - va_end(ap); - - if (c + width >= cols) { - console_printf("\n\r"); - c = 0; - if (format[0] != ' ' && depth > 0) { - console_printf(" "); - c++; - } - } - - va_start(ap, format); - console_vprintf(format, ap); - va_end(ap); - - return ((c + width) % cols); -} - -/* - * Recursively print a nvlist in the specified column width and return the - * column we end up in. This function is called recursively by fm_nvprint(), - * below. We generically format the entire nvpair using hexadecimal - * integers and strings, and elide any integer arrays. Arrays are basically - * used for cache dumps right now, so we suppress them so as not to overwhelm - * the amount of console output we produce at panic time. This can be further - * enhanced as FMA technology grows based upon the needs of consumers. All - * FMA telemetry is logged using the dump device transport, so the console - * output serves only as a fallback in case this procedure is unsuccessful. - */ -static int -fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) -{ - nvpair_t *nvp; - - for (nvp = nvlist_next_nvpair(nvl, NULL); - nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { - - data_type_t type = nvpair_type(nvp); - const char *name = nvpair_name(nvp); - - boolean_t b; - uint8_t i8; - uint16_t i16; - uint32_t i32; - uint64_t i64; - char *str; - nvlist_t *cnv; - - if (strcmp(name, FM_CLASS) == 0) - continue; /* already printed by caller */ - - c = fm_printf(d, c, cols, " %s=", name); - - switch (type) { - case DATA_TYPE_BOOLEAN: - c = fm_printf(d + 1, c, cols, " 1"); - break; - - case DATA_TYPE_BOOLEAN_VALUE: - (void) nvpair_value_boolean_value(nvp, &b); - c = fm_printf(d + 1, c, cols, b ? 
"1" : "0"); - break; - - case DATA_TYPE_BYTE: - (void) nvpair_value_byte(nvp, &i8); - c = fm_printf(d + 1, c, cols, "%x", i8); - break; - - case DATA_TYPE_INT8: - (void) nvpair_value_int8(nvp, (void *)&i8); - c = fm_printf(d + 1, c, cols, "%x", i8); - break; - - case DATA_TYPE_UINT8: - (void) nvpair_value_uint8(nvp, &i8); - c = fm_printf(d + 1, c, cols, "%x", i8); - break; - - case DATA_TYPE_INT16: - (void) nvpair_value_int16(nvp, (void *)&i16); - c = fm_printf(d + 1, c, cols, "%x", i16); - break; - - case DATA_TYPE_UINT16: - (void) nvpair_value_uint16(nvp, &i16); - c = fm_printf(d + 1, c, cols, "%x", i16); - break; - - case DATA_TYPE_INT32: - (void) nvpair_value_int32(nvp, (void *)&i32); - c = fm_printf(d + 1, c, cols, "%x", i32); - break; - - case DATA_TYPE_UINT32: - (void) nvpair_value_uint32(nvp, &i32); - c = fm_printf(d + 1, c, cols, "%x", i32); - break; - - case DATA_TYPE_INT64: - (void) nvpair_value_int64(nvp, (void *)&i64); - c = fm_printf(d + 1, c, cols, "%llx", - (u_longlong_t)i64); - break; - - case DATA_TYPE_UINT64: - (void) nvpair_value_uint64(nvp, &i64); - c = fm_printf(d + 1, c, cols, "%llx", - (u_longlong_t)i64); - break; - - case DATA_TYPE_HRTIME: - (void) nvpair_value_hrtime(nvp, (void *)&i64); - c = fm_printf(d + 1, c, cols, "%llx", - (u_longlong_t)i64); - break; - - case DATA_TYPE_STRING: - (void) nvpair_value_string(nvp, &str); - c = fm_printf(d + 1, c, cols, "\"%s\"", - str ? str : ""); - break; - - case DATA_TYPE_NVLIST: - c = fm_printf(d + 1, c, cols, "["); - (void) nvpair_value_nvlist(nvp, &cnv); - c = fm_nvprintr(cnv, d + 1, c, cols); - c = fm_printf(d + 1, c, cols, " ]"); - break; - - case DATA_TYPE_NVLIST_ARRAY: { - nvlist_t **val; - uint_t i, nelem; - - c = fm_printf(d + 1, c, cols, "["); - (void) nvpair_value_nvlist_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) { - c = fm_nvprintr(val[i], d + 1, c, cols); - } - c = fm_printf(d + 1, c, cols, " ]"); - } - break; - - case DATA_TYPE_BOOLEAN_ARRAY: - case DATA_TYPE_BYTE_ARRAY: - case DATA_TYPE_INT8_ARRAY: - case DATA_TYPE_UINT8_ARRAY: - case DATA_TYPE_INT16_ARRAY: - case DATA_TYPE_UINT16_ARRAY: - case DATA_TYPE_INT32_ARRAY: - case DATA_TYPE_UINT32_ARRAY: - case DATA_TYPE_INT64_ARRAY: - case DATA_TYPE_UINT64_ARRAY: - case DATA_TYPE_STRING_ARRAY: - c = fm_printf(d + 1, c, cols, "[...]"); - break; - case DATA_TYPE_UNKNOWN: - c = fm_printf(d + 1, c, cols, ""); - break; - } - } - - return (c); -} - -void -fm_nvprint(nvlist_t *nvl) -{ - char *class; - int c = 0; - - console_printf("\r"); - - if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0) - c = fm_printf(0, c, ereport_cols, "%s", class); - - if (fm_nvprintr(nvl, 0, c, ereport_cols) != 0) - console_printf("\n"); - - console_printf("\n"); -} - -/* - * Wrapper for panic() that first produces an FMA-style message for admins. - * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this - * is the one exception to that rule and the only error that gets messaged. - * This function is intended for use by subsystems that have detected a fatal - * error and enqueued appropriate ereports and wish to then force a panic. - */ -/*PRINTFLIKE1*/ -void -fm_panic(const char *format, ...) -{ - va_list ap; - - (void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format); -#if defined(__i386) || defined(__amd64) - fastreboot_disable_highpil(); -#endif /* __i386 || __amd64 */ - va_start(ap, format); - vpanic(format, ap); - va_end(ap); -} - -/* - * Simply tell the caller if fm_panicstr is set, ie. an fma event has - * caused the panic. 
If so, something other than the default panic - * diagnosis method will diagnose the cause of the panic. - */ -int -is_fm_panic() -{ - if (fm_panicstr) - return (1); - else - return (0); -} - -/* - * Print any appropriate FMA banner message before the panic message. This - * function is called by panicsys() and prints the message for fm_panic(). - * We print the message here so that it comes after the system is quiesced. - * A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix). - * The rest of the message is for the console only and not needed in the log, - * so it is printed using console_printf(). We break it up into multiple - * chunks so as to avoid overflowing any small legacy prom_printf() buffers. - */ -void -fm_banner(void) -{ - timespec_t tod; - hrtime_t now; - - if (!fm_panicstr) - return; /* panic was not initiated by fm_panic(); do nothing */ - - if (KERNEL_PANICKED()) { - tod = panic_hrestime; - now = panic_hrtime; - } else { - gethrestime(&tod); - now = gethrtime_waitfree(); - } - - cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, " - "TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid); - - console_printf( -"\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n" -"EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n", - fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now); - - console_printf( -"PLATFORM: %s, CSN: -, HOSTNAME: %s\n" -"SOURCE: %s, REV: %s %s\n", - platform, utsname.nodename, utsname.sysname, - utsname.release, utsname.version); - - console_printf( -"DESC: Errors have been detected that require a reboot to ensure system\n" -"integrity. See %s/%s for more information.\n", - fm_url, fm_msgid); - - console_printf( -"AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n" -"IMPACT: The system will sync files, save a crash dump if needed, and reboot\n" -"REC-ACTION: Save the error summary below in case telemetry cannot be saved\n"); - - console_printf("\n"); -} - -/* - * Utility function to write all of the pending ereports to the dump device. - * This function is called at either normal reboot or panic time, and simply - * iterates over the in-transit messages in the ereport sysevent channel. - */ -void -fm_ereport_dump(void) -{ - evchanq_t *chq; - sysevent_t *sep; - erpt_dump_t ed; - - timespec_t tod; - hrtime_t now; - char *buf; - size_t len; - - if (KERNEL_PANICKED()) { - tod = panic_hrestime; - now = panic_hrtime; - } else { - if (ereport_errorq != NULL) - errorq_drain(ereport_errorq); - gethrestime(&tod); - now = gethrtime_waitfree(); - } - - /* - * In the panic case, sysevent_evc_walk_init() will return NULL. - */ - if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL && - !KERNEL_PANICKED()) - return; /* event channel isn't initialized yet */ - - while ((sep = sysevent_evc_walk_step(chq)) != NULL) { - if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL) - break; - - ed.ed_magic = ERPT_MAGIC; - ed.ed_chksum = checksum32(buf, len); - ed.ed_size = (uint32_t)len; - ed.ed_pad = 0; - ed.ed_hrt_nsec = SE_TIME(sep); - ed.ed_hrt_base = now; - ed.ed_tod_base.sec = tod.tv_sec; - ed.ed_tod_base.nsec = tod.tv_nsec; - - dumpvp_write(&ed, sizeof (ed)); - dumpvp_write(buf, len); - } - - sysevent_evc_walk_fini(chq); -} -#endif - -/* - * Post an error report (ereport) to the sysevent error channel. The error - * channel must be established with a prior call to sysevent_evc_create() - * before publication may occur. 
- */ -void -fm_ereport_post(nvlist_t *ereport, int evc_flag) -{ - size_t nvl_size = 0; - evchan_t *error_chan; - sysevent_id_t eid; - - (void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE); - if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) { - atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); - return; - } - -#ifdef illumos - if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan, - EVCH_CREAT|EVCH_HOLD_PEND) != 0) { - atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); - return; - } - - if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR, - SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) { - atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); - (void) sysevent_evc_unbind(error_chan); - return; - } - (void) sysevent_evc_unbind(error_chan); -#else - (void) ddi_log_sysevent(NULL, SUNW_VENDOR, EC_DEV_STATUS, - ESC_DEV_DLE, ereport, &eid, DDI_SLEEP); -#endif -} - -/* - * Wrapppers for FM nvlist allocators - */ -/* ARGSUSED */ -static void * -i_fm_alloc(nv_alloc_t *nva, size_t size) -{ - return (kmem_zalloc(size, KM_SLEEP)); -} - -/* ARGSUSED */ -static void -i_fm_free(nv_alloc_t *nva, void *buf, size_t size) -{ - kmem_free(buf, size); -} - -const nv_alloc_ops_t fm_mem_alloc_ops = { - NULL, - NULL, - i_fm_alloc, - i_fm_free, - NULL -}; - -/* - * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer - * to the newly allocated nv_alloc_t structure is returned upon success or NULL - * is returned to indicate that the nv_alloc structure could not be created. - */ -nv_alloc_t * -fm_nva_xcreate(char *buf, size_t bufsz) -{ - nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); - - if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) { - kmem_free(nvhdl, sizeof (nv_alloc_t)); - return (NULL); - } - - return (nvhdl); -} - -/* - * Destroy a previously allocated nv_alloc structure. The fixed buffer - * associated with nva must be freed by the caller. - */ -void -fm_nva_xdestroy(nv_alloc_t *nva) -{ - nv_alloc_fini(nva); - kmem_free(nva, sizeof (nv_alloc_t)); -} - -/* - * Create a new nv list. A pointer to a new nv list structure is returned - * upon success or NULL is returned to indicate that the structure could - * not be created. The newly created nv list is created and managed by the - * operations installed in nva. If nva is NULL, the default FMA nva - * operations are installed and used. - * - * When called from the kernel and nva == NULL, this function must be called - * from passive kernel context with no locks held that can prevent a - * sleeping memory allocation from occurring. Otherwise, this function may - * be called from other kernel contexts as long a valid nva created via - * fm_nva_create() is supplied. - */ -nvlist_t * -fm_nvlist_create(nv_alloc_t *nva) -{ - int hdl_alloced = 0; - nvlist_t *nvl; - nv_alloc_t *nvhdl; - - if (nva == NULL) { - nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); - - if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) { - kmem_free(nvhdl, sizeof (nv_alloc_t)); - return (NULL); - } - hdl_alloced = 1; - } else { - nvhdl = nva; - } - - if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) { - if (hdl_alloced) { - nv_alloc_fini(nvhdl); - kmem_free(nvhdl, sizeof (nv_alloc_t)); - } - return (NULL); - } - - return (nvl); -} - -/* - * Destroy a previously allocated nvlist structure. flag indicates whether - * or not the associated nva structure should be freed (FM_NVA_FREE) or - * retained (FM_NVA_RETAIN). 
Retaining the nv alloc structure allows - * it to be re-used for future nvlist creation operations. - */ -void -fm_nvlist_destroy(nvlist_t *nvl, int flag) -{ - nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl); - - nvlist_free(nvl); - - if (nva != NULL) { - if (flag == FM_NVA_FREE) - fm_nva_xdestroy(nva); - } -} - -int -i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) -{ - int nelem, ret = 0; - data_type_t type; - - while (ret == 0 && name != NULL) { - type = va_arg(ap, data_type_t); - switch (type) { - case DATA_TYPE_BYTE: - ret = nvlist_add_byte(payload, name, - va_arg(ap, uint_t)); - break; - case DATA_TYPE_BYTE_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_byte_array(payload, name, - va_arg(ap, uchar_t *), nelem); - break; - case DATA_TYPE_BOOLEAN_VALUE: - ret = nvlist_add_boolean_value(payload, name, - va_arg(ap, boolean_t)); - break; - case DATA_TYPE_BOOLEAN_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_boolean_array(payload, name, - va_arg(ap, boolean_t *), nelem); - break; - case DATA_TYPE_INT8: - ret = nvlist_add_int8(payload, name, - va_arg(ap, int)); - break; - case DATA_TYPE_INT8_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_int8_array(payload, name, - va_arg(ap, int8_t *), nelem); - break; - case DATA_TYPE_UINT8: - ret = nvlist_add_uint8(payload, name, - va_arg(ap, uint_t)); - break; - case DATA_TYPE_UINT8_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_uint8_array(payload, name, - va_arg(ap, uint8_t *), nelem); - break; - case DATA_TYPE_INT16: - ret = nvlist_add_int16(payload, name, - va_arg(ap, int)); - break; - case DATA_TYPE_INT16_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_int16_array(payload, name, - va_arg(ap, int16_t *), nelem); - break; - case DATA_TYPE_UINT16: - ret = nvlist_add_uint16(payload, name, - va_arg(ap, uint_t)); - break; - case DATA_TYPE_UINT16_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_uint16_array(payload, name, - va_arg(ap, uint16_t *), nelem); - break; - case DATA_TYPE_INT32: - ret = nvlist_add_int32(payload, name, - va_arg(ap, int32_t)); - break; - case DATA_TYPE_INT32_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_int32_array(payload, name, - va_arg(ap, int32_t *), nelem); - break; - case DATA_TYPE_UINT32: - ret = nvlist_add_uint32(payload, name, - va_arg(ap, uint32_t)); - break; - case DATA_TYPE_UINT32_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_uint32_array(payload, name, - va_arg(ap, uint32_t *), nelem); - break; - case DATA_TYPE_INT64: - ret = nvlist_add_int64(payload, name, - va_arg(ap, int64_t)); - break; - case DATA_TYPE_INT64_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_int64_array(payload, name, - va_arg(ap, int64_t *), nelem); - break; - case DATA_TYPE_UINT64: - ret = nvlist_add_uint64(payload, name, - va_arg(ap, uint64_t)); - break; - case DATA_TYPE_UINT64_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_uint64_array(payload, name, - va_arg(ap, uint64_t *), nelem); - break; - case DATA_TYPE_STRING: - ret = nvlist_add_string(payload, name, - va_arg(ap, char *)); - break; - case DATA_TYPE_STRING_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_string_array(payload, name, - va_arg(ap, char **), nelem); - break; - case DATA_TYPE_NVLIST: - ret = nvlist_add_nvlist(payload, name, - va_arg(ap, nvlist_t *)); - break; - case DATA_TYPE_NVLIST_ARRAY: - nelem = va_arg(ap, int); - ret = nvlist_add_nvlist_array(payload, name, - va_arg(ap, nvlist_t **), nelem); - break; - default: - ret = EINVAL; - } - - name = va_arg(ap, char *); - } - return (ret); -} - -void 
-fm_payload_set(nvlist_t *payload, ...) -{ - int ret; - const char *name; - va_list ap; - - va_start(ap, payload); - name = va_arg(ap, char *); - ret = i_fm_payload_set(payload, name, ap); - va_end(ap); - - if (ret) - atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64); -} - -/* - * Set-up and validate the members of an ereport event according to: - * - * Member name Type Value - * ==================================================== - * class string ereport - * version uint8_t 0 - * ena uint64_t - * detector nvlist_t - * ereport-payload nvlist_t - * - * We don't actually add a 'version' member to the payload. Really, - * the version quoted to us by our caller is that of the category 1 - * "ereport" event class (and we require FM_EREPORT_VERS0) but - * the payload version of the actual leaf class event under construction - * may be something else. Callers should supply a version in the varargs, - * or (better) we could take two version arguments - one for the - * ereport category 1 classification (expect FM_EREPORT_VERS0) and one - * for the leaf class. - */ -void -fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class, - uint64_t ena, const nvlist_t *detector, ...) -{ - char ereport_class[FM_MAX_CLASS]; - const char *name; - va_list ap; - int ret; - - if (version != FM_EREPORT_VERS0) { - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); - return; - } - - (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s", - FM_EREPORT_CLASS, erpt_class); - if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) { - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); - return; - } - - if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) { - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); - } - - if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR, - (nvlist_t *)detector) != 0) { - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); - } - - va_start(ap, detector); - name = va_arg(ap, const char *); - ret = i_fm_payload_set(ereport, name, ap); - va_end(ap); - - if (ret) - atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); -} - -/* - * Set-up and validate the members of an hc fmri according to; - * - * Member name Type Value - * =================================================== - * version uint8_t 0 - * auth nvlist_t - * hc-name string - * hc-id string - * - * Note that auth and hc-id are optional members. - */ - -#define HC_MAXPAIRS 20 -#define HC_MAXNAMELEN 50 - -static int -fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth) -{ - if (version != FM_HC_SCHEME_VERSION) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return (0); - } - - if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 || - nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return (0); - } - - if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, - (nvlist_t *)auth) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return (0); - } - - return (1); -} - -void -fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, - nvlist_t *snvl, int npairs, ...) 
-{ - nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); - nvlist_t *pairs[HC_MAXPAIRS]; - va_list ap; - int i; - - if (!fm_fmri_hc_set_common(fmri, version, auth)) - return; - - npairs = MIN(npairs, HC_MAXPAIRS); - - va_start(ap, npairs); - for (i = 0; i < npairs; i++) { - const char *name = va_arg(ap, const char *); - uint32_t id = va_arg(ap, uint32_t); - char idstr[11]; - - (void) snprintf(idstr, sizeof (idstr), "%u", id); - - pairs[i] = fm_nvlist_create(nva); - if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || - nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } - va_end(ap); - - if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0) - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - - for (i = 0; i < npairs; i++) - fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); - - if (snvl != NULL) { - if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } -} - -/* - * Set-up and validate the members of an dev fmri according to: - * - * Member name Type Value - * ==================================================== - * version uint8_t 0 - * auth nvlist_t - * devpath string - * [devid] string - * [target-port-l0id] string - * - * Note that auth and devid are optional members. - */ -void -fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth, - const char *devpath, const char *devid, const char *tpl0) -{ - int err = 0; - - if (version != DEV_SCHEME_VERSION0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version); - err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV); - - if (auth != NULL) { - err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY, - (nvlist_t *)auth); - } - - err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath); - - if (devid != NULL) - err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid); - - if (tpl0 != NULL) - err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0); - - if (err) - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - -} - -/* - * Set-up and validate the members of an cpu fmri according to: - * - * Member name Type Value - * ==================================================== - * version uint8_t 0 - * auth nvlist_t - * cpuid uint32_t - * cpumask uint8_t - * serial uint64_t - * - * Note that auth, cpumask, serial are optional members. 
- * - */ -void -fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth, - uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp) -{ - uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64; - - if (version < CPU_SCHEME_VERSION1) { - atomic_inc_64(failedp); - return; - } - - if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) { - atomic_inc_64(failedp); - return; - } - - if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME, - FM_FMRI_SCHEME_CPU) != 0) { - atomic_inc_64(failedp); - return; - } - - if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY, - (nvlist_t *)auth) != 0) - atomic_inc_64(failedp); - - if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0) - atomic_inc_64(failedp); - - if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK, - *cpu_maskp) != 0) - atomic_inc_64(failedp); - - if (serial_idp == NULL || nvlist_add_string(fmri_cpu, - FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0) - atomic_inc_64(failedp); -} - -/* - * Set-up and validate the members of a mem according to: - * - * Member name Type Value - * ==================================================== - * version uint8_t 0 - * auth nvlist_t [optional] - * unum string - * serial string [optional*] - * offset uint64_t [optional] - * - * * serial is required if offset is present - */ -void -fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, - const char *unum, const char *serial, uint64_t offset) -{ - if (version != MEM_SCHEME_VERSION0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (!serial && (offset != (uint64_t)-1)) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (auth != NULL) { - if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, - (nvlist_t *)auth) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } - - if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - } - - if (serial != NULL) { - if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, - (char **)&serial, 1) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri, - FM_FMRI_MEM_OFFSET, offset) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } -} - -void -fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, - uint64_t vdev_guid) -{ - if (version != ZFS_SCHEME_VERSION0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - } - - if (vdev_guid != 0) { - if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - } - } -} - -uint64_t -fm_ena_increment(uint64_t ena) -{ - uint64_t new_ena; - - switch 
(ENA_FORMAT(ena)) { - case FM_ENA_FMT1: - new_ena = ena + (1 << ENA_FMT1_GEN_SHFT); - break; - case FM_ENA_FMT2: - new_ena = ena + (1 << ENA_FMT2_GEN_SHFT); - break; - default: - new_ena = 0; - } - - return (new_ena); -} - -uint64_t -fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) -{ - uint64_t ena = 0; - - switch (format) { - case FM_ENA_FMT1: - if (timestamp) { - ena = (uint64_t)((format & ENA_FORMAT_MASK) | - ((cpuid << ENA_FMT1_CPUID_SHFT) & - ENA_FMT1_CPUID_MASK) | - ((timestamp << ENA_FMT1_TIME_SHFT) & - ENA_FMT1_TIME_MASK)); - } else { - ena = (uint64_t)((format & ENA_FORMAT_MASK) | - ((cpuid << ENA_FMT1_CPUID_SHFT) & - ENA_FMT1_CPUID_MASK) | - ((gethrtime_waitfree() << ENA_FMT1_TIME_SHFT) & - ENA_FMT1_TIME_MASK)); - } - break; - case FM_ENA_FMT2: - ena = (uint64_t)((format & ENA_FORMAT_MASK) | - ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK)); - break; - default: - break; - } - - return (ena); -} - -uint64_t -fm_ena_generate(uint64_t timestamp, uchar_t format) -{ - return (fm_ena_generate_cpu(timestamp, PCPU_GET(cpuid), format)); -} - -uint64_t -fm_ena_generation_get(uint64_t ena) -{ - uint64_t gen; - - switch (ENA_FORMAT(ena)) { - case FM_ENA_FMT1: - gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT; - break; - case FM_ENA_FMT2: - gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT; - break; - default: - gen = 0; - break; - } - - return (gen); -} - -uchar_t -fm_ena_format_get(uint64_t ena) -{ - - return (ENA_FORMAT(ena)); -} - -uint64_t -fm_ena_id_get(uint64_t ena) -{ - uint64_t id; - - switch (ENA_FORMAT(ena)) { - case FM_ENA_FMT1: - id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT; - break; - case FM_ENA_FMT2: - id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT; - break; - default: - id = 0; - } - - return (id); -} - -uint64_t -fm_ena_time_get(uint64_t ena) -{ - uint64_t time; - - switch (ENA_FORMAT(ena)) { - case FM_ENA_FMT1: - time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT; - break; - case FM_ENA_FMT2: - time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT; - break; - default: - time = 0; - } - - return (time); -} - -#ifdef illumos -/* - * Convert a getpcstack() trace to symbolic name+offset, and add the resulting - * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK. - */ -void -fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth) -{ - int i; - char *sym; - ulong_t off; - char *stkpp[FM_STK_DEPTH]; - char buf[FM_STK_DEPTH * FM_SYM_SZ]; - char *stkp = buf; - - for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) { - if ((sym = kobj_getsymname(stack[i], &off)) != NULL) - (void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off); - else - (void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]); - stkpp[i] = stkp; - } - - fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK, - DATA_TYPE_STRING_ARRAY, depth, stkpp, NULL); -} -#endif - -#ifdef illumos -void -print_msg_hwerr(ctid_t ct_id, proc_t *p) -{ - uprintf("Killed process %d (%s) in contract id %d " - "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id); -} -#endif - -void -fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, - nvlist_t *snvl, nvlist_t *bboard, int npairs, ...) 
-{ - nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); - nvlist_t *pairs[HC_MAXPAIRS]; - nvlist_t **hcl; - uint_t n; - int i, j; - va_list ap; - char *hcname, *hcid; - - if (!fm_fmri_hc_set_common(fmri, version, auth)) - return; - - /* - * copy the bboard nvpairs to the pairs array - */ - if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n) - != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - for (i = 0; i < n; i++) { - if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME, - &hcname) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - pairs[i] = fm_nvlist_create(nva); - if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 || - nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) { - for (j = 0; j <= i; j++) { - if (pairs[j] != NULL) - fm_nvlist_destroy(pairs[j], - FM_NVA_RETAIN); - } - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - } - - /* - * create the pairs from passed in pairs - */ - npairs = MIN(npairs, HC_MAXPAIRS); - - va_start(ap, npairs); - for (i = n; i < npairs + n; i++) { - const char *name = va_arg(ap, const char *); - uint32_t id = va_arg(ap, uint32_t); - char idstr[11]; - (void) snprintf(idstr, sizeof (idstr), "%u", id); - pairs[i] = fm_nvlist_create(nva); - if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || - nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { - for (j = 0; j <= i; j++) { - if (pairs[j] != NULL) - fm_nvlist_destroy(pairs[j], - FM_NVA_RETAIN); - } - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - } - va_end(ap); - - /* - * Create the fmri hc list - */ - if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, - npairs + n) != 0) { - atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - - for (i = 0; i < npairs + n; i++) { - fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); - } - - if (snvl != NULL) { - if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { - atomic_inc_64( - &erpt_kstat_data.fmri_set_failed.value.ui64); - return; - } - } -} Index: sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include - -static void * -nv_alloc_sys(nv_alloc_t *nva, size_t size) -{ - return (kmem_alloc(size, (int)(uintptr_t)nva->nva_arg)); -} - -/*ARGSUSED*/ -static void -nv_free_sys(nv_alloc_t *nva, void *buf, size_t size) -{ - kmem_free(buf, size); -} - -static const nv_alloc_ops_t system_ops = { - NULL, /* nv_ao_init() */ - NULL, /* nv_ao_fini() */ - nv_alloc_sys, /* nv_ao_alloc() */ - nv_free_sys, /* nv_ao_free() */ - NULL /* nv_ao_reset() */ -}; - -nv_alloc_t nv_alloc_sleep_def = { - &system_ops, - (void *)KM_SLEEP -}; - -nv_alloc_t nv_alloc_nosleep_def = { - &system_ops, - (void *)KM_NOSLEEP -}; - -nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def; -nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; Index: sys/cddl/contrib/opensolaris/uts/common/sys/acl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/acl.h +++ /dev/null @@ -1,313 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2014 Garrett D'Amore - * - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * Copyright 2017 RackTop Systems. - */ - -#ifndef _SYS_ACL_H -#define _SYS_ACL_H - -#include -#include - -#if defined(_KERNEL) -/* - * When compiling OpenSolaris kernel code, this file is included instead of the - * FreeBSD one. Include the original sys/acl.h as well. - */ -#undef _SYS_ACL_H -#include_next -#define _SYS_ACL_H -#endif /* _KERNEL */ - -#ifdef __cplusplus -extern "C" { -#endif - -#define MAX_ACL_ENTRIES (1024) /* max entries of each type */ -typedef struct { - int a_type; /* the type of ACL entry */ - uid_t a_id; /* the entry in -uid or gid */ - o_mode_t a_perm; /* the permission field */ -} aclent_t; - -typedef struct ace { - uid_t a_who; /* uid or gid */ - uint32_t a_access_mask; /* read,write,... */ - uint16_t a_flags; /* see below */ - uint16_t a_type; /* allow or deny */ -} ace_t; - -#ifndef _KERNEL -typedef struct acl_info acl_t; -#endif - -/* - * The following are Defined types for an aclent_t. 
- */ -#define USER_OBJ (0x01) /* object owner */ -#define USER (0x02) /* additional users */ -#define GROUP_OBJ (0x04) /* owning group of the object */ -#define GROUP (0x08) /* additional groups */ -#define CLASS_OBJ (0x10) /* file group class and mask entry */ -#define OTHER_OBJ (0x20) /* other entry for the object */ -#define ACL_DEFAULT (0x1000) /* default flag */ -/* default object owner */ -#define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ) -/* default additional users */ -#define DEF_USER (ACL_DEFAULT | USER) -/* default owning group */ -#define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ) -/* default additional groups */ -#define DEF_GROUP (ACL_DEFAULT | GROUP) -/* default mask entry */ -#define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ) -/* default other entry */ -#define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ) - -/* - * The following are defined for ace_t. - */ -#define ACE_READ_DATA 0x00000001 -#define ACE_LIST_DIRECTORY 0x00000001 -#define ACE_WRITE_DATA 0x00000002 -#define ACE_ADD_FILE 0x00000002 -#define ACE_APPEND_DATA 0x00000004 -#define ACE_ADD_SUBDIRECTORY 0x00000004 -#define ACE_READ_NAMED_ATTRS 0x00000008 -#define ACE_WRITE_NAMED_ATTRS 0x00000010 -#define ACE_EXECUTE 0x00000020 -#define ACE_DELETE_CHILD 0x00000040 -#define ACE_READ_ATTRIBUTES 0x00000080 -#define ACE_WRITE_ATTRIBUTES 0x00000100 -#define ACE_DELETE 0x00010000 -#define ACE_READ_ACL 0x00020000 -#define ACE_WRITE_ACL 0x00040000 -#define ACE_WRITE_OWNER 0x00080000 -#define ACE_SYNCHRONIZE 0x00100000 - -#define ACE_FILE_INHERIT_ACE 0x0001 -#define ACE_DIRECTORY_INHERIT_ACE 0x0002 -#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 -#define ACE_INHERIT_ONLY_ACE 0x0008 -#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 -#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 -#define ACE_IDENTIFIER_GROUP 0x0040 -#define ACE_INHERITED_ACE 0x0080 -#define ACE_OWNER 0x1000 -#define ACE_GROUP 0x2000 -#define ACE_EVERYONE 0x4000 - -#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 -#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 -#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 -#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 - -#define ACL_AUTO_INHERIT 0x0001 -#define ACL_PROTECTED 0x0002 -#define ACL_DEFAULTED 0x0004 -#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \ - ACL_DEFAULTED) - -#if defined(_KERNEL) || defined(_FAKE_KERNEL) - -/* - * These are only applicable in a CIFS context. - */ -#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 -#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 -#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 -#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 -#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 -#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 -#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A -#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B -#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C -#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D -#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E -#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F -#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 - -#define ACE_ALL_TYPES 0x001F - -typedef struct ace_object { - uid_t a_who; /* uid or gid */ - uint32_t a_access_mask; /* read,write,... 
*/ - uint16_t a_flags; /* see below */ - uint16_t a_type; /* allow or deny */ - uint8_t a_obj_type[16]; /* obj type */ - uint8_t a_inherit_obj_type[16]; /* inherit obj */ -} ace_object_t; - -#endif - -#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ - ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ - ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ - ACE_WRITE_OWNER|ACE_SYNCHRONIZE) - -#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ - ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ - ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) - -#define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ - ACE_READ_NAMED_ATTRS) - -#define ACE_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \ - ACE_WRITE_NAMED_ATTRS) - -#define ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ - ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ - ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ - ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE) -/* - * The following flags are supported by both NFSv4 ACLs and ace_t. - */ -#define ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \ - ACE_DIRECTORY_INHERIT_ACE | \ - ACE_NO_PROPAGATE_INHERIT_ACE | \ - ACE_INHERIT_ONLY_ACE | \ - ACE_INHERITED_ACE | \ - ACE_IDENTIFIER_GROUP) - -#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \ - ACE_IDENTIFIER_GROUP) -#define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| ACL_INHERITED_ACE| \ - ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) - -/* cmd args to acl(2) for aclent_t */ -#define GETACL 1 -#define SETACL 2 -#define GETACLCNT 3 - -/* cmd's to manipulate ace acls. */ -#define ACE_GETACL 4 -#define ACE_SETACL 5 -#define ACE_GETACLCNT 6 - -/* minimal acl entries from GETACLCNT */ -#define MIN_ACL_ENTRIES 4 - -#if !defined(_KERNEL) - -/* acl check errors */ -#define GRP_ERROR 1 -#define USER_ERROR 2 -#define OTHER_ERROR 3 -#define CLASS_ERROR 4 -#define DUPLICATE_ERROR 5 -#define MISS_ERROR 6 -#define MEM_ERROR 7 -#define ENTRY_ERROR 8 - - -/* - * similar to ufs_acl.h: changed to char type for user commands (tar, cpio) - * Attribute types - */ -#define UFSD_FREE ('0') /* Free entry */ -#define UFSD_ACL ('1') /* Access Control Lists */ -#define UFSD_DFACL ('2') /* reserved for future use */ -#define ACE_ACL ('3') /* ace_t style acls */ - -/* - * flag to [f]acl_get() - * controls whether a trivial acl should be returned. 
- */ -#define ACL_NO_TRIVIAL 0x2 - - -/* - * Flags to control acl_totext() - */ - -#define ACL_APPEND_ID 0x1 /* append uid/gid to user/group entries */ -#define ACL_COMPACT_FMT 0x2 /* build ACL in ls -V format */ -#define ACL_NORESOLVE 0x4 /* don't do name service lookups */ -#define ACL_SID_FMT 0x8 /* use usersid/groupsid when appropriate */ - -/* - * Legacy aclcheck errors for aclent_t ACLs - */ -#define EACL_GRP_ERROR GRP_ERROR -#define EACL_USER_ERROR USER_ERROR -#define EACL_OTHER_ERROR OTHER_ERROR -#define EACL_CLASS_ERROR CLASS_ERROR -#define EACL_DUPLICATE_ERROR DUPLICATE_ERROR -#define EACL_MISS_ERROR MISS_ERROR -#define EACL_MEM_ERROR MEM_ERROR -#define EACL_ENTRY_ERROR ENTRY_ERROR - -#define EACL_INHERIT_ERROR 9 /* invalid inherit flags */ -#define EACL_FLAGS_ERROR 10 /* unknown flag value */ -#define EACL_PERM_MASK_ERROR 11 /* unknown permission */ -#define EACL_COUNT_ERROR 12 /* invalid acl count */ - -#define EACL_INVALID_SLOT 13 /* invalid acl slot */ -#define EACL_NO_ACL_ENTRY 14 /* Entry doesn't exist */ -#define EACL_DIFF_TYPE 15 /* acls aren't same type */ - -#define EACL_INVALID_USER_GROUP 16 /* need user/group name */ -#define EACL_INVALID_STR 17 /* invalid acl string */ -#define EACL_FIELD_NOT_BLANK 18 /* can't have blank field */ -#define EACL_INVALID_ACCESS_TYPE 19 /* invalid access type */ -#define EACL_UNKNOWN_DATA 20 /* Unrecognized data in ACL */ -#define EACL_MISSING_FIELDS 21 /* missing fields in acl */ - -#define EACL_INHERIT_NOTDIR 22 /* Need dir for inheritance */ - -extern int aclcheck(aclent_t *, int, int *); -extern int acltomode(aclent_t *, int, mode_t *); -extern int aclfrommode(aclent_t *, int, mode_t *); -extern int aclsort(int, int, aclent_t *); -extern char *acltotext(aclent_t *, int); -extern aclent_t *aclfromtext(char *, int *); -extern void acl_free(acl_t *); -extern int acl_get(const char *, int, acl_t **); -extern int facl_get(int, int, acl_t **); -extern int acl_set(const char *, acl_t *acl); -extern int facl_set(int, acl_t *acl); -extern int acl_strip(const char *, uid_t, gid_t, mode_t); -extern int acl_trivial(const char *); -extern char *acl_totext(acl_t *, int); -extern int acl_fromtext(const char *, acl_t **); -extern int acl_check(acl_t *, int); - -#else /* !defined(_KERNEL) */ - -extern void ksort(caddr_t, int, int, int (*)(void *, void *)); -extern int cmp2acls(void *, void *); - -#endif /* !defined(_KERNEL) */ - -extern int acl(const char *path, int cmd, int cnt, void *buf); -extern int facl(int fd, int cmd, int cnt, void *buf); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ACL_H */ Index: sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h +++ sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h @@ -115,7 +115,6 @@ #define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n))) #define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n))) #define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n))) -#define __NORETURN __sun_attr__((__noreturn__)) #define __CONST __sun_attr__((__const__)) #define __PURE __sun_attr__((__pure__)) Index: sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h +++ /dev/null @@ -1,830 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the 
"License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014 Igor Kozhukhov . - * Copyright 2017 RackTop Systems. - */ - -#ifndef _SYS_CPUVAR_H -#define _SYS_CPUVAR_H - -#include -#include /* has cpu_stat_t definition */ -#include -#include -#include /* has kcpc_ctx_t definition */ - -#include -#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) -#include -#endif - -#include -#include -#include -#include -#include -#if defined(__GNUC__) && defined(_ASM_INLINES) && defined(_KERNEL) && \ - (defined(__i386) || defined(__amd64)) -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -struct squeue_set_s; - -#define CPU_CACHE_COHERENCE_SIZE 64 - -/* - * For fast event tracing. - */ -struct ftrace_record; -typedef struct ftrace_data { - int ftd_state; /* ftrace flags */ - kmutex_t ftd_unused; /* ftrace buffer lock, unused */ - struct ftrace_record *ftd_cur; /* current record */ - struct ftrace_record *ftd_first; /* first record */ - struct ftrace_record *ftd_last; /* last record */ -} ftrace_data_t; - -struct cyc_cpu; -struct nvlist; - -/* - * Per-CPU data. - * - * Be careful adding new members: if they are not the same in all modules (e.g. - * change size depending on a #define), CTF uniquification can fail to work - * properly. Furthermore, this is transitive in that it applies recursively to - * all types pointed to by cpu_t. - */ -typedef struct cpu { - processorid_t cpu_id; /* CPU number */ - processorid_t cpu_seqid; /* sequential CPU id (0..ncpus-1) */ - volatile cpu_flag_t cpu_flags; /* flags indicating CPU state */ - struct cpu *cpu_self; /* pointer to itself */ - kthread_t *cpu_thread; /* current thread */ - kthread_t *cpu_idle_thread; /* idle thread for this CPU */ - kthread_t *cpu_pause_thread; /* pause thread for this CPU */ - klwp_id_t cpu_lwp; /* current lwp (if any) */ - klwp_id_t cpu_fpowner; /* currently loaded fpu owner */ - struct cpupart *cpu_part; /* partition with this CPU */ - struct lgrp_ld *cpu_lpl; /* pointer to this cpu's load */ - int cpu_cache_offset; /* see kmem.c for details */ - - /* - * Links to other CPUs. 
It is safe to walk these lists if - * one of the following is true: - * - cpu_lock held - * - preemption disabled via kpreempt_disable - * - PIL >= DISP_LEVEL - * - acting thread is an interrupt thread - * - all other CPUs are paused - */ - struct cpu *cpu_next; /* next existing CPU */ - struct cpu *cpu_prev; /* prev existing CPU */ - struct cpu *cpu_next_onln; /* next online (enabled) CPU */ - struct cpu *cpu_prev_onln; /* prev online (enabled) CPU */ - struct cpu *cpu_next_part; /* next CPU in partition */ - struct cpu *cpu_prev_part; /* prev CPU in partition */ - struct cpu *cpu_next_lgrp; /* next CPU in latency group */ - struct cpu *cpu_prev_lgrp; /* prev CPU in latency group */ - struct cpu *cpu_next_lpl; /* next CPU in lgrp partition */ - struct cpu *cpu_prev_lpl; - - struct cpu_pg *cpu_pg; /* cpu's processor groups */ - - void *cpu_reserved[4]; /* reserved for future use */ - - /* - * Scheduling variables. - */ - disp_t *cpu_disp; /* dispatch queue data */ - /* - * Note that cpu_disp is set before the CPU is added to the system - * and is never modified. Hence, no additional locking is needed - * beyond what's necessary to access the cpu_t structure. - */ - char cpu_runrun; /* scheduling flag - set to preempt */ - char cpu_kprunrun; /* force kernel preemption */ - pri_t cpu_chosen_level; /* priority at which cpu */ - /* was chosen for scheduling */ - kthread_t *cpu_dispthread; /* thread selected for dispatch */ - disp_lock_t cpu_thread_lock; /* dispatcher lock on current thread */ - uint8_t cpu_disp_flags; /* flags used by dispatcher */ - /* - * The following field is updated when ever the cpu_dispthread - * changes. Also in places, where the current thread(cpu_dispthread) - * priority changes. This is used in disp_lowpri_cpu() - */ - pri_t cpu_dispatch_pri; /* priority of cpu_dispthread */ - clock_t cpu_last_swtch; /* last time switched to new thread */ - - /* - * Interrupt data. - */ - caddr_t cpu_intr_stack; /* interrupt stack */ - kthread_t *cpu_intr_thread; /* interrupt thread list */ - uint_t cpu_intr_actv; /* interrupt levels active (bitmask) */ - int cpu_base_spl; /* priority for highest rupt active */ - - /* - * Statistics. - */ - cpu_stats_t cpu_stats; /* per-CPU statistics */ - struct kstat *cpu_info_kstat; /* kstat for cpu info */ - - uintptr_t cpu_profile_pc; /* kernel PC in profile interrupt */ - uintptr_t cpu_profile_upc; /* user PC in profile interrupt */ - uintptr_t cpu_profile_pil; /* PIL when profile interrupted */ - - ftrace_data_t cpu_ftrace; /* per cpu ftrace data */ - - clock_t cpu_deadman_counter; /* used by deadman() */ - uint_t cpu_deadman_countdown; /* used by deadman() */ - - kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */ - kcpc_ctx_t *cpu_cpc_ctx; /* performance counter context */ - - /* - * Configuration information for the processor_info system call. 
- */ - processor_info_t cpu_type_info; /* config info */ - time_t cpu_state_begin; /* when CPU entered current state */ - char cpu_cpr_flags; /* CPR related info */ - struct cyc_cpu *cpu_cyclic; /* per cpu cyclic subsystem data */ - struct squeue_set_s *cpu_squeue_set; /* per cpu squeue set */ - struct nvlist *cpu_props; /* pool-related properties */ - - krwlock_t cpu_ft_lock; /* DTrace: fasttrap lock */ - uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */ - hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */ - hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */ - volatile uint16_t cpu_mstate; /* cpu microstate */ - volatile uint16_t cpu_mstate_gen; /* generation counter */ - volatile hrtime_t cpu_mstate_start; /* cpu microstate start time */ - volatile hrtime_t cpu_acct[NCMSTATES]; /* cpu microstate data */ - hrtime_t cpu_intracct[NCMSTATES]; /* interrupt mstate data */ - hrtime_t cpu_waitrq; /* cpu run-queue wait time */ - struct loadavg_s cpu_loadavg; /* loadavg info for this cpu */ - - char *cpu_idstr; /* for printing and debugging */ - char *cpu_brandstr; /* for printing */ - - /* - * Sum of all device interrupt weights that are currently directed at - * this cpu. Cleared at start of interrupt redistribution. - */ - int32_t cpu_intr_weight; - void *cpu_vm_data; - - struct cpu_physid *cpu_physid; /* physical associations */ - - uint64_t cpu_curr_clock; /* current clock freq in Hz */ - char *cpu_supp_freqs; /* supported freqs in Hz */ - - uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */ - uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */ - - /* - * Interrupt load factor used by dispatcher & softcall - */ - hrtime_t cpu_intrlast; /* total interrupt time (nsec) */ - int cpu_intrload; /* interrupt load factor (0-99%) */ - - uint_t cpu_rotor; /* for cheap pseudo-random numbers */ - - struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */ - - /* - * cpu_generation is updated whenever CPU goes on-line or off-line. - * Updates to cpu_generation are protected by cpu_lock. - * - * See CPU_NEW_GENERATION() macro below. - */ - volatile uint_t cpu_generation; /* tracking on/off-line */ - - /* - * New members must be added /before/ this member, as the CTF tools - * rely on this being the last field before cpu_m, so they can - * correctly calculate the offset when synthetically adding the cpu_m - * member in objects that do not have it. This fixup is required for - * uniquification to work correctly. - */ - uintptr_t cpu_m_pad; - -#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) - struct machcpu cpu_m; /* per architecture info */ -#endif -} cpu_t; - -/* - * The cpu_core structure consists of per-CPU state available in any context. - * On some architectures, this may mean that the page(s) containing the - * NCPU-sized array of cpu_core structures must be locked in the TLB -- it - * is up to the platform to assure that this is performed properly. Note that - * the structure is sized to avoid false sharing. 
- */ -#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \ - sizeof (uintptr_t) + sizeof (kmutex_t)) -#define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE - -typedef struct cpu_core { - uint16_t cpuc_dtrace_flags; /* DTrace flags */ - uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */ - uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */ - uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */ - kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */ -} cpu_core_t; - -#ifdef _KERNEL -extern cpu_core_t cpu_core[]; -#endif /* _KERNEL */ - -/* - * CPU_ON_INTR() macro. Returns non-zero if currently on interrupt stack. - * Note that this isn't a test for a high PIL. For example, cpu_intr_actv - * does not get updated when we go through sys_trap from TL>0 at high PIL. - * getpil() should be used instead to check for PIL levels. - */ -#define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1)) - -/* - * Check to see if an interrupt thread might be active at a given ipl. - * If so return true. - * We must be conservative--it is ok to give a false yes, but a false no - * will cause disaster. (But if the situation changes after we check it is - * ok--the caller is trying to ensure that an interrupt routine has been - * exited). - * This is used when trying to remove an interrupt handler from an autovector - * list in avintr.c. - */ -#define INTR_ACTIVE(cpup, level) \ - ((level) <= LOCK_LEVEL ? \ - ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup))) - -/* - * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one - * looks at it. It's meant as a cheap mechanism to be incorporated in routines - * wanting to avoid biasing, but where true randomness isn't needed (just - * something that changes). - */ -#define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++) - -#if defined(_KERNEL) || defined(_KMEMUSER) - -#define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE) - -/* MEMBERS PROTECTED BY "atomicity": cpu_flags */ - -/* - * Flags in the CPU structure. - * - * These are protected by cpu_lock (except during creation). - * - * Offlined-CPUs have three stages of being offline: - * - * CPU_ENABLE indicates that the CPU is participating in I/O interrupts - * that can be directed at a number of different CPUs. If CPU_ENABLE - * is off, the CPU will not be given interrupts that can be sent elsewhere, - * but will still get interrupts from devices associated with that CPU only, - * and from other CPUs. - * - * CPU_OFFLINE indicates that the dispatcher should not allow any threads - * other than interrupt threads to run on that CPU. A CPU will not have - * CPU_OFFLINE set if there are any bound threads (besides interrupts). - * - * CPU_QUIESCED is set if p_offline was able to completely turn idle the - * CPU and it will not have to run interrupt threads. In this case it'll - * stay in the idle loop until CPU_QUIESCED is turned off. - * - * CPU_FROZEN is used only by CPR to mark CPUs that have been successfully - * suspended (in the suspend path), or have yet to be resumed (in the resume - * case). - * - * On some platforms CPUs can be individually powered off. - * The following flags are set for powered off CPUs: CPU_QUIESCED, - * CPU_OFFLINE, and CPU_POWEROFF. The following flags are cleared: - * CPU_RUNNING, CPU_READY, CPU_EXISTS, and CPU_ENABLE. 
- */ -#define CPU_RUNNING 0x001 /* CPU running */ -#define CPU_READY 0x002 /* CPU ready for cross-calls */ -#define CPU_QUIESCED 0x004 /* CPU will stay in idle */ -#define CPU_EXISTS 0x008 /* CPU is configured */ -#define CPU_ENABLE 0x010 /* CPU enabled for interrupts */ -#define CPU_OFFLINE 0x020 /* CPU offline via p_online */ -#define CPU_POWEROFF 0x040 /* CPU is powered off */ -#define CPU_FROZEN 0x080 /* CPU is frozen via CPR suspend */ -#define CPU_SPARE 0x100 /* CPU offline available for use */ -#define CPU_FAULTED 0x200 /* CPU offline diagnosed faulty */ - -#define FMT_CPU_FLAGS \ - "\20\12fault\11spare\10frozen" \ - "\7poweroff\6offline\5enable\4exist\3quiesced\2ready\1run" - -#define CPU_ACTIVE(cpu) (((cpu)->cpu_flags & CPU_OFFLINE) == 0) - -/* - * Flags for cpu_offline(), cpu_faulted(), and cpu_spare(). - */ -#define CPU_FORCED 0x0001 /* Force CPU offline */ - -/* - * DTrace flags. - */ -#define CPU_DTRACE_NOFAULT 0x0001 /* Don't fault */ -#define CPU_DTRACE_DROP 0x0002 /* Drop this ECB */ -#define CPU_DTRACE_BADADDR 0x0004 /* DTrace fault: bad address */ -#define CPU_DTRACE_BADALIGN 0x0008 /* DTrace fault: bad alignment */ -#define CPU_DTRACE_DIVZERO 0x0010 /* DTrace fault: divide by zero */ -#define CPU_DTRACE_ILLOP 0x0020 /* DTrace fault: illegal operation */ -#define CPU_DTRACE_NOSCRATCH 0x0040 /* DTrace fault: out of scratch */ -#define CPU_DTRACE_KPRIV 0x0080 /* DTrace fault: bad kernel access */ -#define CPU_DTRACE_UPRIV 0x0100 /* DTrace fault: bad user access */ -#define CPU_DTRACE_TUPOFLOW 0x0200 /* DTrace fault: tuple stack overflow */ -#if defined(__sparc) -#define CPU_DTRACE_FAKERESTORE 0x0400 /* pid provider hint to getreg */ -#endif -#define CPU_DTRACE_ENTRY 0x0800 /* pid provider hint to ustack() */ -#define CPU_DTRACE_BADSTACK 0x1000 /* DTrace fault: bad stack */ - -#define CPU_DTRACE_FAULT (CPU_DTRACE_BADADDR | CPU_DTRACE_BADALIGN | \ - CPU_DTRACE_DIVZERO | CPU_DTRACE_ILLOP | \ - CPU_DTRACE_NOSCRATCH | CPU_DTRACE_KPRIV | \ - CPU_DTRACE_UPRIV | CPU_DTRACE_TUPOFLOW | \ - CPU_DTRACE_BADSTACK) -#define CPU_DTRACE_ERROR (CPU_DTRACE_FAULT | CPU_DTRACE_DROP) - -/* - * Dispatcher flags - * These flags must be changed only by the current CPU. - */ -#define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */ -#define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ - -#endif /* _KERNEL || _KMEMUSER */ - -#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) - -/* - * Macros for manipulating sets of CPUs as a bitmap. Note that this - * bitmap may vary in size depending on the maximum CPU id a specific - * platform supports. This may be different than the number of CPUs - * the platform supports, since CPU ids can be sparse. We define two - * sets of macros; one for platforms where the maximum CPU id is less - * than the number of bits in a single word (32 in a 32-bit kernel, - * 64 in a 64-bit kernel), and one for platforms that require bitmaps - * of more than one word. - */ - -#define CPUSET_WORDS BT_BITOUL(NCPU) -#define CPUSET_NOTINSET ((uint_t)-1) - -#if CPUSET_WORDS > 1 - -typedef struct cpuset { - ulong_t cpub[CPUSET_WORDS]; -} cpuset_t; - -/* - * Private functions for manipulating cpusets that do not fit in a - * single word. These should not be used directly; instead the - * CPUSET_* macros should be used so the code will be portable - * across different definitions of NCPU. 
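A minimal sketch of how a consumer is meant to stay portable across NCPU sizes by going only through the CPUSET_* wrappers that the removed header defines for both the multi-word and single-word layouts; the function name, the "target" argument and the assumption that the caller already holds the appropriate locks are illustrative, not part of the header:

/*
 * Illustrative consumer of the CPUSET_* wrappers from the header above;
 * the same code works whether cpuset_t is a single word or an array of
 * words.  "target" is a hypothetical CPU id supplied by the caller.
 */
static void
cpuset_usage_sketch(processorid_t target)
{
	cpuset_t set;
	uint_t first;

	CPUSET_ZERO(set);		/* start with an empty set */
	CPUSET_ADD(set, target);	/* mark one CPU as a member */

	if (CPU_IN_SET(set, target)) {
		CPUSET_FIND(set, first);	/* lowest id present in the set */
		ASSERT(first != CPUSET_NOTINSET);
	}
}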
- */ -extern void cpuset_all(cpuset_t *); -extern void cpuset_all_but(cpuset_t *, uint_t); -extern int cpuset_isnull(cpuset_t *); -extern int cpuset_cmp(cpuset_t *, cpuset_t *); -extern void cpuset_only(cpuset_t *, uint_t); -extern uint_t cpuset_find(cpuset_t *); -extern void cpuset_bounds(cpuset_t *, uint_t *, uint_t *); - -#define CPUSET_ALL(set) cpuset_all(&(set)) -#define CPUSET_ALL_BUT(set, cpu) cpuset_all_but(&(set), cpu) -#define CPUSET_ONLY(set, cpu) cpuset_only(&(set), cpu) -#define CPU_IN_SET(set, cpu) BT_TEST((set).cpub, cpu) -#define CPUSET_ADD(set, cpu) BT_SET((set).cpub, cpu) -#define CPUSET_DEL(set, cpu) BT_CLEAR((set).cpub, cpu) -#define CPUSET_ISNULL(set) cpuset_isnull(&(set)) -#define CPUSET_ISEQUAL(set1, set2) cpuset_cmp(&(set1), &(set2)) - -/* - * Find one CPU in the cpuset. - * Sets "cpu" to the id of the found CPU, or CPUSET_NOTINSET if no cpu - * could be found. (i.e. empty set) - */ -#define CPUSET_FIND(set, cpu) { \ - cpu = cpuset_find(&(set)); \ -} - -/* - * Determine the smallest and largest CPU id in the set. Returns - * CPUSET_NOTINSET in smallest and largest when set is empty. - */ -#define CPUSET_BOUNDS(set, smallest, largest) { \ - cpuset_bounds(&(set), &(smallest), &(largest)); \ -} - -/* - * Atomic cpuset operations - * These are safe to use for concurrent cpuset manipulations. - * "xdel" and "xadd" are exclusive operations, that set "result" to "0" - * if the add or del was successful, or "-1" if not successful. - * (e.g. attempting to add a cpu to a cpuset that's already there, or - * deleting a cpu that's not in the cpuset) - */ - -#define CPUSET_ATOMIC_DEL(set, cpu) BT_ATOMIC_CLEAR((set).cpub, (cpu)) -#define CPUSET_ATOMIC_ADD(set, cpu) BT_ATOMIC_SET((set).cpub, (cpu)) - -#define CPUSET_ATOMIC_XADD(set, cpu, result) \ - BT_ATOMIC_SET_EXCL((set).cpub, cpu, result) - -#define CPUSET_ATOMIC_XDEL(set, cpu, result) \ - BT_ATOMIC_CLEAR_EXCL((set).cpub, cpu, result) - - -#define CPUSET_OR(set1, set2) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set1).cpub[_i] |= (set2).cpub[_i]; \ -} - -#define CPUSET_XOR(set1, set2) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set1).cpub[_i] ^= (set2).cpub[_i]; \ -} - -#define CPUSET_AND(set1, set2) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set1).cpub[_i] &= (set2).cpub[_i]; \ -} - -#define CPUSET_ZERO(set) { \ - int _i; \ - for (_i = 0; _i < CPUSET_WORDS; _i++) \ - (set).cpub[_i] = 0; \ -} - -#elif CPUSET_WORDS == 1 - -typedef ulong_t cpuset_t; /* a set of CPUs */ - -#define CPUSET(cpu) (1UL << (cpu)) - -#define CPUSET_ALL(set) ((void)((set) = ~0UL)) -#define CPUSET_ALL_BUT(set, cpu) ((void)((set) = ~CPUSET(cpu))) -#define CPUSET_ONLY(set, cpu) ((void)((set) = CPUSET(cpu))) -#define CPU_IN_SET(set, cpu) ((set) & CPUSET(cpu)) -#define CPUSET_ADD(set, cpu) ((void)((set) |= CPUSET(cpu))) -#define CPUSET_DEL(set, cpu) ((void)((set) &= ~CPUSET(cpu))) -#define CPUSET_ISNULL(set) ((set) == 0) -#define CPUSET_ISEQUAL(set1, set2) ((set1) == (set2)) -#define CPUSET_OR(set1, set2) ((void)((set1) |= (set2))) -#define CPUSET_XOR(set1, set2) ((void)((set1) ^= (set2))) -#define CPUSET_AND(set1, set2) ((void)((set1) &= (set2))) -#define CPUSET_ZERO(set) ((void)((set) = 0)) - -#define CPUSET_FIND(set, cpu) { \ - cpu = (uint_t)(lowbit(set) - 1); \ -} - -#define CPUSET_BOUNDS(set, smallest, largest) { \ - smallest = (uint_t)(lowbit(set) - 1); \ - largest = (uint_t)(highbit(set) - 1); \ -} - -#define CPUSET_ATOMIC_DEL(set, cpu) atomic_and_ulong(&(set), ~CPUSET(cpu)) -#define 
CPUSET_ATOMIC_ADD(set, cpu) atomic_or_ulong(&(set), CPUSET(cpu)) - -#define CPUSET_ATOMIC_XADD(set, cpu, result) \ - { result = atomic_set_long_excl(&(set), (cpu)); } - -#define CPUSET_ATOMIC_XDEL(set, cpu, result) \ - { result = atomic_clear_long_excl(&(set), (cpu)); } - -#else /* CPUSET_WORDS <= 0 */ - -#error NCPU is undefined or invalid - -#endif /* CPUSET_WORDS */ - -extern cpuset_t cpu_seqid_inuse; - -#endif /* (_KERNEL || _KMEMUSER) && _MACHDEP */ - -#define CPU_CPR_OFFLINE 0x0 -#define CPU_CPR_ONLINE 0x1 -#define CPU_CPR_IS_OFFLINE(cpu) (((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) == 0) -#define CPU_CPR_IS_ONLINE(cpu) ((cpu)->cpu_cpr_flags & CPU_CPR_ONLINE) -#define CPU_SET_CPR_FLAGS(cpu, flag) ((cpu)->cpu_cpr_flags |= flag) - -#if defined(_KERNEL) || defined(_KMEMUSER) - -extern struct cpu *cpu[]; /* indexed by CPU number */ -extern struct cpu **cpu_seq; /* indexed by sequential CPU id */ -extern cpu_t *cpu_list; /* list of CPUs */ -extern cpu_t *cpu_active; /* list of active CPUs */ -extern int ncpus; /* number of CPUs present */ -extern int ncpus_online; /* number of CPUs not quiesced */ -extern int max_ncpus; /* max present before ncpus is known */ -extern int boot_max_ncpus; /* like max_ncpus but for real */ -extern int boot_ncpus; /* # cpus present @ boot */ -extern processorid_t max_cpuid; /* maximum CPU number */ -extern struct cpu *cpu_inmotion; /* offline or partition move target */ -extern cpu_t *clock_cpu_list; -extern processorid_t max_cpu_seqid_ever; /* maximum seqid ever given */ - -#if defined(__i386) || defined(__amd64) -extern struct cpu *curcpup(void); -#define CPU (curcpup()) /* Pointer to current CPU */ -#else -#define CPU (curthread->t_cpu) /* Pointer to current CPU */ -#endif - -/* - * CPU_CURRENT indicates to thread_affinity_set to use CPU->cpu_id - * as the target and to grab cpu_lock instead of requiring the caller - * to grab it. - */ -#define CPU_CURRENT -3 - -/* - * Per-CPU statistics - * - * cpu_stats_t contains numerous system and VM-related statistics, in the form - * of gauges or monotonically-increasing event occurrence counts. - */ - -#define CPU_STATS_ENTER_K() kpreempt_disable() -#define CPU_STATS_EXIT_K() kpreempt_enable() - -#define CPU_STATS_ADD_K(class, stat, amount) \ - { kpreempt_disable(); /* keep from switching CPUs */\ - CPU_STATS_ADDQ(CPU, class, stat, amount); \ - kpreempt_enable(); \ - } - -#define CPU_STATS_ADDQ(cp, class, stat, amount) { \ - extern void __dtrace_probe___cpu_##class##info_##stat(uint_t, \ - uint64_t *, cpu_t *); \ - uint64_t *stataddr = &((cp)->cpu_stats.class.stat); \ - __dtrace_probe___cpu_##class##info_##stat((amount), \ - stataddr, cp); \ - *(stataddr) += (amount); \ -} - -#define CPU_STATS(cp, stat) \ - ((cp)->cpu_stats.stat) - -/* - * Increment CPU generation value. - * This macro should be called whenever CPU goes on-line or off-line. - * Updates to cpu_generation should be protected by cpu_lock. 
- */ -#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) - -#endif /* _KERNEL || _KMEMUSER */ - -/* - * CPU support routines (not for genassym.c) - */ -#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && defined(__STDC__) - -struct zone; - -void cpu_list_init(cpu_t *); -void cpu_add_unit(cpu_t *); -void cpu_del_unit(int cpuid); -void cpu_add_active(cpu_t *); -void cpu_kstat_init(cpu_t *); -void cpu_visibility_add(cpu_t *, struct zone *); -void cpu_visibility_remove(cpu_t *, struct zone *); -void cpu_visibility_configure(cpu_t *, struct zone *); -void cpu_visibility_unconfigure(cpu_t *, struct zone *); -void cpu_visibility_online(cpu_t *, struct zone *); -void cpu_visibility_offline(cpu_t *, struct zone *); -void cpu_create_intrstat(cpu_t *); -void cpu_delete_intrstat(cpu_t *); -int cpu_kstat_intrstat_update(kstat_t *, int); -void cpu_intr_swtch_enter(kthread_t *); -void cpu_intr_swtch_exit(kthread_t *); - -void mbox_lock_init(void); /* initialize cross-call locks */ -void mbox_init(int cpun); /* initialize cross-calls */ -void poke_cpu(int cpun); /* interrupt another CPU (to preempt) */ - -/* - * values for safe_list. Pause state that CPUs are in. - */ -#define PAUSE_IDLE 0 /* normal state */ -#define PAUSE_READY 1 /* paused thread ready to spl */ -#define PAUSE_WAIT 2 /* paused thread is spl-ed high */ -#define PAUSE_DIE 3 /* tell pause thread to leave */ -#define PAUSE_DEAD 4 /* pause thread has left */ - -void mach_cpu_pause(volatile char *); - -void pause_cpus(cpu_t *off_cp, void *(*func)(void *)); -void start_cpus(void); -int cpus_paused(void); - -void cpu_pause_init(void); -cpu_t *cpu_get(processorid_t cpun); /* get the CPU struct associated */ - -int cpu_online(cpu_t *cp); /* take cpu online */ -int cpu_offline(cpu_t *cp, int flags); /* take cpu offline */ -int cpu_spare(cpu_t *cp, int flags); /* take cpu to spare */ -int cpu_faulted(cpu_t *cp, int flags); /* take cpu to faulted */ -int cpu_poweron(cpu_t *cp); /* take powered-off cpu to offline */ -int cpu_poweroff(cpu_t *cp); /* take offline cpu to powered-off */ - -cpu_t *cpu_intr_next(cpu_t *cp); /* get next online CPU taking intrs */ -int cpu_intr_count(cpu_t *cp); /* count # of CPUs handling intrs */ -int cpu_intr_on(cpu_t *cp); /* CPU taking I/O interrupts? */ -void cpu_intr_enable(cpu_t *cp); /* enable I/O interrupts */ -int cpu_intr_disable(cpu_t *cp); /* disable I/O interrupts */ -void cpu_intr_alloc(cpu_t *cp, int n); /* allocate interrupt threads */ - -/* - * Routines for checking CPU states. - */ -int cpu_is_online(cpu_t *); /* check if CPU is online */ -int cpu_is_nointr(cpu_t *); /* check if CPU can service intrs */ -int cpu_is_active(cpu_t *); /* check if CPU can run threads */ -int cpu_is_offline(cpu_t *); /* check if CPU is offline */ -int cpu_is_poweredoff(cpu_t *); /* check if CPU is powered off */ - -int cpu_flagged_online(cpu_flag_t); /* flags show CPU is online */ -int cpu_flagged_nointr(cpu_flag_t); /* flags show CPU not handling intrs */ -int cpu_flagged_active(cpu_flag_t); /* flags show CPU scheduling threads */ -int cpu_flagged_offline(cpu_flag_t); /* flags show CPU is offline */ -int cpu_flagged_poweredoff(cpu_flag_t); /* flags show CPU is powered off */ - -/* - * The processor_info(2) state of a CPU is a simplified representation suitable - * for use by an application program. 
Kernel subsystems should utilize the - * internal per-CPU state as given by the cpu_flags member of the cpu structure, - * as this information may include platform- or architecture-specific state - * critical to a subsystem's disposition of a particular CPU. - */ -void cpu_set_state(cpu_t *); /* record/timestamp current state */ -int cpu_get_state(cpu_t *); /* get current cpu state */ -const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */ - - -void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */ -void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */ - /* frequencies */ - -int cpu_configure(int); -int cpu_unconfigure(int); -void cpu_destroy_bound_threads(cpu_t *cp); - -extern int cpu_bind_thread(kthread_t *tp, processorid_t bind, - processorid_t *obind, int *error); -extern int cpu_unbind(processorid_t cpu_id, boolean_t force); -extern void thread_affinity_set(kthread_t *t, int cpu_id); -extern void thread_affinity_clear(kthread_t *t); -extern void affinity_set(int cpu_id); -extern void affinity_clear(void); -extern void init_cpu_mstate(struct cpu *, int); -extern void term_cpu_mstate(struct cpu *); -extern void new_cpu_mstate(int, hrtime_t); -extern void get_cpu_mstate(struct cpu *, hrtime_t *); -extern void thread_nomigrate(void); -extern void thread_allowmigrate(void); -extern void weakbinding_stop(void); -extern void weakbinding_start(void); - -/* - * The following routines affect the CPUs participation in interrupt processing, - * if that is applicable on the architecture. This only affects interrupts - * which aren't directed at the processor (not cross calls). - * - * cpu_disable_intr returns non-zero if interrupts were previously enabled. - */ -int cpu_disable_intr(struct cpu *cp); /* stop issuing interrupts to cpu */ -void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */ - -/* - * The mutex cpu_lock protects cpu_flags for all CPUs, as well as the ncpus - * and ncpus_online counts. - */ -extern kmutex_t cpu_lock; /* lock protecting CPU data */ - -/* - * CPU state change events - * - * Various subsystems need to know when CPUs change their state. They get this - * information by registering CPU state change callbacks using - * register_cpu_setup_func(). Whenever any CPU changes its state, the callback - * function is called. The callback function is passed three arguments: - * - * Event, described by cpu_setup_t - * CPU ID - * Transparent pointer passed when registering the callback - * - * The callback function is called with cpu_lock held. The return value from the - * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG - * events. For these two events, non-zero return value indicates a failure and - * prevents successful completion of the operation. - * - * New events may be added in the future. Callback functions should ignore any - * events that they do not understand. 
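A small sketch of the registration pattern described above, written as a hypothetical subsystem callback; only register_cpu_setup_func(), cpu_lock, the cpu_setup_t events and the three-argument callback shape come from this header, everything else (names, the init routine, taking cpu_lock around registration) is assumed for illustration:

/*
 * Hypothetical callback: reacts to the two events whose return value
 * matters and ignores anything it does not understand, as the comment
 * above requires.  Per the header, callbacks run with cpu_lock held.
 */
static int
mysub_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	switch (what) {
	case CPU_CONFIG:
		/* prepare per-CPU state for CPU "id"; non-zero fails the op */
		return (0);
	case CPU_UNCONFIG:
		/* tear down per-CPU state for CPU "id" */
		return (0);
	default:
		return (0);	/* unknown or uninteresting events are ignored */
	}
}

static void
mysub_init(void)
{
	/* registration is assumed here to be done under cpu_lock */
	mutex_enter(&cpu_lock);
	register_cpu_setup_func(mysub_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}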
- * - * The following events provide notification callbacks: - * - * CPU_INIT A new CPU is started and added to the list of active CPUs - * This event is only used during boot - * - * CPU_CONFIG A newly inserted CPU is prepared for starting running code - * This event is called by DR code - * - * CPU_UNCONFIG CPU has been powered off and needs cleanup - * This event is called by DR code - * - * CPU_ON CPU is enabled but does not run anything yet - * - * CPU_INTR_ON CPU is enabled and has interrupts enabled - * - * CPU_OFF CPU is going offline but can still run threads - * - * CPU_CPUPART_OUT CPU is going to move out of its partition - * - * CPU_CPUPART_IN CPU is going to move to a new partition - * - * CPU_SETUP CPU is set up during boot and can run threads - */ -typedef enum { - CPU_INIT, - CPU_CONFIG, - CPU_UNCONFIG, - CPU_ON, - CPU_OFF, - CPU_CPUPART_IN, - CPU_CPUPART_OUT, - CPU_SETUP, - CPU_INTR_ON -} cpu_setup_t; - -typedef int cpu_setup_func_t(cpu_setup_t, int, void *); - -/* - * Routines used to register interest in cpu's being added to or removed - * from the system. - */ -extern void register_cpu_setup_func(cpu_setup_func_t *, void *); -extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); -extern void cpu_state_change_notify(int, cpu_setup_t); - -/* - * Call specified function on the given CPU - */ -typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t); -extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t); - - -/* - * Create various strings that describe the given CPU for the - * processor_info system call and configuration-related kstats. - */ -#define CPU_IDSTRLEN 100 - -extern void init_cpu_info(struct cpu *); -extern void populate_idstr(struct cpu *); -extern void cpu_vm_data_init(struct cpu *); -extern void cpu_vm_data_destroy(struct cpu *); - -#endif /* _KERNEL || _FAKE_KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_CPUVAR_H */ Index: sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h +++ sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h @@ -50,25 +50,22 @@ #ifndef _ASM #include +#ifdef _KERNEL +#include +#endif #include #include -#ifdef illumos -#include -#else #include #include #include #include +#include +#include +#include #include typedef int model_t; -#endif #include -#ifdef illumos -#include -#include -#else #include -#endif /* * DTrace Universal Constants and Typedefs Index: sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h +++ sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h @@ -50,6 +50,7 @@ */ #include +#include #ifndef illumos #ifdef __sparcv9 Index: sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h +++ /dev/null @@ -1,351 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - */ - -#ifndef _SYS_NVPAIR_H -#define _SYS_NVPAIR_H - -#include -#include -#include - -#if defined(_KERNEL) && !defined(_BOOT) -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef enum { - DATA_TYPE_DONTCARE = -1, - DATA_TYPE_UNKNOWN = 0, - DATA_TYPE_BOOLEAN, - DATA_TYPE_BYTE, - DATA_TYPE_INT16, - DATA_TYPE_UINT16, - DATA_TYPE_INT32, - DATA_TYPE_UINT32, - DATA_TYPE_INT64, - DATA_TYPE_UINT64, - DATA_TYPE_STRING, - DATA_TYPE_BYTE_ARRAY, - DATA_TYPE_INT16_ARRAY, - DATA_TYPE_UINT16_ARRAY, - DATA_TYPE_INT32_ARRAY, - DATA_TYPE_UINT32_ARRAY, - DATA_TYPE_INT64_ARRAY, - DATA_TYPE_UINT64_ARRAY, - DATA_TYPE_STRING_ARRAY, - DATA_TYPE_HRTIME, - DATA_TYPE_NVLIST, - DATA_TYPE_NVLIST_ARRAY, - DATA_TYPE_BOOLEAN_VALUE, - DATA_TYPE_INT8, - DATA_TYPE_UINT8, - DATA_TYPE_BOOLEAN_ARRAY, - DATA_TYPE_INT8_ARRAY, -#if !defined(_KERNEL) - DATA_TYPE_UINT8_ARRAY, - DATA_TYPE_DOUBLE -#else - DATA_TYPE_UINT8_ARRAY -#endif -} data_type_t; - -typedef struct nvpair { - int32_t nvp_size; /* size of this nvpair */ - int16_t nvp_name_sz; /* length of name string */ - int16_t nvp_reserve; /* not used */ - int32_t nvp_value_elem; /* number of elements for array types */ - data_type_t nvp_type; /* type of value */ - /* name string */ - /* aligned ptr array for string arrays */ - /* aligned array of data for value */ -} nvpair_t; - -/* nvlist header */ -typedef struct nvlist { - int32_t nvl_version; - uint32_t nvl_nvflag; /* persistent flags */ - uint64_t nvl_priv; /* ptr to private data if not packed */ - uint32_t nvl_flag; - int32_t nvl_pad; /* currently not used, for alignment */ -} nvlist_t; - -/* nvp implementation version */ -#define NV_VERSION 0 - -/* nvlist pack encoding */ -#define NV_ENCODE_NATIVE 0 -#define NV_ENCODE_XDR 1 - -/* nvlist persistent unique name flags, stored in nvl_nvflags */ -#define NV_UNIQUE_NAME 0x1 -#define NV_UNIQUE_NAME_TYPE 0x2 - -/* nvlist lookup pairs related flags */ -#define NV_FLAG_NOENTOK 0x1 - -/* convenience macros */ -#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul) -#define NV_ALIGN4(x) (((x) + 3) & ~3) - -#define NVP_SIZE(nvp) ((nvp)->nvp_size) -#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t)) -#define NVP_TYPE(nvp) ((nvp)->nvp_type) -#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem) -#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \ - + (nvp)->nvp_name_sz)) - -#define NVL_VERSION(nvl) ((nvl)->nvl_version) -#define NVL_SIZE(nvl) ((nvl)->nvl_size) -#define NVL_FLAG(nvl) ((nvl)->nvl_flag) - -/* NV allocator framework */ -typedef struct nv_alloc_ops nv_alloc_ops_t; - -typedef struct nv_alloc { - const nv_alloc_ops_t *nva_ops; - void *nva_arg; -} nv_alloc_t; - -struct nv_alloc_ops { - int (*nv_ao_init)(nv_alloc_t *, __va_list); - void (*nv_ao_fini)(nv_alloc_t *); - void *(*nv_ao_alloc)(nv_alloc_t *, size_t); - void (*nv_ao_free)(nv_alloc_t *, void *, size_t); - void (*nv_ao_reset)(nv_alloc_t *); -}; - -extern const nv_alloc_ops_t *nv_fixed_ops; -extern nv_alloc_t *nv_alloc_nosleep; - -#if defined(_KERNEL) && !defined(_BOOT) 
-extern nv_alloc_t *nv_alloc_sleep; -#endif - -int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...); -void nv_alloc_reset(nv_alloc_t *); -void nv_alloc_fini(nv_alloc_t *); - -/* list management */ -int nvlist_alloc(nvlist_t **, uint_t, int); -void nvlist_free(nvlist_t *); -int nvlist_size(nvlist_t *, size_t *, int); -int nvlist_pack(nvlist_t *, char **, size_t *, int, int); -int nvlist_unpack(char *, size_t, nvlist_t **, int); -int nvlist_dup(nvlist_t *, nvlist_t **, int); -int nvlist_merge(nvlist_t *, nvlist_t *, int); - -uint_t nvlist_nvflag(nvlist_t *); - -int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *); -int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *); -int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *); -int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *); -nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *); - -int nvlist_add_nvpair(nvlist_t *, nvpair_t *); -int nvlist_add_boolean(nvlist_t *, const char *); -int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); -int nvlist_add_byte(nvlist_t *, const char *, uchar_t); -int nvlist_add_int8(nvlist_t *, const char *, int8_t); -int nvlist_add_uint8(nvlist_t *, const char *, uint8_t); -int nvlist_add_int16(nvlist_t *, const char *, int16_t); -int nvlist_add_uint16(nvlist_t *, const char *, uint16_t); -int nvlist_add_int32(nvlist_t *, const char *, int32_t); -int nvlist_add_uint32(nvlist_t *, const char *, uint32_t); -int nvlist_add_int64(nvlist_t *, const char *, int64_t); -int nvlist_add_uint64(nvlist_t *, const char *, uint64_t); -int nvlist_add_string(nvlist_t *, const char *, const char *); -int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); -int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); -int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); -int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); -int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); -int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); -int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); -int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); -int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); -int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); -int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); -int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t); -int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); -int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t); -#if !defined(_KERNEL) -int nvlist_add_double(nvlist_t *, const char *, double); -#endif - -int nvlist_remove(nvlist_t *, const char *, data_type_t); -int nvlist_remove_all(nvlist_t *, const char *); -int nvlist_remove_nvpair(nvlist_t *, nvpair_t *); - -int nvlist_lookup_boolean(nvlist_t *, const char *); -int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *); -int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *); -int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *); -int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *); -int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *); -int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *); -int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *); -int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *); -int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *); -int 
nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *); -int nvlist_lookup_string(nvlist_t *, const char *, char **); -int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **); -int nvlist_lookup_boolean_array(nvlist_t *, const char *, - boolean_t **, uint_t *); -int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *); -int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *); -int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *); -int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *); -int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *); -int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *); -int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *); -int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *); -int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *); -int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *); -int nvlist_lookup_nvlist_array(nvlist_t *, const char *, - nvlist_t ***, uint_t *); -int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *); -int nvlist_lookup_pairs(nvlist_t *, int, ...); -#if !defined(_KERNEL) -int nvlist_lookup_double(nvlist_t *, const char *, double *); -#endif - -int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); -int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **, - int *, char **); -boolean_t nvlist_exists(nvlist_t *, const char *); -boolean_t nvlist_empty(nvlist_t *); - -/* processing nvpair */ -nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); -nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); -char *nvpair_name(nvpair_t *); -data_type_t nvpair_type(nvpair_t *); -int nvpair_type_is_array(nvpair_t *); -int nvpair_value_boolean_value(nvpair_t *, boolean_t *); -int nvpair_value_byte(nvpair_t *, uchar_t *); -int nvpair_value_int8(nvpair_t *, int8_t *); -int nvpair_value_uint8(nvpair_t *, uint8_t *); -int nvpair_value_int16(nvpair_t *, int16_t *); -int nvpair_value_uint16(nvpair_t *, uint16_t *); -int nvpair_value_int32(nvpair_t *, int32_t *); -int nvpair_value_uint32(nvpair_t *, uint32_t *); -int nvpair_value_int64(nvpair_t *, int64_t *); -int nvpair_value_uint64(nvpair_t *, uint64_t *); -int nvpair_value_string(nvpair_t *, char **); -int nvpair_value_nvlist(nvpair_t *, nvlist_t **); -int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *); -int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *); -int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *); -int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *); -int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *); -int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *); -int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *); -int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *); -int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *); -int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *); -int nvpair_value_string_array(nvpair_t *, char ***, uint_t *); -int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *); -int nvpair_value_hrtime(nvpair_t *, hrtime_t *); -#if !defined(_KERNEL) -int nvpair_value_double(nvpair_t *, double *); -#endif - -nvlist_t *fnvlist_alloc(void); -void fnvlist_free(nvlist_t *); -size_t fnvlist_size(nvlist_t *); -char *fnvlist_pack(nvlist_t *, size_t *); -void fnvlist_pack_free(char *, 
size_t); -nvlist_t *fnvlist_unpack(char *, size_t); -nvlist_t *fnvlist_dup(nvlist_t *); -void fnvlist_merge(nvlist_t *, nvlist_t *); -size_t fnvlist_num_pairs(nvlist_t *); - -void fnvlist_add_boolean(nvlist_t *, const char *); -void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); -void fnvlist_add_byte(nvlist_t *, const char *, uchar_t); -void fnvlist_add_int8(nvlist_t *, const char *, int8_t); -void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t); -void fnvlist_add_int16(nvlist_t *, const char *, int16_t); -void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t); -void fnvlist_add_int32(nvlist_t *, const char *, int32_t); -void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t); -void fnvlist_add_int64(nvlist_t *, const char *, int64_t); -void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t); -void fnvlist_add_string(nvlist_t *, const char *, const char *); -void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); -void fnvlist_add_nvpair(nvlist_t *, nvpair_t *); -void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); -void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); -void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); -void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); -void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); -void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); -void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); -void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); -void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); -void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); -void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t); -void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); - -void fnvlist_remove(nvlist_t *, const char *); -void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); - -nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name); -boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name); -boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name); -uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name); -int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name); -int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name); -int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name); -int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name); -uint8_t fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name); -uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name); -uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name); -uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name); -char *fnvlist_lookup_string(nvlist_t *nvl, const char *name); -nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name); - -boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); -uchar_t fnvpair_value_byte(nvpair_t *nvp); -int8_t fnvpair_value_int8(nvpair_t *nvp); -int16_t fnvpair_value_int16(nvpair_t *nvp); -int32_t fnvpair_value_int32(nvpair_t *nvp); -int64_t fnvpair_value_int64(nvpair_t *nvp); -uint8_t fnvpair_value_uint8_t(nvpair_t *nvp); -uint16_t fnvpair_value_uint16(nvpair_t *nvp); -uint32_t fnvpair_value_uint32(nvpair_t *nvp); -uint64_t fnvpair_value_uint64(nvpair_t *nvp); -char *fnvpair_value_string(nvpair_t *nvp); -nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); - -#ifdef __cplusplus 
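For reference, the typical call pattern for the list-management and lookup interfaces declared above, sketched as a small routine; the key names, values and error handling are made up for illustration and are not taken from the header:

/* Illustrative use of the nvlist interfaces declared above. */
static int
nvlist_usage_sketch(void)
{
	nvlist_t *nvl;
	uint64_t guid;
	char *name;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (-1);

	(void) nvlist_add_string(nvl, "name", "example");	/* keys are made up */
	(void) nvlist_add_uint64(nvl, "guid", 0x1234ULL);

	if (nvlist_lookup_string(nvl, "name", &name) == 0 &&
	    nvlist_lookup_uint64(nvl, "guid", &guid) == 0) {
		/* "name" and "guid" now hold the values stored above */
	}

	nvlist_free(nvl);
	return (0);
}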
-} -#endif - -#endif /* _SYS_NVPAIR_H */ Index: sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h +++ /dev/null @@ -1,427 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 RackTop Systems. - */ - -/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ - -/* - * University Copyright- Copyright (c) 1982, 1986, 1988 - * The Regents of the University of California - * All Rights Reserved - * - * University Acknowledgment- Portions of this document are derived from - * software developed by the University of California, Berkeley, and its - * contributors. - */ - -#ifndef _SYS_VNODE_H -#define _SYS_VNODE_H - -#include_next - -#define IS_DEVVP(vp) \ - ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) - -#define V_XATTRDIR 0x0000 /* attribute unnamed directory */ - -#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ - -/* - * Structure of all optional attributes. - */ -typedef struct xoptattr { - timestruc_t xoa_createtime; /* Create time of file */ - uint8_t xoa_archive; - uint8_t xoa_system; - uint8_t xoa_readonly; - uint8_t xoa_hidden; - uint8_t xoa_nounlink; - uint8_t xoa_immutable; - uint8_t xoa_appendonly; - uint8_t xoa_nodump; - uint8_t xoa_opaque; - uint8_t xoa_av_quarantined; - uint8_t xoa_av_modified; - uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; - uint8_t xoa_reparse; - uint64_t xoa_generation; - uint8_t xoa_offline; - uint8_t xoa_sparse; -} xoptattr_t; - -/* - * The xvattr structure is really a variable length structure that - * is made up of: - * - The classic vattr_t (xva_vattr) - * - a 32 bit quantity (xva_mapsize) that specifies the size of the - * attribute bitmaps in 32 bit words. - * - A pointer to the returned attribute bitmap (needed because the - * previous element, the requested attribute bitmap) is variable lenth. - * - The requested attribute bitmap, which is an array of 32 bit words. - * Callers use the XVA_SET_REQ() macro to set the bits corresponding to - * the attributes that are being requested. - * - The returned attribute bitmap, which is an array of 32 bit words. - * File systems that support optional attributes use the XVA_SET_RTN() - * macro to set the bits corresponding to the attributes that are being - * returned. - * - The xoptattr_t structure which contains the attribute values - * - * xva_mapsize determines how many words in the attribute bitmaps. - * Immediately following the attribute bitmaps is the xoptattr_t. 
- * xva_getxoptattr() is used to get the pointer to the xoptattr_t - * section. - */ - -#define XVA_MAPSIZE 3 /* Size of attr bitmaps */ -#define XVA_MAGIC 0x78766174 /* Magic # for verification */ - -/* - * The xvattr structure is an extensible structure which permits optional - * attributes to be requested/returned. File systems may or may not support - * optional attributes. They do so at their own discretion but if they do - * support optional attributes, they must register the VFSFT_XVATTR feature - * so that the optional attributes can be set/retrived. - * - * The fields of the xvattr structure are: - * - * xva_vattr - The first element of an xvattr is a legacy vattr structure - * which includes the common attributes. If AT_XVATTR is set in the va_mask - * then the entire structure is treated as an xvattr. If AT_XVATTR is not - * set, then only the xva_vattr structure can be used. - * - * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification. - * - * xva_mapsize - Size of requested and returned attribute bitmaps. - * - * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. We need this since the - * size of the array before it, xva_reqattrmap[], could change which means - * the location of xva_rtnattrmap[] could change. This will allow unbundled - * file systems to find the location of xva_rtnattrmap[] when the sizes change. - * - * xva_reqattrmap[] - Array of requested attributes. Attributes are - * represented by a specific bit in a specific element of the attribute - * map array. Callers set the bits corresponding to the attributes - * that the caller wants to get/set. - * - * xva_rtnattrmap[] - Array of attributes that the file system was able to - * process. Not all file systems support all optional attributes. This map - * informs the caller which attributes the underlying file system was able - * to set/get. (Same structure as the requested attributes array in terms - * of each attribute corresponding to specific bits and array elements.) - * - * xva_xoptattrs - Structure containing values of optional attributes. - * These values are only valid if the corresponding bits in xva_reqattrmap - * are set and the underlying file system supports those attributes. - */ -typedef struct xvattr { - vattr_t xva_vattr; /* Embedded vattr structure */ - uint32_t xva_magic; /* Magic Number */ - uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ - uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ - uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ - uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ - xoptattr_t xva_xoptattrs; /* Optional attributes */ -} xvattr_t; - -/* - * Attributes of interest to the caller of setattr or getattr. - */ -#define AT_TYPE 0x00001 -#define AT_MODE 0x00002 -#define AT_UID 0x00004 -#define AT_GID 0x00008 -#define AT_FSID 0x00010 -#define AT_NODEID 0x00020 -#define AT_NLINK 0x00040 -#define AT_SIZE 0x00080 -#define AT_ATIME 0x00100 -#define AT_MTIME 0x00200 -#define AT_CTIME 0x00400 -#define AT_RDEV 0x00800 -#define AT_BLKSIZE 0x01000 -#define AT_NBLOCKS 0x02000 -/* 0x04000 */ /* unused */ -#define AT_SEQ 0x08000 -/* - * If AT_XVATTR is set then there are additional bits to process in - * the xvattr_t's attribute bitmap. If this is not set then the bitmap - * MUST be ignored. Note that this bit must be set/cleared explicitly. - * That is, setting AT_ALL will NOT set AT_XVATTR. 
- */ -#define AT_XVATTR 0x10000 - -#define AT_ALL (AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\ - AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\ - AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) - -#define AT_STAT (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\ - AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_TYPE) - -#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME) - -#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\ - AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) - -/* - * Attribute bits used in the extensible attribute's (xva's) attribute - * bitmaps. Note that the bitmaps are made up of a variable length number - * of 32-bit words. The convention is to use XAT{n}_{attrname} where "n" - * is the element in the bitmap (starting at 1). This convention is for - * the convenience of the maintainer to keep track of which element each - * attribute belongs to. - * - * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS - * MUST USE THE XAT_* DEFINES. - */ -#define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */ -#define XAT0_CREATETIME 0x00000001 /* Create time of file */ -#define XAT0_ARCHIVE 0x00000002 /* Archive */ -#define XAT0_SYSTEM 0x00000004 /* System */ -#define XAT0_READONLY 0x00000008 /* Readonly */ -#define XAT0_HIDDEN 0x00000010 /* Hidden */ -#define XAT0_NOUNLINK 0x00000020 /* Nounlink */ -#define XAT0_IMMUTABLE 0x00000040 /* immutable */ -#define XAT0_APPENDONLY 0x00000080 /* appendonly */ -#define XAT0_NODUMP 0x00000100 /* nodump */ -#define XAT0_OPAQUE 0x00000200 /* opaque */ -#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */ -#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */ -#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */ -#define XAT0_REPARSE 0x00002000 /* FS reparse point */ -#define XAT0_GEN 0x00004000 /* object generation number */ -#define XAT0_OFFLINE 0x00008000 /* offline */ -#define XAT0_SPARSE 0x00010000 /* sparse */ - -#define XAT0_ALL_ATTRS (XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \ - XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \ - XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED| XAT0_AV_MODIFIED| \ - XAT0_AV_SCANSTAMP|XAT0_REPARSE|XATO_GEN|XAT0_OFFLINE|XAT0_SPARSE) - -/* Support for XAT_* optional attributes */ -#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */ -#define XVA_SHFT 32 /* Used to shift index */ - -/* - * Used to pry out the index and attribute bits from the XAT_* attributes - * defined below. Note that we're masking things down to 32 bits then - * casting to uint32_t. - */ -#define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK)) -#define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK)) - -/* - * The following defines present a "flat namespace" so that consumers don't - * need to keep track of which element belongs to which bitmap entry. 
- * - * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER - */ -#define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME) -#define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE) -#define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM) -#define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY) -#define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN) -#define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK) -#define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE) -#define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY) -#define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP) -#define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE) -#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED) -#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED) -#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP) -#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE) -#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN) -#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE) -#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE) - -/* - * The returned attribute map array (xva_rtnattrmap[]) is located past the - * requested attribute map array (xva_reqattrmap[]). Its location changes - * when the array sizes change. We use a separate pointer in a known location - * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is - * set in xva_init() - */ -#define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp) - -/* - * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap - * of requested attributes (xva_reqattrmap[]). - */ -#define XVA_SET_REQ(xvap, attr) { \ - ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ - ASSERT((xvap)->xva_magic == XVA_MAGIC); \ - (xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \ -} -/* - * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap - * of requested attributes (xva_reqattrmap[]). - */ -#define XVA_CLR_REQ(xvap, attr) { \ - ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ - ASSERT((xvap)->xva_magic == XVA_MAGIC); \ - (xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr); \ -} - -/* - * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap - * of returned attributes (xva_rtnattrmap[]). - */ -#define XVA_SET_RTN(xvap, attr) { \ - ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ - ASSERT((xvap)->xva_magic == XVA_MAGIC); \ - (XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \ -} - -/* - * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[]) - * to see of the corresponding attribute bit is set. If so, returns non-zero. - */ -#define XVA_ISSET_REQ(xvap, attr) \ - ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \ - ((xvap)->xva_magic == XVA_MAGIC) && \ - ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \ - ((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) - -/* - * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[]) - * to see of the corresponding attribute bit is set. If so, returns non-zero. - */ -#define XVA_ISSET_RTN(xvap, attr) \ - ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \ - ((xvap)->xva_magic == XVA_MAGIC) && \ - ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? 
\ - ((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) - -#define MODEMASK 07777 /* mode bits plus permission bits */ -#define PERMMASK 00777 /* permission bits */ - -/* - * VOP_ACCESS flags - */ -#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ - -/* - * Flags for vnode operations. - */ -enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */ -enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */ - -/* - * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations - */ - -typedef struct vsecattr { - uint_t vsa_mask; /* See below */ - int vsa_aclcnt; /* ACL entry count */ - void *vsa_aclentp; /* pointer to ACL entries */ - int vsa_dfaclcnt; /* default ACL entry count */ - void *vsa_dfaclentp; /* pointer to default ACL entries */ - size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ - uint_t vsa_aclflags; /* ACE ACL flags */ -} vsecattr_t; - -/* vsa_mask values */ -#define VSA_ACL 0x0001 -#define VSA_ACLCNT 0x0002 -#define VSA_DFACL 0x0004 -#define VSA_DFACLCNT 0x0008 -#define VSA_ACE 0x0010 -#define VSA_ACECNT 0x0020 -#define VSA_ACE_ALLTYPES 0x0040 -#define VSA_ACE_ACLFLAGS 0x0080 /* get/set ACE ACL flags */ - -/* - * Structure used by various vnode operations to determine - * the context (pid, host, identity) of a caller. - * - * The cc_caller_id is used to identify one or more callers who invoke - * operations, possibly on behalf of others. For example, the NFS - * server could have it's own cc_caller_id which can be detected by - * vnode/vfs operations or (FEM) monitors on those operations. New - * caller IDs are generated by fs_new_caller_id(). - */ -typedef struct caller_context { - pid_t cc_pid; /* Process ID of the caller */ - int cc_sysid; /* System ID, used for remote calls */ - u_longlong_t cc_caller_id; /* Identifier for (set of) caller(s) */ - ulong_t cc_flags; -} caller_context_t; - -struct taskq; - -/* - * Flags for VOP_LOOKUP - * - * Defined in file.h, but also possible, FIGNORECASE and FSEARCH - * - */ -#define LOOKUP_DIR 0x01 /* want parent dir vp */ -#define LOOKUP_XATTR 0x02 /* lookup up extended attr dir */ -#define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ -#define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */ - -/* - * Flags for VOP_READDIR - */ -#define V_RDDIR_ENTFLAGS 0x01 /* request dirent flags */ -#define V_RDDIR_ACCFILTER 0x02 /* filter out inaccessible dirents */ - -/* - * Public vnode manipulation functions. - */ -#ifdef _KERNEL - -void vn_rele_async(struct vnode *vp, struct taskq *taskq); - -/* - * Extensible vnode attribute (xva) routines: - * xva_init() initializes an xvattr_t (zero struct, init mapsize, set AT_XATTR) - * xva_getxoptattr() returns a ponter to the xoptattr_t section of xvattr_t - */ -void xva_init(xvattr_t *); -xoptattr_t *xva_getxoptattr(xvattr_t *); /* Get ptr to xoptattr_t */ - -#define VN_RELE_ASYNC(vp, taskq) { \ - vn_rele_async(vp, taskq); \ -} - -#endif /* _KERNEL */ - -/* - * Flags to VOP_SETATTR/VOP_GETATTR. 
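A condensed sketch of the request/return protocol described above, seen from the caller's side; only the xva_*, XVA_* and xoa_* interfaces are taken from this header, while the elided VOP_GETATTR() call site and the vp/cr/ct names are assumed placeholders:

/*
 * Illustrative caller: request one optional attribute and read it back.
 * Error handling and the actual VOP_GETATTR() invocation are elided.
 */
static void
xvattr_usage_sketch(void)
{
	xvattr_t xva;
	xoptattr_t *xoap;

	xva_init(&xva);				/* zeroes, sets mapsize and the AT_XVATTR request bit */
	XVA_SET_REQ(&xva, XAT_READONLY);	/* ask for the readonly attribute */

	/* ... VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, ct) would run here ... */

	if (XVA_ISSET_RTN(&xva, XAT_READONLY) &&
	    (xoap = xva_getxoptattr(&xva)) != NULL) {
		/* xoap->xoa_readonly is valid only because the bit came back */
	}
}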
- */
-#define ATTR_UTIME 0x01 /* non-default utime(2) request */
-#define ATTR_EXEC 0x02 /* invocation from exec(2) */
-#define ATTR_COMM 0x04 /* yield common vp attributes */
-#define ATTR_HINT 0x08 /* information returned will be `hint' */
-#define ATTR_REAL 0x10 /* yield attributes of the real vp */
-#define ATTR_NOACLCHECK 0x20 /* Don't check ACL when checking permissions */
-#define ATTR_TRIGGER 0x40 /* Mount first if vnode is a trigger mount */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VNODE_H */
Index: sys/cddl/dev/dtrace/amd64/dtrace_subr.c
===================================================================
--- sys/cddl/dev/dtrace/amd64/dtrace_subr.c
+++ sys/cddl/dev/dtrace/amd64/dtrace_subr.c
@@ -36,7 +36,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
Index: sys/cddl/dev/fbt/fbt.c
===================================================================
--- sys/cddl/dev/fbt/fbt.c
+++ sys/cddl/dev/fbt/fbt.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
Index: sys/cddl/dev/profile/profile.c
===================================================================
--- sys/cddl/dev/profile/profile.c
+++ sys/cddl/dev/profile/profile.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
Index: sys/cddl/dev/sdt/sdt.c
===================================================================
--- sys/cddl/dev/sdt/sdt.c
+++ sys/cddl/dev/sdt/sdt.c
@@ -44,6 +44,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
Index: sys/cddl/dev/systrace/systrace.c
===================================================================
--- sys/cddl/dev/systrace/systrace.c
+++ sys/cddl/dev/systrace/systrace.c
@@ -290,7 +290,7 @@
 static void
 systrace_destroy(void *arg, dtrace_id_t id, void *parg)
 {
-#ifdef DEBUG
+#ifdef SYSTRACE_DEBUG
 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
 	/*
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -133,184 +133,232 @@
 cam/scsi/scsi_targ_bh.c optional targbh
 cam/scsi/scsi_target.c optional targ
 cam/scsi/smp_all.c optional scbus
+
 # shared between zfs and dtrace
-cddl/compat/opensolaris/kern/opensolaris.c optional zfs | dtrace compile-with "${CDDL_C}"
-cddl/compat/opensolaris/kern/opensolaris_cmn_err.c optional zfs | dtrace compile-with "${CDDL_C}"
-cddl/compat/opensolaris/kern/opensolaris_kmem.c optional zfs | dtrace compile-with "${CDDL_C}"
+cddl/compat/opensolaris/kern/opensolaris.c optional dtrace compile-with "${CDDL_C}"
 cddl/compat/opensolaris/kern/opensolaris_misc.c optional zfs | dtrace compile-with "${CDDL_C}"
 cddl/compat/opensolaris/kern/opensolaris_proc.c optional zfs | dtrace compile-with "${CDDL_C}"
-cddl/compat/opensolaris/kern/opensolaris_sunddi.c optional zfs | dtrace compile-with "${CDDL_C}"
-cddl/compat/opensolaris/kern/opensolaris_taskq.c optional zfs | dtrace compile-with "${CDDL_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_misc.c optional zfs | dtrace compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c optional zfs | dtrace compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_taskq.c optional zfs | dtrace compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_kmem.c optional zfs | dtrace compile-with "${ZFS_C}"
+
+#zfs solaris portability layer
+contrib/openzfs/module/os/freebsd/spl/acl_common.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/callb.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/list.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_acl.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_kstat.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_policy.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_string.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_uio.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_vfs.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_vm.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_zone.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/os/freebsd/spl/spl_zlib.c optional zfs compile-with "${ZFS_C}"
+
+
 # zfs specific
-cddl/compat/opensolaris/kern/opensolaris_acl.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_dtrace.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_kobj.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_kstat.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_lookup.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_policy.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_string.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_sysevent.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_uio.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_vfs.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_vm.c optional zfs compile-with "${ZFS_C}"
-cddl/compat/opensolaris/kern/opensolaris_zone.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/acl/acl_common.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/avl/avl.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/lz4/lz4.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/nvpair/opensolaris_fnvpair.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair_alloc_fixed.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/unicode/u8_textprep.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfeature_common.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_comutil.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_deleg.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_fletcher.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c optional zfs compile-with "${ZFS_C}"
-cddl/contrib/opensolaris/common/zfs/zfs_prop.c optional zfs compile-with
"${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zpool_prop.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/common/zfs/zprop_common.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/vnode.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/aggsum.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/cityhash.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf_stats.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c optional zfs compile-with "${ZFS_C}" \ - warning "kernel contains CDDL licensed ZFS filesystem" -cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c optional zfs compile-with "${ZFS_C}" 
-cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/mmp.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_births.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c optional zfs compile-with "${ZFS_C}" 
-cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_get.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_global.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_iter.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zcp_synctask.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zle.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zthr.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/os/callb.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/os/fm.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/os/list.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/zmod/zmod.c optional zfs compile-with "${ZFS_C}" + +#zfs avl +contrib/openzfs/module/avl/avl.c optional zfs compile-with "${ZFS_C}" + # 
zfs lua support -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lapi.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lauxlib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbaselib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lbitlib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcode.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcompat.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lcorolib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lctype.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldebug.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldo.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ldump.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lfunc.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lgc.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/llex.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lmem.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lobject.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lopcodes.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lparser.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstate.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstring.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lstrlib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltable.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltablib.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/ltm.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lundump.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lvm.c optional zfs compile-with "${ZFS_C}" -cddl/contrib/opensolaris/uts/common/fs/zfs/lua/lzio.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lapi.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lauxlib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lbaselib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lcode.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lcompat.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lcorolib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lctype.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ldebug.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ldo.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lfunc.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lgc.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/llex.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lmem.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lobject.c optional zfs 
compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lopcodes.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lparser.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lstate.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lstring.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lstrlib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ltable.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ltablib.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/ltm.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lvm.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/lua/lzio.c optional zfs compile-with "${ZFS_C}" + +# zfs nvpair support +contrib/openzfs/module/nvpair/fnvpair.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/nvpair/nvpair.c optional zfs compile-with "${ZFS_RPC_C}" +contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/nvpair/nvpair_alloc_spl.c optional zfs compile-with "${ZFS_C}" + +#zfs platform compatibility code +contrib/openzfs/module/os/freebsd/zfs/abd_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/arc_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/crypto_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/dmu_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/hkdf.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/kmod_core.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/spa_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/spa_stats.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c optional zfs compile-with "${ZFS_C} -include $S/modules/zfs/zfs_config.h" +contrib/openzfs/module/os/freebsd/zfs/vdev_file.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_fuid_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/os/freebsd/zfs/zvol_os.c optional zfs compile-with "${ZFS_C}" + +#zfs unicode support +contrib/openzfs/module/unicode/uconv.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/unicode/u8_textprep.c optional zfs 
compile-with "${ZFS_C}" + +#zfs checksums / zcommon +contrib/openzfs/module/zcommon/cityhash.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfeature_common.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_comutil.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_deleg.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_namecheck.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_prop.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zpool_prop.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zprop_common.c optional zfs compile-with "${ZFS_C}" + +#zfs core common code +contrib/openzfs/module/zfs/abd.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/aggsum.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/arc.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/blkptr.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/bplist.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/bpobj.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/bptree.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/btree.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/bqueue.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dbuf.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dbuf_stats.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dataset_kstats.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/ddt.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_diff.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_object.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_objset.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_recv.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_redact.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_send.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_traverse.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_tx.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dmu_zfetch.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dnode.c optional zfs compile-with "${ZFS_C}" \ + warning "kernel contains CDDL licensed ZFS filesystem" +contrib/openzfs/module/zfs/dnode_sync.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_bookmark.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_crypt.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_dataset.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_deadlist.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_deleg.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_destroy.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_dir.c optional zfs 
compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_pool.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_prop.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_scan.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_synctask.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/dsl_userhold.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/fm.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/gzip.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/lzjb.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/lz4.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/metaslab.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/mmp.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/multilist.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/objlist.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/pathname.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/range_tree.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/refcount.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/sa.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/sha256.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/skein_zfs.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_boot.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_checkpoint.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_config.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_errlog.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_history.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_log_spacemap.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/spa_misc.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/space_map.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/space_reftree.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/txg.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/uberblock.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/unique.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_indirect_births.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_indirect_mapping.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_initialize.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_label.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_mirror.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_missing.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_queue.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c optional 
zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_rebuild.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_removal.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_root.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_trim.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zap.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zap_leaf.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zap_micro.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_get.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_global.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_iter.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_set.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zcp_synctask.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfeature.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_byteswap.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_fm.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_fuid.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_ioctl.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_log.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_onexit.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_quota.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_ratelimit.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_replay.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_rlock.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zfs_sa.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zil.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zio.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zio_checksum.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zio_compress.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zio_inject.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zle.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zrlock.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zthr.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/zvol.c optional zfs compile-with "${ZFS_C}" + # dtrace specific cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c optional dtrace compile-with "${DTRACE_C}" \ warning "kernel contains CDDL licensed DTRACE" Index: sys/conf/files.amd64 =================================================================== --- sys/conf/files.amd64 +++ sys/conf/files.amd64 @@ -463,3 +463,13 @@ x86/xen/pv.c optional xenhvm x86/xen/pvcpu_enum.c optional xenhvm x86/xen/xen_pci_bus.c optional xenhvm + +contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher_intel.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zcommon/zfs_fletcher_sse.c optional zfs compile-with "${ZFS_C}" + +contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c 
optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c optional zfs compile-with "${ZFS_C}"
+contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c optional zfs compile-with "${ZFS_C}"
Index: sys/conf/kern.pre.mk
===================================================================
--- sys/conf/kern.pre.mk
+++ sys/conf/kern.pre.mk
@@ -204,34 +204,80 @@
 ZSTD_DECOMPRESS_BLOCK_FLAGS= -fno-tree-vectorize
 .endif
+ZINCDIR=$S/contrib/openzfs/include
 # Common for dtrace / zfs
-CDDL_CFLAGS= -DFREEBSD_NAMECACHE -nostdinc -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S -I$S/cddl/contrib/opensolaris/common ${CFLAGS} -Wno-unknown-pragmas -Wno-missing-prototypes -Wno-undef -Wno-strict-prototypes -Wno-cast-qual -Wno-parentheses -Wno-redundant-decls -Wno-missing-braces -Wno-uninitialized -Wno-unused -Wno-inline -Wno-switch -Wno-pointer-arith -Wno-unknown-pragmas
-CDDL_CFLAGS+= -include $S/cddl/compat/opensolaris/sys/debug_compat.h
+CDDL_CFLAGS= \
+	-DFREEBSD_NAMECACHE \
+	-D_SYS_VMEM_H_ \
+	-D__KERNEL \
+	-D__KERNEL__ \
+	-DKERNEL_STATIC \
+	-nostdinc \
+	-include $S/modules/zfs/static_ccompile.h \
+	-I${ZINCDIR} \
+	-I${ZINCDIR}/spl \
+	-I${ZINCDIR}/os/freebsd \
+	-I${ZINCDIR}/os/freebsd/spl \
+	-I${ZINCDIR}/os/freebsd/zfs \
+	-I$S/modules/zfs \
+	${CFLAGS} \
+	-Wno-unknown-pragmas \
+	-Wno-missing-prototypes \
+	-Wno-undef \
+	-Wno-strict-prototypes \
+	-Wno-cast-qual \
+	-Wno-parentheses \
+	-Wno-redundant-decls \
+	-Wno-missing-braces \
+	-Wno-uninitialized \
+	-Wno-unused \
+	-Wno-inline \
+	-Wno-switch \
+	-Wno-pointer-arith \
+	-Wno-unknown-pragmas \
+	-Wno-duplicate-decl-specifier \
+	-include ${ZINCDIR}/os/freebsd/spl/sys/ccompile.h \
+	-I$S/cddl/contrib/opensolaris/uts/common \
+	-I$S -I$S/cddl/compat/opensolaris
 CDDL_C= ${CC} -c ${CDDL_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}

 # Special flags for managing the compat compiles for ZFS
-ZFS_CFLAGS= -DBUILDING_ZFS -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs
-ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs/lua
-ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common/zmod
-ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/lz4
-ZFS_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/zfs
-ZFS_CFLAGS+= ${CDDL_CFLAGS}
+ZFS_CFLAGS+= ${CDDL_CFLAGS} -DBUILDING_ZFS -DHAVE_UIO_ZEROCOPY -DWITH_NETDUMP
+ZFS_CFLAGS+= -D__KERNEL__ -D_SYS_CONDVAR_H_ -DSMP
+
+.if ${MACHINE_ARCH} == "amd64"
+ZFS_CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F \
+	-DHAVE_SSSE3 -DHAVE_AVX512BW
+.endif
+
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
+	${MACHINE_ARCH} == "arm"
+ZFS_CFLAGS+= -DBITS_PER_LONG=32
+.else
+ZFS_CFLAGS+= -DBITS_PER_LONG=64
+.endif
+
+
 ZFS_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${ZFS_CFLAGS}
 ZFS_C= ${CC} -c ${ZFS_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}
+ZFS_RPC_C= ${CC} -c ${ZFS_CFLAGS} -DHAVE_RPC_TYPES ${WERROR} ${PROF} ${.IMPSRC}
 ZFS_S= ${CC} -c ${ZFS_ASM_CFLAGS} ${WERROR} ${.IMPSRC}
+
+
 # Special flags for managing the compat compiles for DTrace
 DTRACE_CFLAGS= -DBUILDING_DTRACE ${CDDL_CFLAGS} -I$S/cddl/dev/dtrace -I$S/cddl/dev/dtrace/${MACHINE_CPUARCH}
 .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
 DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/intel -I$S/cddl/dev/dtrace/x86
 .endif
-DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/util -I$S -DDIS_MEM -DSMP
+DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/common/util -I$S -DDIS_MEM -DSMP -I$S/cddl/compat/opensolaris
+DTRACE_CFLAGS+= -I$S/cddl/contrib/opensolaris/uts/common
 DTRACE_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${DTRACE_CFLAGS}
 DTRACE_C= ${CC} -c ${DTRACE_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}
 DTRACE_S= ${CC} -c ${DTRACE_ASM_CFLAGS} ${WERROR} ${.IMPSRC}

 # Special flags for managing the compat compiles for DTrace/FBT
-FBT_CFLAGS= -DBUILDING_DTRACE -nostdinc -I$S/cddl/dev/fbt/${MACHINE_CPUARCH} -I$S/cddl/dev/fbt -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common -I$S ${CDDL_CFLAGS}
+FBT_CFLAGS= -DBUILDING_DTRACE -nostdinc -I$S/cddl/dev/fbt/${MACHINE_CPUARCH} -I$S/cddl/dev/fbt ${CDDL_CFLAGS} -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common
 .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
 FBT_CFLAGS+= -I$S/cddl/dev/fbt/x86
 .endif
Index: sys/conf/kmod.mk
===================================================================
--- sys/conf/kmod.mk
+++ sys/conf/kmod.mk
@@ -532,6 +532,22 @@
 OBJS_DEPEND_GUESS+= opt_global.h
 .endif
+ZINCDIR=${SYSDIR}/contrib/openzfs/include
+OPENZFS_CFLAGS= \
+	-D_SYS_VMEM_H_ \
+	-D__KERNEL__ \
+	-nostdinc \
+	-DSMP \
+	-I${ZINCDIR} \
+	-I${ZINCDIR}/spl \
+	-I${ZINCDIR}/os/freebsd \
+	-I${ZINCDIR}/os/freebsd/spl \
+	-I${ZINCDIR}/os/freebsd/zfs \
+	-I${SYSDIR}/cddl/compat/opensolaris \
+	-I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
+	-include ${ZINCDIR}/os/freebsd/spl/sys/ccompile.h
+
+
 .include 
 .include 
 .include 
Index: sys/kern/genoffset.c
===================================================================
--- sys/kern/genoffset.c
+++ sys/kern/genoffset.c
@@ -32,6 +32,7 @@
 #endif
 #include 
 __FBSDID("$FreeBSD$");
+#include 
 #include 
 #include 
 #include 
Index: sys/modules/Makefile
===================================================================
--- sys/modules/Makefile
+++ sys/modules/Makefile
@@ -563,7 +563,7 @@
 SUBDIR+= tests
 .endif

-.if ${MK_ZFS} != "no" || defined(ALL_MODULES)
+.if ${MK_ZFS} != "no" || (defined(ALL_MODULES) && ${MACHINE_CPUARCH} != "powerpc")
 SUBDIR+= zfs
 .endif
Index: sys/modules/dtrace/dtaudit/Makefile
===================================================================
--- sys/modules/dtrace/dtaudit/Makefile
+++ sys/modules/dtrace/dtaudit/Makefile
@@ -8,9 +8,7 @@
 SRCS= audit_dtrace.c \
 	vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
-	-I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
-	-I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
 .include 
Index: sys/modules/dtrace/dtmalloc/Makefile
===================================================================
--- sys/modules/dtrace/dtmalloc/Makefile
+++ sys/modules/dtrace/dtmalloc/Makefile
@@ -8,9 +8,7 @@
 SRCS= dtmalloc.c
 SRCS+= vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
-	-I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
-	-I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
 .include 
Index: sys/modules/dtrace/dtnfscl/Makefile
===================================================================
--- sys/modules/dtrace/dtnfscl/Makefile
+++ sys/modules/dtrace/dtnfscl/Makefile
@@ -8,9 +8,7 @@
 SRCS= nfs_clkdtrace.c \
 	vnode_if.h
-CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \
-	-I${SYSDIR}/cddl/contrib/opensolaris/uts/common \
-	-I${SYSDIR}
+CFLAGS+= ${OPENZFS_CFLAGS}
 .include 
Index: sys/modules/dtrace/dtrace/Makefile
===================================================================
--- sys/modules/dtrace/dtrace/Makefile
+++ sys/modules/dtrace/dtrace/Makefile
@@ -20,9 +20,11 @@
 .PATH: ${SYSDIR}/cddl/dev/dtrace/x86
 SRCS+= dis_tables.c \
 	instr_size.c
-CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/intel \
 	-I${SYSDIR}/cddl/dev/dtrace/x86
+CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/intel \
+
-I${SYSDIR}/cddl/dev/dtrace/x86 + .endif +CFLAGS+= ${OPENZFS_CFLAGS} SRCS+= bus_if.h device_if.h vnode_if.h @@ -56,7 +58,7 @@ .include CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h - +CFLAGS.dtrace_asm.S+= -D_SYS_ERRNO_H_ -D_SYS_PARAM_H_ -DLOCORE CWARNFLAGS+= -Wno-parentheses CWARNFLAGS+= -Wno-uninitialized CWARNFLAGS+= -Wno-cast-qual Index: sys/modules/dtrace/fasttrap/Makefile =================================================================== --- sys/modules/dtrace/fasttrap/Makefile +++ sys/modules/dtrace/fasttrap/Makefile @@ -6,12 +6,10 @@ KMOD= fasttrap SRCS= fasttrap.c fasttrap_isa.c -SRCS+= vnode_if.h +SRCS+= vnode_if.h opt_global.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common/dtrace \ - -I${SYSDIR} +CFLAGS+= -include ${.OBJDIR}/opt_global.h +CFLAGS+= ${OPENZFS_CFLAGS} .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/intel Index: sys/modules/dtrace/fbt/Makefile =================================================================== --- sys/modules/dtrace/fbt/Makefile +++ sys/modules/dtrace/fbt/Makefile @@ -8,6 +8,7 @@ SRCS= fbt.c fbt_isa.c SRCS+= vnode_if.h + .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" CFLAGS+= -I${SYSDIR}/cddl/dev/fbt/x86 .PATH: ${SYSDIR}/cddl/dev/fbt/x86 @@ -16,10 +17,8 @@ .PATH: ${SYSDIR}/cddl/dev/fbt/${MACHINE_CPUARCH} .endif -CFLAGS+= -I${SYSDIR}/cddl/dev/fbt \ - -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} +CFLAGS+= -I${SYSDIR}/cddl/dev/fbt .include Index: sys/modules/dtrace/profile/Makefile =================================================================== --- sys/modules/dtrace/profile/Makefile +++ sys/modules/dtrace/profile/Makefile @@ -8,9 +8,7 @@ SRCS= profile.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include Index: sys/modules/dtrace/prototype/Makefile =================================================================== --- sys/modules/dtrace/prototype/Makefile +++ sys/modules/dtrace/prototype/Makefile @@ -8,9 +8,7 @@ SRCS= prototype.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include Index: sys/modules/dtrace/sdt/Makefile =================================================================== --- sys/modules/dtrace/sdt/Makefile +++ sys/modules/dtrace/sdt/Makefile @@ -8,10 +8,7 @@ SRCS= sdt.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include - CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h Index: sys/modules/dtrace/systrace/Makefile =================================================================== --- sys/modules/dtrace/systrace/Makefile +++ sys/modules/dtrace/systrace/Makefile @@ -8,10 +8,7 @@ SRCS= systrace.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common/dtrace \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} .include Index: sys/modules/dtrace/systrace_freebsd32/Makefile =================================================================== --- 
sys/modules/dtrace/systrace_freebsd32/Makefile +++ sys/modules/dtrace/systrace_freebsd32/Makefile @@ -8,9 +8,8 @@ SRCS= systrace.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} -DFREEBSD32_SYSTRACE +CFLAGS+= ${OPENZFS_CFLAGS} +CFLAGS+= -DFREEBSD32_SYSTRACE .include Index: sys/modules/dtrace/systrace_linux/Makefile =================================================================== --- sys/modules/dtrace/systrace_linux/Makefile +++ sys/modules/dtrace/systrace_linux/Makefile @@ -9,9 +9,8 @@ SRCS= systrace.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} -DLINUX_SYSTRACE +CFLAGS+= ${OPENZFS_CFLAGS} +CFLAGS+= -DLINUX_SYSTRACE .include Index: sys/modules/dtrace/systrace_linux32/Makefile =================================================================== --- sys/modules/dtrace/systrace_linux32/Makefile +++ sys/modules/dtrace/systrace_linux32/Makefile @@ -9,9 +9,8 @@ SRCS= systrace.c SRCS+= vnode_if.h -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} -DLINUX32_SYSTRACE +CFLAGS+= ${OPENZFS_CFLAGS} +CFLAGS+= -DLINUX32_SYSTRACE .include Index: sys/modules/opensolaris/Makefile =================================================================== --- sys/modules/opensolaris/Makefile +++ sys/modules/opensolaris/Makefile @@ -3,14 +3,16 @@ SYSDIR?= ${SRCTOP}/sys .PATH: ${SYSDIR}/cddl/compat/opensolaris/kern +.PATH: ${SYSDIR}/contrib/openzfs/module/os/freebsd/spl KMOD= opensolaris -SRCS= opensolaris.c \ - opensolaris_cmn_err.c \ - opensolaris_kmem.c \ - opensolaris_misc.c \ +SRCS= vnode_if.h \ + opensolaris.c \ opensolaris_proc.c \ - opensolaris_sunddi.c + spl_cmn_err.c \ + spl_kmem.c \ + spl_misc.c \ + spl_sunddi.c _A=${SYSDIR}/cddl/contrib/opensolaris/common/atomic .if exists(${_A}/${MACHINE_CPUARCH}/opensolaris_atomic.S) @@ -23,9 +25,7 @@ SRCS+= opensolaris_atomic.c .endif -CFLAGS+= -I${SYSDIR}/cddl/compat/opensolaris \ - -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ - -I${SYSDIR} +CFLAGS+= ${OPENZFS_CFLAGS} EXPORT_SYMS= cpu_core Index: sys/modules/zfs/Makefile =================================================================== --- sys/modules/zfs/Makefile +++ sys/modules/zfs/Makefile @@ -1,118 +1,333 @@ # $FreeBSD$ -SYSDIR?=${SRCTOP}/sys +SRCDIR=${SRCTOP}/sys/contrib/openzfs/module +INCDIR=${SRCTOP}/sys/contrib/openzfs/include KMOD= zfs -SRCS= bus_if.h device_if.h vnode_if.h opt_kstack_pages.h +.PATH: ${SRCDIR}/avl \ + ${SRCDIR}/lua \ + ${SRCDIR}/nvpair \ + ${SRCDIR}/os/freebsd/spl \ + ${SRCDIR}/os/freebsd/zfs \ + ${SRCDIR}/unicode \ + ${SRCDIR}/zcommon \ + ${SRCDIR}/zfs -SUNW= ${SYSDIR}/cddl/contrib/opensolaris +CFLAGS+= -I${INCDIR} +CFLAGS+= -I${INCDIR}/spl +CFLAGS+= -I${INCDIR}/os/freebsd +CFLAGS+= -I${INCDIR}/os/freebsd/spl +CFLAGS+= -I${INCDIR}/os/freebsd/zfs +CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/static_ccompile.h +CFLAGS+= -I${.CURDIR} -.PATH: ${SUNW}/common/acl -SRCS+= acl_common.c -.PATH: ${SUNW}/common/avl -SRCS+= avl.c -.PATH: ${SUNW}/common/nvpair -SRCS+= opensolaris_nvpair.c -SRCS+= opensolaris_nvpair_alloc_fixed.c -SRCS+= opensolaris_fnvpair.c -.PATH: ${SYSDIR}/cddl/contrib/opensolaris/common/unicode -SRCS+= u8_textprep.c -.PATH: ${SUNW}/common/lz4 -SRCS+= lz4.c - -.PATH: ${SYSDIR}/cddl/compat/opensolaris/kern -SRCS+= opensolaris_acl.c -SRCS+= opensolaris_dtrace.c -SRCS+= opensolaris_kobj.c 
-SRCS+= opensolaris_kstat.c -SRCS+= opensolaris_lookup.c -SRCS+= opensolaris_policy.c -SRCS+= opensolaris_string.c -SRCS+= opensolaris_sysevent.c -SRCS+= opensolaris_taskq.c -SRCS+= opensolaris_uio.c -SRCS+= opensolaris_vfs.c -SRCS+= opensolaris_vm.c -SRCS+= opensolaris_zone.c - -_A=${SYSDIR}/cddl/contrib/opensolaris/common/atomic -.if exists(${_A}/${MACHINE_CPUARCH}/opensolaris_atomic.S) -.PATH: ${_A}/${MACHINE_CPUARCH} -SRCS+= opensolaris_atomic.S -.elif exists(${_A}/${MACHINE_ARCH}/opensolaris_atomic.S) -.PATH: ${_A}/${MACHINE_ARCH} -SRCS+= opensolaris_atomic.S -.else -SRCS+= opensolaris_atomic.c +CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 +CFLAGS+= -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ +CFLAGS+= -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP + +.if ${MACHINE_ARCH} == "amd64" +CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_AVX512BW -DHAVE_SSSE3 .endif -.PATH: ${SUNW}/uts/common/fs -SRCS+= vnode.c - -.PATH: ${SUNW}/uts/common/os -SRCS+= callb.c -SRCS+= fm.c -SRCS+= list.c -SRCS+= nvpair_alloc_system.c - -.PATH: ${SUNW}/uts/common/zmod -SRCS+= zmod.c - -.PATH: ${SYSDIR}/crypto/sha2 -SRCS+= sha256c.c sha512c.c - -.PATH: ${SYSDIR}/crypto/skein -SRCS+= skein.c skein_block.c - -.PATH: ${SUNW}/common/zfs -.include "${SUNW}/uts/common/Makefile.files" -.PATH: ${SUNW}/uts/common/fs/zfs -ZFS_SRCS= ${ZFS_OBJS:C/.o$/.c/} -SRCS+= ${ZFS_SRCS} -SRCS+= vdev_geom.c -SRCS+= trim_map.c -.PATH: ${SUNW}/uts/common/fs/zfs/lua -LUA_SRCS= ${LUA_OBJS:C/.o$/.c/} -SRCS+= ${LUA_SRCS} - -# Use FreeBSD's namecache. -CFLAGS+=-DFREEBSD_NAMECACHE - -CFLAGS+=-I${SYSDIR}/cddl/compat/opensolaris -CFLAGS+=-I${SUNW}/uts/common/fs/zfs -CFLAGS+=-I${SUNW}/uts/common/fs/zfs/lua -CFLAGS+=-I${SUNW}/uts/common/zmod -CFLAGS+=-I${SUNW}/uts/common -CFLAGS+=-I${SYSDIR} -CFLAGS+=-I${SUNW}/common/zfs -CFLAGS+=-I${SUNW}/common/lz4 -CFLAGS+=-I${SUNW}/common -CFLAGS+=-DBUILDING_ZFS -CFLAGS.gcc+=-fms-extensions - -.if ${MACHINE_ARCH} == "powerpc64" -CFLAGS.gcc+=-mminimal-toc +.if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true" +# kernel must also be built with this option for this to work +CFLAGS+= -DDEBUG_VFS_LOCKS +.endif + +.if defined(WITH_GCOV) && ${WITH_GCOV} == "true" +CFLAGS+= -fprofile-arcs -ftest-coverage .endif -.ifdef ZFS_DEBUG -CFLAGS+=-DDEBUG=1 DEBUG_FLAGS=-g + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +CFLAGS+= -DBITS_PER_LONG=32 +.else +CFLAGS+= -DBITS_PER_LONG=64 +.endif + +SRCS= vnode_if.h device_if.h bus_if.h + +# avl +SRCS+= avl.c + +#lua +SRCS+= lapi.c \ + lauxlib.c \ + lbaselib.c \ + lcode.c \ + lcompat.c \ + lcorolib.c \ + lctype.c \ + ldebug.c \ + ldo.c \ + lfunc.c \ + lgc.c \ + llex.c \ + lmem.c \ + lobject.c \ + lopcodes.c \ + lparser.c \ + lstate.c \ + lstring.c \ + lstrlib.c \ + ltable.c \ + ltablib.c \ + ltm.c \ + lvm.c \ + lzio.c + +#nvpair +SRCS+= nvpair.c \ + fnvpair.c \ + nvpair_alloc_spl.c \ + nvpair_alloc_fixed.c + +#os/freebsd/spl +SRCS+= acl_common.c \ + btree.c \ + callb.c \ + list.c \ + spl_acl.c \ + spl_cmn_err.c \ + spl_dtrace.c \ + spl_kmem.c \ + spl_kstat.c \ + spl_misc.c \ + spl_policy.c \ + spl_string.c \ + spl_sunddi.c \ + spl_sysevent.c \ + spl_taskq.c \ + spl_uio.c \ + spl_vfs.c \ + spl_vm.c \ + spl_zone.c \ + sha256c.c \ + sha512c.c \ + spl_procfs_list.c \ + spl_zlib.c + + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +SRCS+= spl_atomic.c .endif +#os/freebsd/zfs +SRCS+= abd_os.c \ + crypto_os.c \ + dmu_os.c 
\ + hkdf.c \ + kmod_core.c \ + spa_os.c \ + sysctl_os.c \ + vdev_file.c \ + vdev_label_os.c \ + vdev_geom.c \ + zfs_acl.c \ + zfs_ctldir.c \ + zfs_dir.c \ + zfs_ioctl_compat.c \ + zfs_ioctl_os.c \ + zfs_log.c \ + zfs_replay.c \ + zfs_vfsops.c \ + zfs_vnops.c \ + zfs_znode.c \ + zio_crypt.c \ + zvol_os.c + +#unicode +SRCS+= uconv.c \ + u8_textprep.c + +#zcommon +SRCS+= zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_avx512.c \ + zfs_fletcher_intel.c \ + zfs_fletcher_sse.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zpool_prop.c \ + zprop_common.c + +#zfs +SRCS+= abd.c \ + aggsum.c \ + arc.c \ + arc_os.c \ + blkptr.c \ + bplist.c \ + bpobj.c \ + cityhash.c \ + dbuf.c \ + dbuf_stats.c \ + bptree.c \ + bqueue.c \ + dataset_kstats.c \ + ddt.c \ + ddt_zap.c \ + dmu.c \ + dmu_diff.c \ + dmu_object.c \ + dmu_objset.c \ + dmu_recv.c \ + dmu_redact.c \ + dmu_send.c \ + dmu_traverse.c \ + dmu_tx.c \ + dmu_zfetch.c \ + dnode.c \ + dnode_sync.c \ + dsl_dataset.c \ + dsl_deadlist.c \ + dsl_deleg.c \ + dsl_bookmark.c \ + dsl_dir.c \ + dsl_crypt.c \ + dsl_destroy.c \ + dsl_pool.c \ + dsl_prop.c \ + dsl_scan.c \ + dsl_synctask.c \ + dsl_userhold.c \ + fm.c \ + gzip.c \ + lzjb.c \ + lz4.c \ + metaslab.c \ + mmp.c \ + multilist.c \ + objlist.c \ + pathname.c \ + range_tree.c \ + refcount.c \ + rrwlock.c \ + sa.c \ + sha256.c \ + skein_zfs.c \ + spa.c \ + spa_boot.c \ + spa_checkpoint.c \ + spa_config.c \ + spa_errlog.c \ + spa_history.c \ + spa_log_spacemap.c \ + spa_misc.c \ + spa_stats.c \ + space_map.c \ + space_reftree.c \ + txg.c \ + uberblock.c \ + unique.c \ + vdev.c \ + vdev_cache.c \ + vdev_indirect.c \ + vdev_indirect_births.c \ + vdev_indirect_mapping.c \ + vdev_initialize.c \ + vdev_label.c \ + vdev_mirror.c \ + vdev_missing.c \ + vdev_queue.c \ + vdev_raidz.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ + vdev_raidz_math_avx2.c \ + vdev_raidz_math_avx512bw.c \ + vdev_raidz_math_avx512f.c \ + vdev_raidz_math_sse2.c \ + vdev_raidz_math_ssse3.c \ + vdev_rebuild.c \ + vdev_removal.c \ + vdev_root.c \ + vdev_trim.c \ + zap.c \ + zap_leaf.c \ + zap_micro.c \ + zcp.c \ + zcp_get.c \ + zcp_global.c \ + zcp_iter.c \ + zcp_set.c \ + zcp_synctask.c \ + zfeature.c \ + zfs_byteswap.c \ + zfs_debug.c \ + zfs_file_os.c \ + zfs_fm.c \ + zfs_fuid.c \ + zfs_fuid_os.c \ + zfs_ioctl.c \ + zfs_onexit.c \ + zfs_quota.c \ + zfs_ratelimit.c \ + zfs_rlock.c \ + zfs_sa.c \ + zil.c \ + zio.c \ + zio_checksum.c \ + zio_compress.c \ + zio_inject.c \ + zle.c \ + zrlock.c \ + zthr.c \ + zvol.c + .include -CFLAGS+= -include ${SYSDIR}/cddl/compat/opensolaris/sys/debug_compat.h - -CWARNFLAGS+=-Wno-missing-prototypes -CWARNFLAGS+=-Wno-undef -CWARNFLAGS+=-Wno-strict-prototypes -CWARNFLAGS+=-Wno-cast-qual -CWARNFLAGS+=-Wno-parentheses -CWARNFLAGS+=-Wno-redundant-decls -CWARNFLAGS+=-Wno-missing-braces -CWARNFLAGS+=-Wno-uninitialized -CWARNFLAGS+=-Wno-unused -CWARNFLAGS+=-Wno-inline -CWARNFLAGS+=-Wno-switch -CWARNFLAGS+=-Wno-pointer-arith + +CFLAGS.gcc+= -Wno-pointer-to-int-cast + +CFLAGS.lapi.c= -Wno-cast-qual +CFLAGS.lcompat.c= -Wno-cast-qual +CFLAGS.lobject.c= -Wno-cast-qual +CFLAGS.ltable.c= -Wno-cast-qual +CFLAGS.lvm.c= -Wno-cast-qual +CFLAGS.nvpair.c= -Wno-cast-qual -DHAVE_RPC_TYPES +CFLAGS.spl_string.c= -Wno-cast-qual +CFLAGS.spl_vm.c= -Wno-cast-qual +CFLAGS.spl_zlib.c= -Wno-cast-qual +CFLAGS.abd.c= -Wno-cast-qual +CFLAGS.zfs_log.c= -Wno-cast-qual +CFLAGS.zfs_vnops.c= -Wno-pointer-arith +CFLAGS.u8_textprep.c= 
-Wno-cast-qual +CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zprop_common.c= -Wno-cast-qual +CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.dmu.c= -Wno-cast-qual +CFLAGS.dmu_traverse.c= -Wno-cast-qual +CFLAGS.dsl_dir.c= -Wno-cast-qual +CFLAGS.dsl_deadlist.c= -Wno-cast-qual +CFLAGS.dsl_prop.c= -Wno-cast-qual +CFLAGS.fm.c= -Wno-cast-qual +CFLAGS.lz4.c= -Wno-cast-qual +CFLAGS.spa.c= -Wno-cast-qual +CFLAGS.spa_misc.c= -Wno-cast-qual +CFLAGS.sysctl_os.c= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h +CFLAGS.vdev_raidz.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.zap_leaf.c= -Wno-cast-qual +CFLAGS.zap_micro.c= -Wno-cast-qual +CFLAGS.zcp.c= -Wno-cast-qual +CFLAGS.zfs_fm.c= -Wno-cast-qual +CFLAGS.zfs_ioctl.c= -Wno-cast-qual +CFLAGS.zil.c= -Wno-cast-qual +CFLAGS.zio.c= -Wno-cast-qual +CFLAGS.zrlock.c= -Wno-cast-qual Index: sys/modules/zfs/static_ccompile.h =================================================================== --- /dev/null +++ sys/modules/zfs/static_ccompile.h @@ -0,0 +1,25 @@ +#ifndef _SPL_NVLIST_H_ +#define _SPL_NVLIST_H_ + +#ifdef INVARIANTS +#define ZFS_DEBUG +#endif + +#define nvlist_add_nvlist spl_nvlist_add_nvlist +#define nvlist_add_nvlist_array spl_nvlist_add_nvlist_array +#define nvlist_add_nvpair spl_nvlist_add_nvpair +#define nvlist_add_string spl_nvlist_add_string +#define nvlist_add_string_array spl_nvlist_add_string_array +#define nvlist_empty spl_nvlist_empty +#define nvlist_exists spl_nvlist_exists +#define nvlist_free spl_nvlist_free +#define nvlist_next_nvpair spl_nvlist_next_nvpair +#define nvlist_pack spl_nvlist_pack +#define nvlist_prev_nvpair spl_nvlist_prev_nvpair +#define nvlist_remove_nvpair spl_nvlist_remove_nvpair +#define nvlist_size spl_nvlist_size +#define nvlist_unpack spl_nvlist_unpack + +#define nvpair_type spl_nvpair_type +#define nvpair_name spl_nvpair_name +#endif Index: sys/modules/zfs/zfs_config.h =================================================================== --- /dev/null +++ sys/modules/zfs/zfs_config.h @@ -0,0 +1,707 @@ +/* zfs_config.h. Generated from zfs_config.h.in by configure. */ +/* zfs_config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to 1 if translation of program messages to the user's native + language is requested. 
*/ +/* #undef ENABLE_NLS */ + +/* bio_end_io_t wants 1 arg */ +/* #undef HAVE_1ARG_BIO_END_IO_T */ + +/* lookup_bdev() wants 1 arg */ +/* #undef HAVE_1ARG_LOOKUP_BDEV */ + +/* submit_bio() wants 1 arg */ +/* #undef HAVE_1ARG_SUBMIT_BIO */ + +/* bdi_setup_and_register() wants 2 args */ +/* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ + +/* lookup_bdev() wants 2 args */ +/* #undef HAVE_2ARGS_LOOKUP_BDEV */ + +/* vfs_getattr wants 2 args */ +/* #undef HAVE_2ARGS_VFS_GETATTR */ + +/* zlib_deflate_workspacesize() wants 2 args */ +/* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ + +/* bdi_setup_and_register() wants 3 args */ +/* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ + +/* vfs_getattr wants 3 args */ +/* #undef HAVE_3ARGS_VFS_GETATTR */ + +/* vfs_getattr wants 4 args */ +/* #undef HAVE_4ARGS_VFS_GETATTR */ + +/* kernel has access_ok with 'type' parameter */ +/* #undef HAVE_ACCESS_OK_TYPE */ + +/* posix_acl has refcount_t */ +/* #undef HAVE_ACL_REFCOUNT */ + +/* Define if host toolchain supports AES */ +#define HAVE_AES 1 + +#ifdef __amd64__ +#ifndef RESCUE +/* Define if host toolchain supports AVX */ +#define HAVE_AVX 1 +#endif + +/* Define if host toolchain supports AVX2 */ +#define HAVE_AVX2 1 + +/* Define if host toolchain supports AVX512BW */ +#define HAVE_AVX512BW 1 + +/* Define if host toolchain supports AVX512CD */ +#define HAVE_AVX512CD 1 + +/* Define if host toolchain supports AVX512DQ */ +#define HAVE_AVX512DQ 1 + +/* Define if host toolchain supports AVX512ER */ +#define HAVE_AVX512ER 1 + +/* Define if host toolchain supports AVX512F */ +#define HAVE_AVX512F 1 + +/* Define if host toolchain supports AVX512IFMA */ +#define HAVE_AVX512IFMA 1 + +/* Define if host toolchain supports AVX512PF */ +#define HAVE_AVX512PF 1 + +/* Define if host toolchain supports AVX512VBMI */ +#define HAVE_AVX512VBMI 1 + +/* Define if host toolchain supports AVX512VL */ +#define HAVE_AVX512VL 1 +#endif + +/* bio->bi_opf is defined */ +/* #undef HAVE_BIO_BI_OPF */ + +/* bio->bi_status exists */ +/* #undef HAVE_BIO_BI_STATUS */ + +/* bio has bi_iter */ +/* #undef HAVE_BIO_BVEC_ITER */ + +/* bio_set_dev() is available */ +/* #undef HAVE_BIO_SET_DEV */ + +/* bio_set_dev() GPL-only */ +/* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ + +/* bio_set_op_attrs is available */ +/* #undef HAVE_BIO_SET_OP_ATTRS */ + +/* blkdev_reread_part() exists */ +/* #undef HAVE_BLKDEV_REREAD_PART */ + +/* blkg_tryget() is available */ +/* #undef HAVE_BLKG_TRYGET */ + +/* blkg_tryget() GPL-only */ +/* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ + +/* blk_alloc_queue() expects request function */ +/* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ + +/* blk queue backing_dev_info is dynamic */ +/* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ + +/* blk_queue_flag_clear() exists */ +/* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ + +/* blk_queue_flag_set() exists */ +/* #undef HAVE_BLK_QUEUE_FLAG_SET */ + +/* blk_queue_flush() is available */ +/* #undef HAVE_BLK_QUEUE_FLUSH */ + +/* blk_queue_flush() is GPL-only */ +/* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ + +/* blk_queue_secdiscard() is available */ +/* #undef HAVE_BLK_QUEUE_SECDISCARD */ + +/* blk_queue_secure_erase() is available */ +/* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ + +/* blk_queue_write_cache() exists */ +/* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ + +/* blk_queue_write_cache() is GPL-only */ +/* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ + +/* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the + CoreFoundation framework. 
*/ +/* #undef HAVE_CFLOCALECOPYCURRENT */ + +/* Define to 1 if you have the Mac OS X function + CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ +/* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ + +/* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in + the CoreFoundation framework. */ +/* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ + +/* clear_inode() is available */ +/* #undef HAVE_CLEAR_INODE */ + +/* dentry uses const struct dentry_operations */ +/* #undef HAVE_CONST_DENTRY_OPERATIONS */ + +/* current_time() exists */ +/* #undef HAVE_CURRENT_TIME */ + +/* Define if the GNU dcgettext() function is already present or preinstalled. + */ +/* #undef HAVE_DCGETTEXT */ + +/* DECLARE_EVENT_CLASS() is available */ +/* #undef HAVE_DECLARE_EVENT_CLASS */ + +/* sops->dirty_inode() wants flags */ +/* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* d_make_root() is available */ +/* #undef HAVE_D_MAKE_ROOT */ + +/* d_prune_aliases() is available */ +/* #undef HAVE_D_PRUNE_ALIASES */ + +/* dops->d_revalidate() operation takes nameidata */ +/* #undef HAVE_D_REVALIDATE_NAMEIDATA */ + +/* eops->encode_fh() wants child and parent inodes */ +/* #undef HAVE_ENCODE_FH_WITH_INODE */ + +/* sops->evict_inode() exists */ +/* #undef HAVE_EVICT_INODE */ + +/* fops->aio_fsync() exists */ +/* #undef HAVE_FILE_AIO_FSYNC */ + +/* file_dentry() is available */ +/* #undef HAVE_FILE_DENTRY */ + +/* file_inode() is available */ +/* #undef HAVE_FILE_INODE */ + +/* iops->follow_link() cookie */ +/* #undef HAVE_FOLLOW_LINK_COOKIE */ + +/* iops->follow_link() nameidata */ +/* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ + +/* fops->fsync() with range */ +/* #undef HAVE_FSYNC_RANGE */ + +/* fops->fsync() without dentry */ +/* #undef HAVE_FSYNC_WITHOUT_DENTRY */ + +/* generic_start_io_acct()/generic_end_io_acct() available */ +/* #undef HAVE_GENERIC_IO_ACCT_3ARG */ + +/* generic_start_io_acct()/generic_end_io_acct() 4 arg available */ +/* #undef HAVE_GENERIC_IO_ACCT_4ARG */ + +/* generic_readlink is global */ +/* #undef HAVE_GENERIC_READLINK */ + +/* generic_setxattr() exists */ +/* #undef HAVE_GENERIC_SETXATTR */ + +/* generic_write_checks() takes kiocb */ +/* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ + +/* Define if the GNU gettext() function is already present or preinstalled. */ +/* #undef HAVE_GETTEXT */ + +/* get_disk_and_module() is available */ +/* #undef HAVE_GET_DISK_AND_MODULE */ + +/* iops->get_link() cookie */ +/* #undef HAVE_GET_LINK_COOKIE */ + +/* iops->get_link() delayed */ +/* #undef HAVE_GET_LINK_DELAYED */ + +/* group_info->gid exists */ +/* #undef HAVE_GROUP_INFO_GID */ + +/* Define if you have the iconv() function and it works. */ +#define HAVE_ICONV 1 + +/* yes */ +/* #undef HAVE_INODE_LOCK_SHARED */ + +/* inode_set_flags() exists */ +/* #undef HAVE_INODE_SET_FLAGS */ + +/* inode_set_iversion() exists */ +/* #undef HAVE_INODE_SET_IVERSION */ + +/* inode->i_*time's are timespec64 */ +/* #undef HAVE_INODE_TIMESPEC64_TIMES */ + +/* timestamp_truncate() exists */ +/* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* in_compat_syscall() is available */ +/* #undef HAVE_IN_COMPAT_SYSCALL */ + +/* yes */ +/* #undef HAVE_IO_SCHEDULE_TIMEOUT */ + +/* Define to 1 if you have the `issetugid' function. 
*/ +#define HAVE_ISSETUGID 1 + +/* kernel has kernel_fpu_* functions */ +/* #undef HAVE_KERNEL_FPU */ + +/* kernel has asm/fpu/api.h */ +/* #undef HAVE_KERNEL_FPU_API_HEADER */ + +/* kernel fpu internal */ +/* #undef HAVE_KERNEL_FPU_INTERNAL */ + +/* uncached_acl_sentinel() exists */ +/* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */ + +/* kernel does stack verification */ +/* #undef HAVE_KERNEL_OBJTOOL */ + +/* kernel_read() take loff_t pointer */ +/* #undef HAVE_KERNEL_READ_PPOS */ + +/* timer_list.function gets a timer_list */ +/* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */ + +/* struct timer_list has a flags member */ +/* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */ + +/* timer_setup() is available */ +/* #undef HAVE_KERNEL_TIMER_SETUP */ + +/* kernel_write() take loff_t pointer */ +/* #undef HAVE_KERNEL_WRITE_PPOS */ + +/* kmem_cache_create_usercopy() exists */ +/* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */ + +/* kstrtoul() exists */ +/* #undef HAVE_KSTRTOUL */ + +/* ktime_get_coarse_real_ts64() exists */ +/* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */ + +/* ktime_get_raw_ts64() exists */ +/* #undef HAVE_KTIME_GET_RAW_TS64 */ + +/* kvmalloc exists */ +/* #undef HAVE_KVMALLOC */ + +/* kernel has large stacks */ +/* #undef HAVE_LARGE_STACKS */ + +/* Define if you have libaio */ +/* #undef HAVE_LIBAIO */ + +/* Define if you have libblkid */ +/* #undef HAVE_LIBBLKID */ + +/* Define if you have libssl */ +#define HAVE_LIBSSL 1 + +/* Define to 1 if you have the `tirpc' library (-ltirpc). */ +/* #undef HAVE_LIBTIRPC */ + +/* Define if you have libudev */ +/* #undef HAVE_LIBUDEV */ + +/* Define if udev_device_get_is_initialized is available */ +/* #undef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED */ + +/* Define if you have libuuid */ +/* #undef HAVE_LIBUUID */ + +/* lseek_execute() is available */ +/* #undef HAVE_LSEEK_EXECUTE */ + +/* makedev() is declared in sys/mkdev.h */ +/* #undef HAVE_MAKEDEV_IN_MKDEV */ + +/* makedev() is declared in sys/sysmacros.h */ +/* #undef HAVE_MAKEDEV_IN_SYSMACROS */ + +/* Noting that make_request_fn() returns blk_qc_t */ +/* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ + +/* Noting that make_request_fn() returns void */ +/* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* iops->create()/mkdir()/mknod() take umode_t */ +/* #undef HAVE_MKDIR_UMODE_T */ + +/* Define to 1 if you have the `mlockall' function. */ +#define HAVE_MLOCKALL 1 + +/* Define if host toolchain supports MOVBE */ +#define HAVE_MOVBE 1 + +/* new_sync_read()/new_sync_write() are available */ +/* #undef HAVE_NEW_SYNC_READ */ + +/* iops->getattr() takes a path */ +/* #undef HAVE_PATH_IOPS_GETATTR */ + +/* Define if host toolchain supports PCLMULQDQ */ +#define HAVE_PCLMULQDQ 1 + +/* posix_acl_chmod() exists */ +/* #undef HAVE_POSIX_ACL_CHMOD */ + +/* posix_acl_from_xattr() needs user_ns */ +/* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */ + +/* posix_acl_release() is available */ +/* #undef HAVE_POSIX_ACL_RELEASE */ + +/* posix_acl_release() is GPL-only */ +/* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */ + +/* posix_acl_valid() wants user namespace */ +/* #undef HAVE_POSIX_ACL_VALID_WITH_NS */ + +/* proc_ops structure exists */ +/* #undef HAVE_PROC_OPS_STRUCT */ + +/* iops->put_link() cookie */ +/* #undef HAVE_PUT_LINK_COOKIE */ + +/* iops->put_link() delayed */ +/* #undef HAVE_PUT_LINK_DELAYED */ + +/* iops->put_link() nameidata */ +/* #undef HAVE_PUT_LINK_NAMEIDATA */ + +/* If available, contains the Python version number currently in use. 
*/ +#define HAVE_PYTHON "3.7" + +/* qat is enabled and existed */ +/* #undef HAVE_QAT */ + +/* iops->rename() wants flags */ +/* #undef HAVE_RENAME_WANTS_FLAGS */ + +/* REQ_DISCARD is defined */ +/* #undef HAVE_REQ_DISCARD */ + +/* REQ_FLUSH is defined */ +/* #undef HAVE_REQ_FLUSH */ + +/* REQ_OP_DISCARD is defined */ +/* #undef HAVE_REQ_OP_DISCARD */ + +/* REQ_OP_FLUSH is defined */ +/* #undef HAVE_REQ_OP_FLUSH */ + +/* REQ_OP_SECURE_ERASE is defined */ +/* #undef HAVE_REQ_OP_SECURE_ERASE */ + +/* REQ_PREFLUSH is defined */ +/* #undef HAVE_REQ_PREFLUSH */ + +/* struct rw_semaphore has member activity */ +/* #undef HAVE_RWSEM_ACTIVITY */ + +/* struct rw_semaphore has atomic_long_t member count */ +/* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */ + +/* linux/sched/signal.h exists */ +/* #undef HAVE_SCHED_SIGNAL_HEADER */ + +/* setattr_prepare() is available */ +/* #undef HAVE_SETATTR_PREPARE */ + +/* iops->set_acl() exists */ +/* #undef HAVE_SET_ACL */ + +/* set_cached_acl() is usable */ +/* #undef HAVE_SET_CACHED_ACL_USABLE */ + +/* struct shrink_control exists */ +/* #undef HAVE_SHRINK_CONTROL_STRUCT */ + +/* new shrinker callback wants 2 args */ +/* #undef HAVE_SINGLE_SHRINKER_CALLBACK */ + +/* ->count_objects exists */ +/* #undef HAVE_SPLIT_SHRINKER_CALLBACK */ + +#if defined(__amd64__) || defined(__i386__) +/* Define if host toolchain supports SSE */ +#define HAVE_SSE 1 + +/* Define if host toolchain supports SSE2 */ +#define HAVE_SSE2 1 + +/* Define if host toolchain supports SSE3 */ +#define HAVE_SSE3 1 + +/* Define if host toolchain supports SSE4.1 */ +#define HAVE_SSE4_1 1 + +/* Define if host toolchain supports SSE4.2 */ +#define HAVE_SSE4_2 1 + +/* Define if host toolchain supports SSSE3 */ +#define HAVE_SSSE3 1 +#endif + +/* STACK_FRAME_NON_STANDARD is defined */ +/* #undef HAVE_STACK_FRAME_NON_STANDARD */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strlcat' function. */ +#define HAVE_STRLCAT 1 + +/* Define to 1 if you have the `strlcpy' function. */ +#define HAVE_STRLCPY 1 + +/* super_setup_bdi_name() exits */ +/* #undef HAVE_SUPER_SETUP_BDI_NAME */ + +/* super_block->s_user_ns exists */ +/* #undef HAVE_SUPER_USER_NS */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* i_op->tmpfile() exists */ +/* #undef HAVE_TMPFILE */ + +/* totalhigh_pages() exists */ +/* #undef HAVE_TOTALHIGH_PAGES */ + +/* kernel has totalram_pages() */ +/* #undef HAVE_TOTALRAM_PAGES_FUNC */ + +/* kernel has __kernel_fpu_* functions */ +/* #undef HAVE_UNDERSCORE_KERNEL_FPU */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_UNISTD_H 1 + +/* iops->getattr() takes a vfsmount */ +/* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ + +/* aops->direct_IO() uses iovec */ +/* #undef HAVE_VFS_DIRECT_IO_IOVEC */ + +/* aops->direct_IO() uses iov_iter without rw */ +/* #undef HAVE_VFS_DIRECT_IO_ITER */ + +/* aops->direct_IO() uses iov_iter with offset */ +/* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */ + +/* aops->direct_IO() uses iov_iter with rw and offset */ +/* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */ + +/* fops->iterate() is available */ +/* #undef HAVE_VFS_ITERATE */ + +/* fops->iterate_shared() is available */ +/* #undef HAVE_VFS_ITERATE_SHARED */ + +/* fops->readdir() is available */ +/* #undef HAVE_VFS_READDIR */ + +/* fops->read/write_iter() are available */ +/* #undef HAVE_VFS_RW_ITERATE */ + +/* __vmalloc page flags exists */ +/* #undef HAVE_VMALLOC_PAGE_KERNEL */ + +/* yes */ +/* #undef HAVE_WAIT_ON_BIT_ACTION */ + +/* wait_queue_entry_t exists */ +/* #undef HAVE_WAIT_QUEUE_ENTRY_T */ + +/* wq_head->head and wq_entry->entry exist */ +/* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */ + +/* xattr_handler->get() wants dentry */ +/* #undef HAVE_XATTR_GET_DENTRY */ + +/* xattr_handler->get() wants both dentry and inode */ +/* #undef HAVE_XATTR_GET_DENTRY_INODE */ + +/* xattr_handler->get() wants xattr_handler */ +/* #undef HAVE_XATTR_GET_HANDLER */ + +/* xattr_handler has name */ +/* #undef HAVE_XATTR_HANDLER_NAME */ + +/* xattr_handler->list() wants dentry */ +/* #undef HAVE_XATTR_LIST_DENTRY */ + +/* xattr_handler->list() wants xattr_handler */ +/* #undef HAVE_XATTR_LIST_HANDLER */ + +/* xattr_handler->list() wants simple */ +/* #undef HAVE_XATTR_LIST_SIMPLE */ + +/* xattr_handler->set() wants dentry */ +/* #undef HAVE_XATTR_SET_DENTRY */ + +/* xattr_handler->set() wants both dentry and inode */ +/* #undef HAVE_XATTR_SET_DENTRY_INODE */ + +/* xattr_handler->set() wants xattr_handler */ +/* #undef HAVE_XATTR_SET_HANDLER */ + +/* Define if you have zlib */ +#define HAVE_ZLIB 1 + +/* __posix_acl_chmod() exists */ +/* #undef HAVE___POSIX_ACL_CHMOD */ + +/* Define as const if the declaration of iconv() needs const. */ +#define ICONV_CONST + +/* kernel exports FPU functions */ +/* #undef KERNEL_EXPORTS_X86_FPU */ + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* make_request_fn() return type */ +/* #undef MAKE_REQUEST_FN_RET */ + +/* hardened module_param_call */ +/* #undef MODULE_PARAM_CALL_CONST */ + +/* struct shrink_control has nid */ +/* #undef SHRINK_CONTROL_HAS_NID */ + +/* Defined for legacy compatibility. */ +#define SPL_META_ALIAS ZFS_META_ALIAS + +/* Defined for legacy compatibility. */ +#define SPL_META_RELEASE ZFS_META_RELEASE + +/* Defined for legacy compatibility. 
*/ +#define SPL_META_VERSION ZFS_META_VERSION + +/* True if ZFS is to be compiled for a FreeBSD system */ +#define SYSTEM_FREEBSD 1 + +/* True if ZFS is to be compiled for a Linux system */ +/* #undef SYSTEM_LINUX */ + +/* zfs debugging enabled */ +/* #define ZFS_DEBUG 1 */ + +/* /dev/zfs minor */ +/* #undef ZFS_DEVICE_MINOR */ + +/* enum node_stat_item contains NR_FILE_PAGES */ +/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */ + +/* enum node_stat_item contains NR_INACTIVE_ANON */ +/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */ + +/* enum node_stat_item contains NR_INACTIVE_FILE */ +/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */ + +/* enum node_stat_item contains NR_SLAB_RECLAIMABLE */ +/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE */ + +/* enum zone_stat_item contains NR_FILE_PAGES */ +/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */ + +/* enum zone_stat_item contains NR_INACTIVE_ANON */ +/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */ + +/* enum zone_stat_item contains NR_INACTIVE_FILE */ +/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */ + +/* enum zone_stat_item contains NR_SLAB_RECLAIMABLE */ +/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_SLAB_RECLAIMABLE */ + +/* global_node_page_state() exists */ +/* #undef ZFS_GLOBAL_NODE_PAGE_STATE */ + +/* global_zone_page_state() exists */ +/* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */ + +/* Define to 1 if GPL-only symbols can be used */ +/* #undef ZFS_IS_GPL_COMPATIBLE */ + +/* Define the project alias string. */ +#define ZFS_META_ALIAS "zfs-0.8.0-1" + +/* Define the project author. */ +#define ZFS_META_AUTHOR "OpenZFS on Linux" + +/* Define the project release date. */ +/* #undef ZFS_META_DATA */ + +/* Define the maximum compatible kernel version. */ +#define ZFS_META_KVER_MAX "5.6" + +/* Define the minimum compatible kernel version. */ +#define ZFS_META_KVER_MIN "3.10" + +/* Define the project license. */ +#define ZFS_META_LICENSE "CDDL" + +/* Define the libtool library 'age' version information. */ +/* #undef ZFS_META_LT_AGE */ + +/* Define the libtool library 'current' version information. */ +/* #undef ZFS_META_LT_CURRENT */ + +/* Define the libtool library 'revision' version information. */ +/* #undef ZFS_META_LT_REVISION */ + +/* Define the project name. */ +#define ZFS_META_NAME "zfs" + +/* Define the project release. */ +#define ZFS_META_RELEASE "1" + +/* Define the project version. */ +#define ZFS_META_VERSION "0.8.0" + Index: sys/modules/zfs/zfs_gitrev.h =================================================================== --- /dev/null +++ sys/modules/zfs/zfs_gitrev.h @@ -0,0 +1 @@ +#define ZFS_META_GITREV "zfs-0.7.0-2997-g7ae36732f" Index: sys/vm/vm.h =================================================================== --- sys/vm/vm.h +++ sys/vm/vm.h @@ -112,7 +112,9 @@ * Define it here for "applications" that include vm headers (e.g., * genassym). */ +#ifndef HAVE_BOOLEAN typedef int boolean_t; +#endif /* * The exact set of memory attributes is machine dependent. 
However,
Index: tests/sys/cddl/zfs/bin/file_write.c
===================================================================
--- tests/sys/cddl/zfs/bin/file_write.c
+++ tests/sys/cddl/zfs/bin/file_write.c
@@ -28,6 +28,7 @@
 #pragma ident "@(#)file_write.c 1.4 07/10/09 SMI"
 
 #include "file_common.h"
+#include <inttypes.h>
 #include
 
 static unsigned char bigbuffer[BIGBUFFERSIZE];
@@ -180,14 +181,15 @@
 	}
 	noffset = lseek(bigfd, offset, SEEK_SET);
 	if (noffset != offset) {
-		(void) printf("lseek %s (%lld/%lld) failed [%s]%d.Aborting!\n",
+		(void) printf("lseek %s (%"PRId64"/%"PRId64") "
+		    "failed [%s]%d. Aborting!\n",
 		    filename, offset, noffset, strerror(errno), errno);
 		exit(errno);
 	}
 
 	if (verbose) {
 		(void) printf("%s: block_size = %d, write_count = %d, "
-		    "offset = %lld, data = %s%d\n", filename, block_size,
+		    "offset = %"PRId64", data = %s%d\n", filename, block_size,
 		    write_count, offset,
 		    (fillchar == 0) ? "0->" : "",
 		    (fillchar == 0) ? DATA_RANGE : fillchar);
@@ -197,17 +199,17 @@
 		ssize_t n;
 
 		if ((n = write(bigfd, &bigbuffer, block_size)) == -1) {
-			(void) printf("write failed (%ld), good_writes = %lld, "
+			(void) printf("write failed (%ld), "
+			    "good_writes = %"PRId64", "
 			    "error: %s[%d]\n", (long)n, good_writes,
-			    strerror(errno),
-			    errno);
+			    strerror(errno), errno);
 			exit(errno);
 		}
 		good_writes++;
 	}
 
 	if (verbose) {
-		(void) printf("Success: good_writes = %lld (%lld)\n",
+		(void) printf("Success: good_writes = %"PRId64" (%"PRId64")\n",
 		    good_writes, (good_writes * block_size));
 	}
Index: usr.sbin/fstyp/Makefile
===================================================================
--- usr.sbin/fstyp/Makefile
+++ usr.sbin/fstyp/Makefile
@@ -28,23 +28,28 @@
 .if ${MK_ZFS} != "no"
 IGNORE_PRAGMA= YES
-CFLAGS+= -DNEED_SOLARIS_BOOLEAN -DHAVE_ZFS
-CFLAGS+= -I${SRCTOP}/sys/cddl/compat/opensolaris
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include
-CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/lib/libumem
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libnvpair
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/lib/libzpool/common
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
-CFLAGS+= -I${SRCTOP}/sys/cddl/contrib/opensolaris/uts/common
-CFLAGS+= -I${SRCTOP}/cddl/contrib/opensolaris/head
+WARNS?= 0
+CFLAGS.zfs.c+= -DIN_BASE
+CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/include
+CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include
+CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd
+CFLAGS.zfs.c+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include
+CFLAGS.zfs.c+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h
+CFLAGS.zfs.c+= -DHAVE_ISSETUGID
+CFLAGS.zfs.c+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h
 .endif
-CFLAGS+=-I${SRCTOP}/sys
+.for src in ${SRCS}
+.if ${src} != "zfs.c"
+CFLAGS.${src}+=-I${SRCTOP}/sys
+.endif
+.endfor
+
 LIBADD= geom md ufs
 .if ${MK_ZFS} != "no"
-LIBADD+=nvpair zfs
+LIBADD+=nvpair zfs spl
 .endif
 .include
Index: usr.sbin/fstyp/zfs.c
===================================================================
--- usr.sbin/fstyp/zfs.c
+++ usr.sbin/fstyp/zfs.c
@@ -28,9 +28,7 @@
 __FBSDID("$FreeBSD$");
 
 #include
-#include
 #include
-#include
 #include
 #include
 #include